In [None]:
!pip install contractions

In [None]:
import re
import pickle
import string
import contractions
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# 2. Data Collection

In [None]:
# Dataset 1: read the training CSV and preview its first rows.
train1 = pd.read_csv('/content/drive/MyDrive/train_1.csv')
print('----- Train Set 1 -----')
train1.head()

In [None]:
# Read test_1.csv and preview its first rows under a banner line.
test1 = pd.read_csv('/content/drive/MyDrive/test_1.csv')
print('----- Test Set 1 -----')
test1.head()

In [None]:
# Dataset 2: read the single CSV (split into train/test further down) and preview it.
df = pd.read_csv('/content/drive/MyDrive/dataset2.csv')
print('----- Dataset 2 -----')
df.head()

In [None]:
# Keep only the three columns of dataset 2 that are renamed to id / label / tweet below.
df = df[['index','oh_label','Text']]
df.head()

In [None]:
# Rename dataset 2's columns to the id / label / tweet names that the
# train/test selection and merge cells expect.
print("\nBefore modifying column names:\n", df.columns)

# Rebind the result instead of using inplace=True: df was produced by a column
# selection, and a fresh frame keeps this cell free of copy/view surprises.
df = df.rename(columns={'index': 'id', 'oh_label': 'label', 'Text': 'tweet'})

# All three columns are renamed, so report "column names" rather than "first column".
print("\nAfter modifying column names:\n", df.columns)

In [None]:
# Splitting dataset 2: 30% of its rows are held out as the test portion
# (shuffled, fixed seed so the split is repeatable).
train2, test2 = train_test_split(df, shuffle=True, random_state=10, test_size=0.3)

# The test portion drops the label column; the training portion keeps all three.
test2 = test2[['id', 'tweet']]
train2 = train2[['id', 'label', 'tweet']]

In [None]:
# Display the training portion of dataset 2 under a banner line.
print('----- Train Set 2 -----')
train2

In [None]:
# Print the (label-free) test portion of dataset 2 under a banner line.
print('----- Test Set 2 -----')
print(test2)

In [None]:
# Merging two Train Data Sets
# ignore_index=True discards both original indexes and numbers the stacked rows afresh.
train = pd.concat([train1, train2], ignore_index = True)
print(train)

In [None]:
# Number of rows per label value in the merged training frame (before upsampling).
train['label'].value_counts()

In [None]:
# Merging two Test Data Sets
# ignore_index=True discards both original indexes and numbers the stacked rows afresh.
test = pd.concat([test1,test2], ignore_index = True)
print(test)

# 3.Data Preprocessing

## 3.1 Cleaning the Data

In [None]:
# Regex substitutions applied, in order, BEFORE contraction expansion.
# They all rely on punctuation still being present, so they must run before
# the punctuation-stripping step below.
_PRE_CONTRACTION_STEPS = (
    (re.compile(r"<[^>]+>"), ""),        # HTML tags (needs '<' and '>' intact)
    (re.compile(r"\w+:\/\/\S+"), ""),    # full URLs (scheme://...)
    (re.compile(r"http\S+"), ""),        # anything else starting with 'http'
    (re.compile(r"\[.*?\]"), ""),        # [bracketed] content
    (re.compile(r"\(.*?\)"), ""),        # (parenthesised) content
    (re.compile(r"#"), " "),             # hashtag marker; the word itself is kept
    (re.compile(r"@[^\s]+"), ""),        # @mentions
    (re.compile(r"^rt\b"), ""),          # leading retweet marker, as a whole word only
    (re.compile(r"\\n"), " "),           # literal backslash-n sequences
)

# Regex substitutions applied, in order, AFTER contraction expansion.
_POST_CONTRACTION_STEPS = (
    (re.compile("[%s]" % re.escape(string.punctuation)), " "),  # ASCII punctuation
    (re.compile(r"\w*\d\w*"), ""),       # any token containing a digit
    (re.compile(r"\n"), " "),            # real newline characters
    # Everything that is not an ASCII letter or whitespace. This also removes
    # curly quotes, ellipses, emojis and other symbols, so no separate
    # quote/emoji passes are needed.
    (re.compile(r"[^a-zA-Z\s]"), ""),
)


def _clean_string(text):
    """Return the cleaned form of a single value; missing values become ''."""
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    text = text.lower()
    for pattern, replacement in _PRE_CONTRACTION_STEPS:
        text = pattern.sub(replacement, text)

    # Expand contractions while apostrophes are still present.
    text = contractions.fix(text)

    for pattern, replacement in _POST_CONTRACTION_STEPS:
        text = pattern.sub(replacement, text)
    return text.strip()


def clean_text(df, text_field=None):
    """Normalise tweet text for bag-of-words modelling.

    Steps: lowercase; drop HTML tags, URLs, bracketed/parenthesised content,
    hashtag markers, mentions and a leading 'rt'; expand contractions; replace
    punctuation with spaces; drop tokens containing digits; drop every
    character that is not an ASCII letter or whitespace; strip the ends.

    Parameters
    ----------
    df : pandas.DataFrame or str
        A frame whose ``text_field`` column is cleaned IN PLACE (the same
        frame object is returned), or a single string to clean.
    text_field : str, optional
        Column holding the text. Required when ``df`` is a DataFrame and
        ignored when ``df`` is a string.

    Returns
    -------
    pandas.DataFrame or str
        The same frame with the cleaned column, or the cleaned string.

    Raises
    ------
    ValueError
        If ``df`` is not a string and ``text_field`` is not given.
    """
    if isinstance(df, str):
        return _clean_string(df)
    if text_field is None:
        raise ValueError("text_field is required when cleaning a DataFrame")

    # Missing / non-string cells are handled inside _clean_string, so a NaN
    # tweet no longer makes re.sub raise.
    df[text_field] = df[text_field].map(_clean_string)
    return df

## 3.2 Upsampling

In [None]:
# Clean the tweet column of both merged frames; clean_text is handed each frame
# and its return value is kept under the *_clean names.
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

# Balance the classes: draw label-1 rows with replacement until there are as
# many of them as label-0 rows.
train_minority = train_clean[train_clean.label == 1]
train_majority = train_clean[train_clean.label == 0]
train_minority_upsampled = resample(
    train_minority,
    n_samples=len(train_majority),
    replace=True,
    random_state=123,
)

# Balanced training frame: upsampled minority rows first, majority rows after.
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

train_upsampled

# 4.Data Visualisation

## 4.1 WordCloud Before Upsampling

In [None]:
# Word clouds of the cleaned training tweets, one panel per class
# (use a Jupyter notebook to see the rendered figure).
text_pos = " ".join(train_clean.loc[train.label == 0, 'tweet'])
text_neg = " ".join(train_clean.loc[train.label == 1, 'tweet'])
train_cloud_pos = WordCloud(background_color='white', collocations=False).generate(text_pos)
train_cloud_neg = WordCloud(background_color='black', collocations=False).generate(text_neg)

# Left panel: label 0, right panel: label 1.
fig, axs = plt.subplots(1, 2, figsize=(16, 8))
panels = zip(axs, (train_cloud_pos, train_cloud_neg), ('Non-Hate Comments', 'Hate Comments'))
for axis, cloud, title in panels:
    axis.imshow(cloud, interpolation='bilinear')
    axis.axis('off')
    axis.set_title(title)

plt.show()

## 4.2 Histogram (Class distribution BEFORE and AFTER Upsampling)

In [None]:
'''histogram to show class distribution before and after upsampling'''
# Both histograms are drawn into the same figure so the label counts before
# (black) and after (orange) upsampling can be compared directly.
plt.figure(figsize=(16,8))
# The style is set before the first histplot call; no axes have been created
# explicitly before that point.
sns.set_style('darkgrid')
sns.histplot(data = train['label'], color='black', legend=True)
sns.histplot(data = train_upsampled['label'], color = 'orange', legend=True)
# Legend entries are given in drawing order: original data first, resampled second.
plt.legend(['Initial_Data', 'Resampled_Data'])
plt.show()

## 4.3 WordCloud after Upsampling

In [None]:
print('--------------After Upsampling the Minority Class---------------')

# Select each class with the upsampled frame's OWN label column. Masking
# train_upsampled with train.label only worked through index alignment against
# a different, shorter frame and breaks as soon as the indexes diverge.
text_pos = " ".join(train_upsampled.loc[train_upsampled.label == 0, 'tweet'])
text_neg = " ".join(train_upsampled.loc[train_upsampled.label == 1, 'tweet'])
train_cloud_pos = WordCloud(collocations = False, background_color = 'white').generate(text_pos)
train_cloud_neg = WordCloud(collocations = False, background_color = 'black').generate(text_neg)

# Left panel: label 0, right panel: label 1.
fig, axs = plt.subplots(1,2 , figsize=(16,8))
axs[0].imshow(train_cloud_pos, interpolation='bilinear')
axs[0].axis('off')
axs[0].set_title('Non-Hate Comments')
axs[1].imshow(train_cloud_neg, interpolation='bilinear')
axs[1].axis('off')
axs[1].set_title('Hate Comments')

plt.show()

# 5.Feature Representation

## 5.1 CountVectorizer (CV)

In [None]:
# Bag-of-words features: the vectorizer keeps at most 2000 vocabulary terms,
# and .toarray() converts the sparse count matrix into a dense NumPy array.
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(train_upsampled['tweet']).toarray()

In [None]:
# Print the dense count matrix built from the upsampled tweets.
print(X)

In [None]:
# Extracting the 'label' column as the target variable.
# to_numpy() already yields the 1-D array the estimators expect; the former
# reshape -> DataFrame -> column-0 round-trip produced exactly the same values.
y = train_upsampled['label'].to_numpy()

# 6.Model Selection

In [None]:
# Multinomial Naive Bayes with the library's default settings.
nb = MultinomialNB()

In [None]:
# Support vector classifier with a linear kernel (probability estimates are not enabled here).
svm = SVC(kernel='linear')

In [None]:
# Logistic regression with the library's default settings.
lr = LogisticRegression()

In [None]:
# Random forest of 100 trees; random_state fixes the seed so fits are repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# LightGBM classifier with the library's default settings.
lgbm = lgb.LGBMClassifier()

# 7.Model Training

## 7.1 Splitting the Dataset

In [None]:
# Hold out 30% of the vectorised rows for evaluation (fixed seed for repeatability).
# NOTE(review): X and y come from train_upsampled, whose minority rows were
# duplicated by resample(replace=True) before this split, so copies of the same
# tweet can land in both X_train and X_test; the "test" metrics are then likely
# optimistic. Consider splitting before upsampling and vectorising.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)

## 7.2 Training the Model

In [None]:
# Naive Bayes: fit on the training split.
nb.fit(X_train, y_train)

In [None]:
# SVM: fit on the training split.
svm.fit(X_train, y_train)

In [None]:
# Logistic Regression: fit on the training split.
lr.fit(X_train, y_train)

In [None]:
# Random Forest: fit on the training split.
rf.fit(X_train, y_train)

In [None]:
# LightGBM: fit on the training split.
lgbm.fit(X_train, y_train)

## 7.3 Model Evaluation (Training)



* Naive Bayes Classifier

In [None]:
# Naive Bayes on the training split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
nb_train_pred = nb.predict(X_train)
nb_train_pred_proba = nb.predict_proba(X_train)[:, 1]

nb_train_accuracy = accuracy_score(y_train, nb_train_pred)
nb_train_precision = precision_score(y_train, nb_train_pred)
nb_train_recall = recall_score(y_train, nb_train_pred)
nb_train_f1 = f1_score(y_train, nb_train_pred)
nb_train_roc_auc = roc_auc_score(y_train, nb_train_pred_proba)
nb_train_cm = confusion_matrix(y_train, nb_train_pred)
nb_train_report = classification_report(y_train, nb_train_pred)

# Emit the report, one item per line.
print(
    "Naive Bayes:",
    "Training Metrics:",
    f"Accuracy: {nb_train_accuracy:.4f}",
    f"Precision: {nb_train_precision:.4f}",
    f"Recall: {nb_train_recall:.4f}",
    f"F1-score: {nb_train_f1:.4f}",
    f"ROC AUC Score: {nb_train_roc_auc:.4f}",
    "Confusion Matrix:",
    nb_train_cm,
    "Training Classification Report:",
    nb_train_report,
    sep="\n",
)

* Support Vector Machine

In [None]:
# Linear SVM on the training split.
svm_train_pred = svm.predict(X_train)
# The SVC above is created without probability=True, so predict_proba is not
# available. decision_function returns a continuous score per row, which
# roc_auc_score accepts. The score is computed on X_train here (this cell
# previously scored X_test and then read an undefined training variable).
svm_train_scores = svm.decision_function(X_train)
svm_train_accuracy = accuracy_score(y_train, svm_train_pred)
svm_train_precision = precision_score(y_train, svm_train_pred)
svm_train_recall = recall_score(y_train, svm_train_pred)
svm_train_f1 = f1_score(y_train, svm_train_pred)
svm_train_roc_auc = roc_auc_score(y_train, svm_train_scores)
svm_train_cm = confusion_matrix(y_train, svm_train_pred)
svm_train_report = classification_report(y_train, svm_train_pred)

print("SVM:")
print("Training Metrics:")
print(f"Accuracy: {svm_train_accuracy:.4f}")
print(f"Precision: {svm_train_precision:.4f}")
print(f"Recall: {svm_train_recall:.4f}")
print(f"F1-score: {svm_train_f1:.4f}")
print(f"ROC AUC Score: {svm_train_roc_auc:.4f}")
print("Confusion Matrix:")
print(svm_train_cm)
print("Training Classification Report:")
print(svm_train_report)

* Logistic Regression

In [None]:
# Logistic regression on the training split: hard predictions feed the
# threshold metrics, the second predict_proba column is the score for ROC AUC.
lr_train_pred = lr.predict(X_train)
lr_train_pred_proba = lr.predict_proba(X_train)[:, 1]

lr_train_accuracy = accuracy_score(y_train, lr_train_pred)
lr_train_precision = precision_score(y_train, lr_train_pred)
lr_train_recall = recall_score(y_train, lr_train_pred)
lr_train_f1 = f1_score(y_train, lr_train_pred)
lr_train_roc_auc = roc_auc_score(y_train, lr_train_pred_proba)
lr_train_cm = confusion_matrix(y_train, lr_train_pred)
lr_train_report = classification_report(y_train, lr_train_pred)

# Emit the report, one item per line.
print(
    "Logistic Regression:",
    "Training Metrics:",
    f"Accuracy: {lr_train_accuracy:.4f}",
    f"Precision: {lr_train_precision:.4f}",
    f"Recall: {lr_train_recall:.4f}",
    f"F1-score: {lr_train_f1:.4f}",
    f"ROC AUC Score: {lr_train_roc_auc:.4f}",
    "Confusion Matrix:",
    lr_train_cm,
    "Training Classification Report:",
    lr_train_report,
    sep="\n",
)

* Random Forest

In [None]:
# Random forest on the training split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
rf_train_pred = rf.predict(X_train)
rf_train_pred_proba = rf.predict_proba(X_train)[:, 1]

rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
rf_train_precision = precision_score(y_train, rf_train_pred)
rf_train_recall = recall_score(y_train, rf_train_pred)
rf_train_f1 = f1_score(y_train, rf_train_pred)
rf_train_roc_auc = roc_auc_score(y_train, rf_train_pred_proba)
rf_train_cm = confusion_matrix(y_train, rf_train_pred)
rf_train_report = classification_report(y_train, rf_train_pred)

# Emit the report, one item per line.
print(
    "Random Forest:",
    "Training Metrics:",
    f"Accuracy: {rf_train_accuracy:.4f}",
    f"Precision: {rf_train_precision:.4f}",
    f"Recall: {rf_train_recall:.4f}",
    f"F1-score: {rf_train_f1:.4f}",
    f"ROC AUC Score: {rf_train_roc_auc:.4f}",
    "Confusion Matrix:",
    rf_train_cm,
    "Training Classification Report:",
    rf_train_report,
    sep="\n",
)

* LightGBM

In [None]:
# LightGBM on the training split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
lgbm_train_pred = lgbm.predict(X_train)
lgbm_train_pred_proba = lgbm.predict_proba(X_train)[:, 1]

lgbm_train_accuracy = accuracy_score(y_train, lgbm_train_pred)
lgbm_train_precision = precision_score(y_train, lgbm_train_pred)
lgbm_train_recall = recall_score(y_train, lgbm_train_pred)
lgbm_train_f1 = f1_score(y_train, lgbm_train_pred)
lgbm_train_roc_auc = roc_auc_score(y_train, lgbm_train_pred_proba)
lgbm_train_cm = confusion_matrix(y_train, lgbm_train_pred)
lgbm_train_report = classification_report(y_train, lgbm_train_pred)

# Emit the report, one item per line.
print(
    "LightGBM:",
    "Training Metrics:",
    f"Accuracy: {lgbm_train_accuracy:.4f}",
    f"Precision: {lgbm_train_precision:.4f}",
    f"Recall: {lgbm_train_recall:.4f}",
    f"F1-score: {lgbm_train_f1:.4f}",
    f"ROC AUC Score: {lgbm_train_roc_auc:.4f}",
    "Confusion Matrix:",
    lgbm_train_cm,
    "Training Classification Report:",
    lgbm_train_report,
    sep="\n",
)

# 8.Model Evaluation (Testing Set)



*   Naive Bayes Classifier



In [None]:
# Naive Bayes on the held-out split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
nb_test_pred = nb.predict(X_test)
nb_test_pred_proba = nb.predict_proba(X_test)[:, 1]

nb_test_accuracy = accuracy_score(y_test, nb_test_pred)
nb_test_precision = precision_score(y_test, nb_test_pred)
nb_test_recall = recall_score(y_test, nb_test_pred)
nb_test_f1 = f1_score(y_test, nb_test_pred)
nb_test_roc_auc = roc_auc_score(y_test, nb_test_pred_proba)
nb_test_cm = confusion_matrix(y_test, nb_test_pred)
nb_test_report = classification_report(y_test, nb_test_pred)

# Emit the report, one item per line.
print(
    "Naive Bayes:",
    "\nTesting Metrics:",
    f"Accuracy: {nb_test_accuracy:.4f}",
    f"Precision: {nb_test_precision:.4f}",
    f"Recall: {nb_test_recall:.4f}",
    f"F1-score: {nb_test_f1:.4f}",
    f"ROC AUC Score: {nb_test_roc_auc:.4f}",
    "Confusion Matrix:",
    nb_test_cm,
    "Testing Classification Report:",
    nb_test_report,
    sep="\n",
)

* Support Vector Machine

In [None]:
# Linear SVM on the held-out split.
svm_test_pred = svm.predict(X_test)
# The SVC above is created without probability=True, so predict_proba is not
# available. decision_function returns a continuous score per row, which
# roc_auc_score accepts. The score is computed on X_test here (this cell
# previously scored X_train and relied on a variable from another cell).
svm_test_scores = svm.decision_function(X_test)
svm_test_accuracy = accuracy_score(y_test, svm_test_pred)
svm_test_precision = precision_score(y_test, svm_test_pred)
svm_test_recall = recall_score(y_test, svm_test_pred)
svm_test_f1 = f1_score(y_test, svm_test_pred)
svm_test_roc_auc = roc_auc_score(y_test, svm_test_scores)
svm_test_cm = confusion_matrix(y_test, svm_test_pred)
svm_test_report = classification_report(y_test, svm_test_pred)

print("SVM:")
print("\nTesting Metrics:")
print(f"Accuracy: {svm_test_accuracy:.4f}")
print(f"Precision: {svm_test_precision:.4f}")
print(f"Recall: {svm_test_recall:.4f}")
print(f"F1-score: {svm_test_f1:.4f}")
print(f"ROC AUC Score: {svm_test_roc_auc:.4f}")
print("Confusion Matrix:")
print(svm_test_cm)
print("Testing Classification Report:")
print(svm_test_report)

* Logistic Regression

In [None]:
# Logistic regression on the held-out split: hard predictions feed the
# threshold metrics, the second predict_proba column is the score for ROC AUC.
lr_test_pred = lr.predict(X_test)
lr_test_pred_proba = lr.predict_proba(X_test)[:, 1]

lr_test_accuracy = accuracy_score(y_test, lr_test_pred)
lr_test_precision = precision_score(y_test, lr_test_pred)
lr_test_recall = recall_score(y_test, lr_test_pred)
lr_test_f1 = f1_score(y_test, lr_test_pred)
lr_test_roc_auc = roc_auc_score(y_test, lr_test_pred_proba)
lr_test_cm = confusion_matrix(y_test, lr_test_pred)
lr_test_report = classification_report(y_test, lr_test_pred)

# Emit the report, one item per line.
print(
    "Logistic Regression:",
    "\nTesting Metrics:",
    f"Accuracy: {lr_test_accuracy:.4f}",
    f"Precision: {lr_test_precision:.4f}",
    f"Recall: {lr_test_recall:.4f}",
    f"F1-score: {lr_test_f1:.4f}",
    f"ROC AUC Score: {lr_test_roc_auc:.4f}",
    "Confusion Matrix:",
    lr_test_cm,
    "Testing Classification Report:",
    lr_test_report,
    sep="\n",
)

* Random Forest

In [None]:
# Random forest on the held-out split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
rf_test_pred = rf.predict(X_test)
rf_test_pred_proba = rf.predict_proba(X_test)[:, 1]

rf_test_accuracy = accuracy_score(y_test, rf_test_pred)
rf_test_precision = precision_score(y_test, rf_test_pred)
rf_test_recall = recall_score(y_test, rf_test_pred)
rf_test_f1 = f1_score(y_test, rf_test_pred)
rf_test_roc_auc = roc_auc_score(y_test, rf_test_pred_proba)
rf_test_cm = confusion_matrix(y_test, rf_test_pred)
rf_test_report = classification_report(y_test, rf_test_pred)

# Emit the report, one item per line.
print(
    "Random Forest:",
    "\nTesting Metrics:",
    f"Accuracy: {rf_test_accuracy:.4f}",
    f"Precision: {rf_test_precision:.4f}",
    f"Recall: {rf_test_recall:.4f}",
    f"F1-score: {rf_test_f1:.4f}",
    f"ROC AUC Score: {rf_test_roc_auc:.4f}",
    "Confusion Matrix:",
    rf_test_cm,
    "Testing Classification Report:",
    rf_test_report,
    sep="\n",
)

* LightGBM

In [None]:
# LightGBM on the held-out split: hard predictions feed the threshold
# metrics, the second predict_proba column is the score for ROC AUC.
lgbm_test_pred = lgbm.predict(X_test)
lgbm_test_pred_proba = lgbm.predict_proba(X_test)[:, 1]

lgbm_test_accuracy = accuracy_score(y_test, lgbm_test_pred)
lgbm_test_precision = precision_score(y_test, lgbm_test_pred)
lgbm_test_recall = recall_score(y_test, lgbm_test_pred)
lgbm_test_f1 = f1_score(y_test, lgbm_test_pred)
lgbm_test_roc_auc = roc_auc_score(y_test, lgbm_test_pred_proba)
lgbm_test_cm = confusion_matrix(y_test, lgbm_test_pred)
lgbm_test_report = classification_report(y_test, lgbm_test_pred)

# Emit the report, one item per line.
print(
    "LightGBM:",
    "\nTesting Metrics:",
    f"Accuracy: {lgbm_test_accuracy:.4f}",
    f"Precision: {lgbm_test_precision:.4f}",
    f"Recall: {lgbm_test_recall:.4f}",
    f"F1-score: {lgbm_test_f1:.4f}",
    f"ROC AUC Score: {lgbm_test_roc_auc:.4f}",
    "Confusion Matrix:",
    lgbm_test_cm,
    "Testing Classification Report:",
    lgbm_test_report,
    sep="\n",
)

# 9.Deployment

In [None]:
# Fit the CountVectorizer to the training data and save it
# The vectorizer is fitted on train_upsampled['tweet'] and pickled in one tuple
# with the random forest (rf), so the two can be restored together as a pair.
cv.fit(train_upsampled['tweet'])
with open('model_and_cv.pkl', 'wb') as file:
    pickle.dump((rf, cv), file)

In [None]:
# Load the trained model and CountVectorizer object
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that you created yourself.
with open('model_and_cv.pkl', 'rb') as file:
    model, cv = pickle.load(file)

# Enter input text
input_text = "<input your text here>"

# Preprocess the input text. clean_text works on a DataFrame column, so the
# single string is wrapped in a one-row frame and read back after cleaning
# (calling clean_text(input_text) with one argument raises TypeError).
input_frame = pd.DataFrame({'tweet': [input_text]})
clean_input_text = clean_text(input_frame, 'tweet')['tweet'].iloc[0]

# Vectorize the input text using the same CountVectorizer object used during training
vectorized_input = cv.transform([clean_input_text])

# Make predictions
prediction = model.predict(vectorized_input)
print(prediction)

# Interpret the results. predict returns an array with one entry per input
# row, so compare its single element rather than the array itself.
if prediction[0] == 1:
    print("The input text contains hate speech.")
else:
    print("The input text does not contain hate speech.")
