In [41]:
import pandas as pd
import re
import spacy
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from utils import read_file, preprocess, clean_twitter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [3]:
tqdm.pandas()
nlp = spacy.load('en_core_web_sm')

In [4]:
data, y = read_file('../hatespeech/', with_evaluation=True)
data = [preprocess(text) for text in data]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

In [27]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.82      0.89     40159
           1       0.61      0.88      0.72     10964
           2       0.92      0.93      0.92     13639
           3       0.60      0.95      0.74      2061

    accuracy                           0.86     66823
   macro avg       0.78      0.90      0.82     66823
weighted avg       0.89      0.86      0.87     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.89      0.75      0.81     10128
           1       0.44      0.67      0.53      2644
           2       0.88      0.86      0.87      3450
           3       0.33      0.56      0.42       484

    accuracy                           0.75     16706
   macro avg       0.64      0.71      0.66     16706
weighted avg       0.80      0.75      0.77     16706



In [23]:
# features = np.array(vectorizer.get_feature_names())
# features[np.argsort(classifier.estimators_[0].coef_[0])[::-1][:50]]

In [30]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.79      0.86     40159
           1       0.56      0.83      0.67     10964
           2       0.91      0.87      0.89     13639
           3       0.47      0.91      0.62      2061

    accuracy                           0.82     66823
   macro avg       0.72      0.85      0.76     66823
weighted avg       0.86      0.82      0.83     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.90      0.75      0.82     10128
           1       0.46      0.70      0.56      2644
           2       0.89      0.83      0.86      3450
           3       0.32      0.63      0.42       484

    accuracy                           0.76     16706
   macro avg       0.64      0.73      0.66     16706
weighted avg       0.81      0.76      0.77     16706



In [31]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.76      0.84     40159
           1       0.52      0.81      0.63     10964
           2       0.90      0.86      0.88     13639
           3       0.42      0.83      0.56      2061

    accuracy                           0.79     66823
   macro avg       0.70      0.81      0.73     66823
weighted avg       0.85      0.79      0.81     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.90      0.73      0.81     10128
           1       0.45      0.72      0.55      2644
           2       0.89      0.83      0.86      3450
           3       0.30      0.61      0.40       484

    accuracy                           0.75     16706
   macro avg       0.64      0.72      0.66     16706
weighted avg       0.81      0.75      0.77     16706



In [32]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.76      0.82     40159
           1       0.49      0.77      0.60     10964
           2       0.91      0.81      0.86     13639
           3       0.37      0.70      0.48      2061

    accuracy                           0.77     66823
   macro avg       0.67      0.76      0.69     66823
weighted avg       0.82      0.77      0.78     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.89      0.74      0.81     10128
           1       0.45      0.72      0.55      2644
           2       0.90      0.79      0.84      3450
           3       0.30      0.62      0.40       484

    accuracy                           0.74     16706
   macro avg       0.63      0.72      0.65     16706
weighted avg       0.80      0.74      0.76     16706



In [34]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 42)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     40159
           1       0.96      0.94      0.95     10964
           2       0.99      1.00      0.99     13639
           3       0.99      0.98      0.99      2061

    accuracy                           0.98     66823
   macro avg       0.98      0.98      0.98     66823
weighted avg       0.98      0.98      0.98     66823

---------------Test metrics------------------------
              precision    recall  f1-score   support

           0       0.83      0.91      0.86     10128
           1       0.56      0.36      0.44      2644
           2       0.86      0.92      0.89      3450
           3       0.60      0.21      0.31       484

    accuracy                           0.80     16706
   macro avg       0.71      0.60      0.63     16706
weighted avg       0.78      0.80      0.79     16706



In [38]:
features = np.array(vectorizer.get_feature_names())
features[np.argsort(classifier.feature_importances_)[::-1]][:50]

array(['fuck', 'bitch', 'idiot', 'ass', 'nigga', 'shit', 'hate', 'stupid',
       'free', 'damn', 'new', 'thi', 'bad', 'love', 'video', 'ugli', 'wa',
       'pussi', 'fuckin', 'check', 'click', 'like', 'dick', 'just', 'don',
       'download', 'wanna', 'good', 'sick', 'enter', 'thank', 'time',
       'black', 'hell', 'retard', 'nasti', 'girl', 'say', 'know', 'ha',
       'think', 'hi', 'mad', 'join', 'sex', 'great', 'vintag', 'bullshit',
       'peopl', 'want'], dtype='<U21')

In [42]:
data, y = read_file('../hatespeech/', with_evaluation=True)
data = [clean_twitter(text) for text in data]
df = pd.DataFrame(columns=['text', 'label'])
df['text'] = data
df['label'] = y

In [44]:
df.to_pickle('~/df.pkl')

In [None]:
df_abusive = df[df['label'] == 'abusive']

In [None]:
df_abusive['cleaned'] = df_abusive['text'].apply(lambda x : clean(x))
df_abusive['lemmatized'] = df_abusive['cleaned'].apply(lambda x: lemmatize(x))

In [None]:
bow_vectorizer = CountVectorizer(input='content',
                                 analyzer='word',
                                 strip_accents='ascii',
                                 ngram_range=(2,5),
                                 stop_words='english',
                                 min_df=10)
X = bow_vectorizer.fit_transform(df_abusive['lemmatized'])

In [None]:
feature_names = np.array(bow_vectorizer.get_feature_names())
feature_names[np.argsort(np.array(X.sum(axis=0))[0])[::-1]][:50].tolist()

In [None]:
tfidf_vectorizer = TfidfVectorizer(input='content',
                                   analyzer='word',
                                   strip_accents='ascii',
                                   ngram_range=(1,3),
                                   min_df=5,
                                   stop_words='english',
                                   use_idf = True)

In [None]:
trainX = tfidf_vectorizer.fit_transform(train_df['lemmatized'])
testX = tfidf_vectorizer.transform(test_df['lemmatized'])

In [None]:
features = tfidf_vectorizer.get_feature_names()
len(features)

In [None]:
rf_classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 42) #entropy
rf_classifier.fit(trainX, trainY)

In [None]:
predY = rf_classifier.predict(trainX)
print(classification_report(trainY, predY))

In [None]:
predY = rf_classifier.predict(testX)
print(classification_report(testY, predY))

In [None]:
rf_imps = rf_classifier.feature_importances_
sort_ind = np.argsort(rf_imps)
top_n = sort_ind[::-1][:100]
sorted_imps = rf_imps[top_n]
top_feats = np.array(features)[top_n]

plt.figure(figsize=(6, 30))
plt.xlabel('Relative Importance', fontsize=20)
plt.ylabel('Feature', fontsize=20)
sns.barplot(y = top_feats, x = sorted_imps)
plt.tick_params(labelsize=20)
plt.show()

In [None]:
np.array(features)[top_n]

In [None]:
lr_classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=1, max_iter=7000), n_jobs=-1)
lr_classifier.fit(trainX, trainY)

In [None]:
predY = lr_classifier.predict(trainX)
print(classification_report(trainY, predY))

In [None]:
predY = lr_classifier.predict(testX)
print(classification_report(testY, predY))