In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from google.colab import drive
# Colab-only: mount Google Drive so the CSV paths under /content/gdrive resolve
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Load the cleaned train/test files, keeping only the columns used for modelling
pd.set_option('display.max_colwidth', None)
keep_cols = ['avg_count_caps', 'comment_cleaned_spell_no_stopwords_lemm', 'toxic']

# Tag every row with the file it came from so both splits can share one frame
df_train = pd.read_csv('/content/gdrive/MyDrive/Team Blue NLP/train_cleaned.csv',
                       usecols = keep_cols).assign(train_test = 'train')
df_test = pd.read_csv('/content/gdrive/MyDrive/Team Blue NLP/test_cleaned.csv',
                      usecols = keep_cols).assign(train_test = 'test')

# Stack train on top of test; the original per-file row labels are kept as-is
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,avg_count_caps,comment_cleaned_spell_no_stopwords_lemm,toxic,train_test
0,0.054422,list moroccan dutch people people wikipedia page order list exception see wp listpeople wp wtaf cheer,0,train
1,0.012658,much well careful someone report ban vandalism,0,train
2,0.022857,block pair write encyclopaedia plenty place play internet use hasten trek,0,train
3,0.020833,thank thanks link reference desk crux orthodoxa good read pretty much look seem cross slightly ambiguous define reason extra beam thank,0,train
4,0.034146,know _number_ bible scholar disagree yet liberal wikipedia lazy look anything attack bible liberal presupposition anti christian bigotry,0,train


In [3]:
# Give the text column a shorter name for the rest of the notebook
short_names = {'comment_cleaned_spell_no_stopwords_lemm': 'comment_cleaned'}
df = df.rename(columns = short_names)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159571 entries, 0 to 31914
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   avg_count_caps   159571 non-null  float64
 1   comment_cleaned  159548 non-null  object 
 2   toxic            159571 non-null  int64  
 3   train_test       159571 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


In [4]:
# Discard rows with any missing value, then renumber the rows from 0
df = df.dropna()
df = df.reset_index(drop = True)
df.shape

(159548, 4)

In [5]:
# Class balance per split: train and test are both roughly 90% non-toxic / 10% toxic
(
  df.groupby('train_test')['toxic']
    .value_counts(normalize = True)
    .round(3)
)

train_test  toxic
test        0        0.904
            1        0.096
train       0        0.904
            1        0.096
Name: toxic, dtype: float64

In [6]:
def tf_idf(data, max_features = None, ngram_range = (1, 1), fit_data = None):
  """Turn a collection of text documents into a dense TF-IDF feature frame.

  Parameters
  ----------
  data : iterable of str (e.g. a pandas Series)
      Documents to vectorize; one output row is produced per document.
  max_features : int or None
      Passed to TfidfVectorizer: cap on the vocabulary size (None = no cap).
  ngram_range : tuple (min_n, max_n)
      Passed to TfidfVectorizer: range of n-gram orders to extract.
  fit_data : iterable of str or None
      Documents used to learn the vocabulary and IDF weights. With the default
      (None) the vectorizer is fit on `data` itself. Pass only the training
      documents to keep held-out text from influencing the features.

  Returns
  -------
  pandas.DataFrame
      One column per vocabulary term. Rows carry the index of `data` when it is
      a pandas Series, so the frame lines up with its source under index-aligned
      operations such as pd.concat(axis = 1); otherwise a default RangeIndex.
  """
  t_vectorizer = TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
  if fit_data is None:
    X = t_vectorizer.fit_transform(data)
  else:
    # Learn vocabulary/IDF from fit_data only, then project `data` onto it
    X = t_vectorizer.fit(fit_data).transform(data)
  terms = t_vectorizer.get_feature_names_out()

  # A list also has an `.index` attribute (a method), so test the type explicitly
  row_index = data.index if isinstance(data, pd.Series) else None

  # toarray() densifies the sparse matrix: memory grows as n_docs * n_terms
  return pd.DataFrame(X.toarray(), columns = terms, index = row_index)

In [7]:
# Vectorize using TF-IDF: one frame per n-gram order, each capped at 300 features
# NOTE(review): df still holds the train and test rows together at this point, so
# every vectorizer is fit on the test comments as well -- consider fitting on the
# train rows only.
def _tfidf_capped(n):
  """TF-IDF frame of the cleaned comments using only n-grams of order n."""
  return tf_idf(df['comment_cleaned'], max_features = 300, ngram_range = (n, n))

unigrams = _tfidf_capped(1)
print(f'Shape unigrams: {unigrams.shape}')

bigrams = _tfidf_capped(2)
print(f'Shape bigrams: {bigrams.shape}')

trigrams = _tfidf_capped(3)
print(f'Shape trigrams: {trigrams.shape}')

Shape unigrams: (159548, 300)
Shape bigrams: (159548, 300)
Shape trigrams: (159548, 300)


In [8]:
# Put each TF-IDF frame side by side with the original columns (label, split tag,
# avg_count_caps, raw text); rows are matched on their index labels.
unigrams_final, bigrams_final, trigrams_final = (
  pd.concat([tfidf_frame, df], axis = 1)
  for tfidf_frame in (unigrams, bigrams, trigrams)
)

In [None]:
# Candidate feature sets: (label shown in the results table, feature frame)
df_list = [('TF-IDF (unigrams)', unigrams_final), 
           ('TF-IDF (bigrams)', bigrams_final), 
           ('TF-IDF (trigrams)', trigrams_final)]

# Candidate classifiers: (label shown in the results table, estimator).
# The forest is seeded so its scores are identical on every run; without
# random_state each execution grows different trees and the table drifts.
model_list = [('Logistic Regression', LogisticRegression(max_iter = 200)),
              ('Random Forest', RandomForestClassifier(random_state = 42))]

# Collects one [features, model, cutoff, precision, recall, f1] row per evaluation
results_list = []

def scoring(y, y_pred):
  """Score binary predictions against the true labels.

  Parameters
  ----------
  y : array-like
      True labels.
  y_pred : array-like
      Predicted labels, same length as `y`.

  Returns
  -------
  tuple
      (precision, recall, f1), in that order.
  """
  metrics = (precision_score, recall_score, f1_score)
  return tuple(metric(y, y_pred) for metric in metrics)

# Bookkeeping columns that must not reach the models as features
drop_cols = ['comment_cleaned', 'train_test']

for df_label, data in df_list:
  # The split depends only on the feature set, so build it once per feature set
  # instead of re-filtering and copying the frame for every model.
  train = data[data['train_test'] == 'train'].drop(columns = drop_cols)
  test = data[data['train_test'] == 'test'].drop(columns = drop_cols)
  X_train, y_train = train.drop(columns = 'toxic'), train.loc[:, 'toxic']
  X_test, y = test.drop(columns = 'toxic'), test.loc[:, 'toxic']

  for model_label, model in model_list:
    print(f'Processing: {df_label}, {model_label}')

    # The same estimator object is refit for each feature set
    M = model
    M.fit(X_train, y_train)
    # Second predict_proba column is used as the score for toxic = 1
    y_proba = M.predict_proba(X_test)[:, 1]

    # Try several decision thresholds on the predicted probability
    for cutoff in [0.05, 0.10, 0.15]:
      y_pred = np.where(y_proba >= cutoff, 1, 0)
      precision, recall, f1 = scoring(y, y_pred)
      results_list.append([df_label, model_label, cutoff, precision, recall, f1])

# One row per (features, model, cutoff), indexed by that triple
results = pd.DataFrame(
            results_list,
            columns = ['features', 'model', 'cutoff', 'precision', 'recall', 'f1']
            ).set_index(['features', 'model', 'cutoff'])

Processing: TF-IDF (unigrams), Logistic Regression
Processing: TF-IDF (unigrams), Random Forest
Processing: TF-IDF (bigrams), Logistic Regression
Processing: TF-IDF (bigrams), Random Forest
Processing: TF-IDF (trigrams), Logistic Regression
Processing: TF-IDF (trigrams), Random Forest


In [None]:
# Show every (features, model, cutoff) score, rounded to three decimals
results.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1
features,model,cutoff,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TF-IDF (unigrams),Logistic Regression,0.05,0.22,0.896,0.353
TF-IDF (unigrams),Logistic Regression,0.1,0.32,0.783,0.454
TF-IDF (unigrams),Logistic Regression,0.15,0.423,0.66,0.515
TF-IDF (unigrams),Random Forest,0.05,0.216,0.854,0.345
TF-IDF (unigrams),Random Forest,0.1,0.326,0.744,0.454
TF-IDF (unigrams),Random Forest,0.15,0.412,0.666,0.509
TF-IDF (bigrams),Logistic Regression,0.05,0.125,0.93,0.22
TF-IDF (bigrams),Logistic Regression,0.1,0.131,0.731,0.222
TF-IDF (bigrams),Logistic Regression,0.15,0.412,0.244,0.307
TF-IDF (bigrams),Random Forest,0.05,0.143,0.746,0.24


In [None]:
# Max Precision: row(s) of the results table with the highest precision
results.loc[results['precision'].eq(results['precision'].max())].round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1
features,model,cutoff,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TF-IDF (unigrams),Logistic Regression,0.15,0.423,0.66,0.515


In [None]:
# Max Recall: row(s) of the results table with the highest recall
results.loc[results['recall'].eq(results['recall'].max())].round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1
features,model,cutoff,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TF-IDF (trigrams),Logistic Regression,0.05,0.105,0.988,0.19


In [None]:
# Max F1-Score: row(s) of the results table with the highest F1
results.loc[results['f1'].eq(results['f1'].max())].round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1
features,model,cutoff,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TF-IDF (unigrams),Logistic Regression,0.15,0.423,0.66,0.515


In [None]:
# Refit logistic regression on the unigram features and report its test-set ROC AUC
drop_cols = ['comment_cleaned', 'train_test']
split_tag = unigrams_final['train_test']
train = unigrams_final.loc[split_tag == 'train'].drop(columns = drop_cols)
test = unigrams_final.loc[split_tag == 'test'].drop(columns = drop_cols)

best_model = LogisticRegression(max_iter = 200)
best_model.fit(train.drop(columns = 'toxic'), train['toxic'])

# AUC is computed from the raw probabilities, so no cutoff is applied here
y = test['toxic']
y_proba = best_model.predict_proba(test.drop(columns = 'toxic'))[:, 1]
print(f'AUC = {roc_auc_score(y, y_proba):.3f}')

AUC = 0.891
