In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report
import joblib

In [3]:
# import torch
# from torch.utils.data import Dataset
# import torch.nn as nn
# import torch.nn.functional  as F

In [4]:
train = pd.read_csv("data/clean_train.csv")
test = pd.read_csv("data/clean_test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_length,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,264,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,aww match background colour seemingly stuck th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,622,make real suggestion improvement wondered sect...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,sir hero chance remember page


In [6]:
col_labels = [col for col in train.columns if col not in ["id","comment_text", "clean_text", "comment_length"]]

In [7]:
# train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [8]:
train["clean_text"].isnull().sum()

55

In [9]:
test["clean_text"].isnull().sum()

974

In [10]:
train["clean_text"].fillna("unfilled", inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["clean_text"].fillna("unfilled", inplace = True)


In [11]:
test["clean_text"].fillna("unfilled", inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["clean_text"].fillna("unfilled", inplace = True)


In [33]:
X_train = train["clean_text"]
X_test = test["clean_text"]

In [34]:
y_train = train[col_labels]

In [35]:
nb_pipe = Pipeline([
    ("tvec", TfidfVectorizer()),
    ('nb_model', OneVsRestClassifier(MultinomialNB(), n_jobs=-1))
])

In [36]:
nb_pipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('nb_model', OneVsRestClassifier(estimator=MultinomialNB(), n_jobs=-1))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'nb_model': OneVsRestClassifier(estimator=MultinomialNB(), n_jobs=-1),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'nb_model__estimator__alpha': 1.0,
 'nb_model__estimator__class_prior': None,
 'nb_model__estimator__fit_prior': True,
 'nb_model__estimator__force_alpha': True,
 '

In [37]:
nb_pipe_params = {
    'tvec__max_features': [3000], # [500, 2000, 3000]
    'tvec__min_df': [2], # [1,2]
    'tvec__max_df': [.9], # [.9, .95]
    'tvec__ngram_range': [(1,1)],
    'nb_model__estimator__alpha': [1.0] # [0.1, 1.0, 10]
}

In [38]:
gs_nb = GridSearchCV(nb_pipe, # what object are we optimizing?
                  param_grid=nb_pipe_params, # what parameters values are we searching?
                  scoring = "roc_auc", 
                  cv=5) # 5-fold cross-validation.

In [39]:
%%timeit
gs_nb.fit(X_train, y_train)

1min 37s ± 3.42 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
gs_nb.best_estimator_

In [41]:
gs_nb.best_score_

0.9601653850640478

In [42]:
gs_nb.best_params_

{'nb_model__estimator__alpha': 1.0,
 'tvec__max_df': 0.9,
 'tvec__max_features': 3000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

In [22]:
gs_nb.predict_proba(X_test)

array([[9.65241257e-01, 2.30751813e-01, 9.14224739e-01, 5.55140275e-02,
        8.59958689e-01, 2.44290356e-01],
       [2.76310882e-02, 2.57072341e-03, 1.52614551e-02, 1.11029919e-03,
        1.53047746e-02, 3.46058284e-03],
       [3.54617423e-02, 1.55105093e-03, 1.62609608e-02, 3.20100303e-04,
        1.40088552e-02, 1.52523894e-03],
       ...,
       [1.06094082e-02, 5.57448099e-04, 3.86136927e-03, 6.93532654e-04,
        2.85643571e-03, 1.37662590e-03],
       [3.98593598e-02, 7.95602215e-04, 1.20816184e-02, 1.43469563e-03,
        1.19386874e-02, 7.77392139e-03],
       [8.07886211e-02, 1.97933838e-03, 2.83092454e-02, 1.84362953e-03,
        2.69291294e-02, 2.64507390e-03]])

In [43]:
sample_submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.5,0.5,0.5,0.5,0.5,0.5
153160,fffd7a9a6eb32c16,0.5,0.5,0.5,0.5,0.5,0.5
153161,fffda9e8d6fafa9e,0.5,0.5,0.5,0.5,0.5,0.5
153162,fffe8f1340a79fc2,0.5,0.5,0.5,0.5,0.5,0.5


In [44]:
sample_submission.loc[:, col_labels] = gs_nb.predict_proba(X_test)

In [45]:
sample_submission.to_csv("data/naive_bayes_pred.csv", index = False)

In [46]:
logreg_pipe = Pipeline([
    ("tvec", TfidfVectorizer()),
    ('logreg_model', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
])

In [47]:
logreg_pipe.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('logreg_model',
   OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=-1))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'logreg_model': OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=-1),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'logreg_model__estimator__C': 1.0,
 'logreg_model__estimator__class_weight': None,
 'logreg_model__estimator__dual': False,
 'logreg_model__est

In [48]:
logreg_pipe_params = {
    'tvec__max_features': [3_000],
    'tvec__min_df': [2],
    'tvec__max_df': [.9],
    'tvec__ngram_range': [(1,1)],
    'logreg_model__estimator__C': [0.1, 1, 10], # [0.1, 1.0, 10]
    'logreg_model__estimator__class_weight': ["balanced"],
    'logreg_model__estimator__solver': ['liblinear'],
    'logreg_model__estimator__max_iter': [1000]
    
}

In [49]:
gs_logreg = GridSearchCV(logreg_pipe, # what object are we optimizing?
                  param_grid=logreg_pipe_params, # what parameters values are we searching?
                  scoring = "roc_auc", 
                  cv=5) # 5-fold cross-validation.

In [50]:
%%timeit
gs_logreg.fit(X_train, y_train)

57 s ± 275 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
gs_logreg.best_params_

{'logreg_model__estimator__C': 0.1,
 'logreg_model__estimator__class_weight': 'balanced',
 'logreg_model__estimator__max_iter': 1000,
 'logreg_model__estimator__solver': 'liblinear',
 'tvec__max_df': 0.9,
 'tvec__max_features': 3000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

In [52]:
gs_logreg.best_estimator_

In [53]:
gs_logreg.best_score_

0.9704503182395907

In [54]:
gs_logreg.predict_proba(X_test)

array([[0.99632431, 0.9705334 , 0.99823294, 0.93234942, 0.99430633,
        0.98211169],
       [0.14560822, 0.0812623 , 0.08681062, 0.07845364, 0.14187269,
        0.13309194],
       [0.0842515 , 0.02213896, 0.04939567, 0.00820227, 0.05927441,
        0.03025196],
       ...,
       [0.07188132, 0.0625952 , 0.09013437, 0.05261374, 0.06572461,
        0.09415762],
       [0.26766136, 0.07471515, 0.23150598, 0.15094251, 0.19766498,
        0.56845828],
       [0.96291309, 0.05368836, 0.95686398, 0.17193042, 0.86203955,
        0.18525994]])

In [55]:
sample_submission.loc[:,col_labels] = gs_logreg.predict_proba(X_test)

In [57]:
sample_submission.to_csv("data/logreg_pred.csv", index = False)

In [61]:
# # Save the model
joblib.dump(gs_logreg, 'data/gs_logreg.pkl')

['data/gs_logreg.pkl']

In [62]:
joblib.dump(gs_nb, 'data/gs_nb.pkl')

['data/gs_nb.pkl']