In [5]:
import re
import time
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from IPython.display import display
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV,StratifiedShuffleSplit
from sklearn.model_selection import learning_curve
from sklearn.metrics import   roc_auc_score,multilabel_confusion_matrix
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.feature_selection import SelectKBest,mutual_info_classif,f_classif

warnings.filterwarnings('ignore')

# Read the sample submission CSV from the shared data directory.
submission_file = pd.read_csv('../.data/sample_submission.csv')


#### For now we just want a quick look at a baseline model, so we use nothing besides `comment_text` with TF-IDF features and a Logistic Regression with no tuning

- Not using the features created in 'EDA.ipynb'

In [6]:
def import_train_data(path='../.data/train_new_features.csv'):
    """Load the training CSV and split it into text and label frames.

    Parameters
    ----------
    path : str or path-like, default '../.data/train_new_features.csv'
        Location of the training CSV. It must contain an ``id`` column (used
        as the index), a ``comment_text`` column and the six label columns
        listed below. Any other columns are ignored.

    Returns
    -------
    Xtr : pandas.DataFrame
        Single-column frame holding ``comment_text``, indexed by ``id``.
    ytr : pandas.DataFrame
        Frame with the six label columns, indexed by ``id``.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']
    Xtrain = pd.read_csv(path,index_col='id')
    # Double brackets keep Xtr a DataFrame so callers can select Xtr['comment_text'].
    Xtr = Xtrain[['comment_text']]
    ytr = Xtrain[label_cols]
    return Xtr,ytr

def feature_extraction(Xtr,n_grams=(1,1),max_features=10000,min_df=20):
    """Fit a word-level TF-IDF vectorizer on the comments and transform them.

    Parameters
    ----------
    Xtr : pandas.DataFrame
        Frame with a ``comment_text`` column holding the raw documents.
    n_grams : tuple of (int, int), default (1, 1)
        Passed to ``TfidfVectorizer`` as ``ngram_range``.
    max_features : int, default 10000
        Passed to ``TfidfVectorizer`` as ``max_features``.
    min_df : int or float, default 20
        Passed to ``TfidfVectorizer`` as ``min_df``. Previously hard-coded to
        20; exposed so the document-frequency cut-off can be tuned together
        with ``max_features``.

    Returns
    -------
    train_features
        The output of ``fit_transform`` on ``Xtr['comment_text']``.
    param : dict
        The vectorizer's ``get_params()``, kept for logging next to the scores.
    """
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        use_idf=True,
        strip_accents='unicode',
        stop_words='english',
        min_df=min_df,
        ngram_range=n_grams,
        max_features=max_features)

    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start = time.perf_counter()
    print('Extracting fetures...')
    train_features = word_vectorizer.fit_transform(Xtr['comment_text'])
    print(f'Time elapsed: {time.perf_counter() - start} seconds. ')
    param = word_vectorizer.get_params()
    return train_features,param

def train_model(Xtr,ytr,df_results,param_extractor,c_weight='balanced'):
    """Cross-validate one logistic regression per label and log the scores.

    Parameters
    ----------
    Xtr
        Feature matrix handed to ``cross_val_score``.
    ytr : pandas.DataFrame
        One column per label; each column is scored independently.
    df_results : pandas.DataFrame
        Results table to extend. Its columns are matched positionally against
        the per-label scores followed by ``param_extractor`` and the
        classifier parameters.
    param_extractor : dict
        Feature-extraction parameters stored alongside the scores.
    c_weight : dict or str, default 'balanced'
        Passed to ``LogisticRegression`` as ``class_weight``.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df_results`` plus one appended row;
        ``df_results`` itself is not modified.
    """
    clf = LogisticRegression(class_weight=c_weight)
    param_classifier = clf.get_params()
    scores = []

    for col in ytr.columns:
        # Mean ROC AUC over 3 folds, rounded to 4 decimals.
        cv_score = np.round(np.mean(cross_val_score(clf,
                                           Xtr,
                                           ytr[col],
                                           cv=3,
                                           scoring='roc_auc')),4)
        scores.append(cv_score)
        print(f'CV score for label {col} is {cv_score}.')

    scores.append(param_extractor)
    scores.append(param_classifier)

    # DataFrame.append was removed in pandas 2.0, so build a one-row frame
    # and concatenate it instead.
    new_row = pd.DataFrame([dict(zip(df_results.columns,scores))],
                           columns=df_results.columns)
    if df_results.empty:
        # Nothing to concatenate with yet; the new row is the whole table.
        return new_row
    return pd.concat([df_results, new_row], ignore_index=True)

In [8]:
x_train, y_train = import_train_data()

# Results table: one column per label plus the two parameter dumps.
list_cols = y_train.columns.to_list()
list_cols.append('parameters_extraction')
list_cols.append('parameters_classifier')
score_df = pd.DataFrame(columns=list_cols)

# Baseline TF-IDF features with the default settings. The next cell reads
# x_train_transformed and parameters_feat_ext, so they must be defined here
# for the notebook to run top to bottom on a fresh kernel.
x_train_transformed,parameters_feat_ext = feature_extraction(x_train)

## Test different class weights

In [6]:
# Score the same features under three positive-class weights: 1, 10 and 100.
weights = [{0: 1, 1: 1}, {0: 1, 1: 10}, {0: 1, 1: 100}]
for w in weights:
    score_df = train_model(
        x_train_transformed,
        y_train,
        score_df,
        parameters_feat_ext,
        c_weight=w,
    )
    

CV score for label toxic is 0.9675.
CV score for label severe_toxic is 0.9851.
CV score for label obscene is 0.9832.
CV score for label threat is 0.9825.
CV score for label insult is 0.9744.
CV score for label identity_hate is 0.9732.
CV score for label toxic is 0.9657.
CV score for label severe_toxic is 0.9828.
CV score for label obscene is 0.9813.
CV score for label threat is 0.9819.
CV score for label insult is 0.9729.
CV score for label identity_hate is 0.9702.
CV score for label toxic is 0.9624.
CV score for label severe_toxic is 0.9806.
CV score for label obscene is 0.9793.
CV score for label threat is 0.9801.
CV score for label insult is 0.9706.
CV score for label identity_hate is 0.9674.


In [7]:
# Show the results table accumulated by the class-weight loop above.
display(score_df)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,parameters_extraction,parameters_classifier
0,0.9675,0.9851,0.9832,0.9825,0.9744,0.9732,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': None, 'dual': False..."
1,0.9675,0.9851,0.9832,0.9825,0.9744,0.9732,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': {0: 1, 1: 1}, 'dual..."
2,0.9657,0.9828,0.9813,0.9819,0.9729,0.9702,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': {0: 1, 1: 10}, 'dua..."
3,0.9624,0.9806,0.9793,0.9801,0.9706,0.9674,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': {0: 1, 1: 100}, 'du..."


Increasing the weight of the positive class does not seem to improve our scores so far.

In [8]:
# Start a fresh results table: one column per label plus the two parameter dumps.
list_cols = y_train.columns.to_list() + ['parameters_extraction', 'parameters_classifier']
score_df = pd.DataFrame(columns=list_cols)

# Re-extract the TF-IDF features for each n-gram range and score them.
for n_gram_range in [(1, 2), (1, 3), (1, 4), (2, 3)]:
    x_train_transformed, parameters_feat_ext = feature_extraction(
        x_train,
        n_grams=n_gram_range,
    )
    score_df = train_model(
        x_train_transformed,
        y_train,
        score_df,
        parameters_feat_ext,
    )

Extracting fetures...
Time elapsed: 18.977115392684937 seconds. 
CV score for label toxic is 0.9652.
CV score for label severe_toxic is 0.9815.
CV score for label obscene is 0.9807.
CV score for label threat is 0.9808.
CV score for label insult is 0.9726.
CV score for label identity_hate is 0.9693.
Extracting fetures...
Time elapsed: 36.379775047302246 seconds. 
CV score for label toxic is 0.9645.
CV score for label severe_toxic is 0.9813.
CV score for label obscene is 0.9804.
CV score for label threat is 0.9793.
CV score for label insult is 0.9722.
CV score for label identity_hate is 0.9679.
Extracting fetures...
Time elapsed: 54.11032557487488 seconds. 
CV score for label toxic is 0.964.
CV score for label severe_toxic is 0.9813.
CV score for label obscene is 0.9803.
CV score for label threat is 0.9792.
CV score for label insult is 0.972.
CV score for label identity_hate is 0.9673.
Extracting fetures...
Time elapsed: 32.74915313720703 seconds. 
CV score for label toxic is 0.7668.
CV 

In [9]:
# Show the results table accumulated by the n-gram loop above.
display(score_df)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,parameters_extraction,parameters_classifier
0,0.9652,0.9815,0.9807,0.9808,0.9726,0.9693,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
1,0.9645,0.9813,0.9804,0.9793,0.9722,0.9679,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
2,0.964,0.9813,0.9803,0.9792,0.972,0.9673,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
3,0.7668,0.8384,0.777,0.7882,0.7822,0.7627,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."


No success: wider n-gram ranges do not improve the scores, and dropping unigrams with (2,3) hurts them badly.
We could increase max_features instead.

In [9]:
# Start a fresh results table: one column per label plus the two parameter dumps.
list_cols = y_train.columns.to_list() + ['parameters_extraction', 'parameters_classifier']
score_df = pd.DataFrame(columns=list_cols)

# Try larger vocabulary caps while keeping the default unigram range.
for max_feats in [1.5e4, 3e4, 5e4]:
    print(max_feats)
    x_train_transformed, parameters_feat_ext = feature_extraction(
        x_train,
        max_features=int(max_feats),
    )
    score_df = train_model(
        x_train_transformed,
        y_train,
        score_df,
        parameters_feat_ext,
    )

15000.0
Extracting fetures...
Time elapsed: 6.39243483543396 seconds. 
CV score for label toxic is 0.9689.
CV score for label severe_toxic is 0.9834.
CV score for label obscene is 0.9833.
CV score for label threat is 0.9814.
CV score for label insult is 0.9755.
CV score for label identity_hate is 0.9706.
30000.0
Extracting fetures...
Time elapsed: 7.328493118286133 seconds. 
CV score for label toxic is 0.9689.
CV score for label severe_toxic is 0.9834.
CV score for label obscene is 0.9833.
CV score for label threat is 0.9814.
CV score for label insult is 0.9755.
CV score for label identity_hate is 0.9706.
50000.0
Extracting fetures...
Time elapsed: 7.240565061569214 seconds. 
CV score for label toxic is 0.9689.
CV score for label severe_toxic is 0.9834.
CV score for label obscene is 0.9833.
CV score for label threat is 0.9814.
CV score for label insult is 0.9755.
CV score for label identity_hate is 0.9706.


In [15]:
# Show the results table accumulated by the max_features loop above.
display(score_df)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,parameters_extraction,parameters_classifier
0,0.9689,0.9834,0.9833,0.9814,0.9755,0.9706,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
1,0.9689,0.9834,0.9833,0.9814,0.9755,0.9706,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
2,0.9689,0.9834,0.9833,0.9814,0.9755,0.9706,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': 'balanced', 'dual':..."


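Now we seem to be getting somewhere, but the model shows no improvement beyond 15k terms: the scores for 15k, 30k and 50k are identical, which suggests the vocabulary left after the `min_df=20` filter is already smaller than 15k, so a higher cap changes nothing.
We could now try varying both max_features and n_gram_range together.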


In [13]:
# Start a fresh results table: one column per label plus the two parameter dumps.
list_cols = y_train.columns.to_list() + ['parameters_extraction', 'parameters_classifier']
score_df = pd.DataFrame(columns=list_cols)

# Combination singled out in the original note: 3e4 and (1,2)
for max_feats in [1.5e4, 3e4]:
    for n_gram_range in [(1, 2), (1, 3)]:
        x_train_transformed, parameters_feat_ext = feature_extraction(
            x_train,
            n_grams=n_gram_range,
            max_features=int(max_feats),
        )
        score_df = train_model(
            x_train_transformed,
            y_train,
            score_df,
            parameters_feat_ext,
        )
        

Extracting fetures...
Time elapsed: 18.222862005233765 seconds. 
CV score for label toxic is 0.9669.
CV score for label severe_toxic is 0.9828.
CV score for label obscene is 0.9818.
CV score for label threat is 0.9807.
CV score for label insult is 0.9738.
CV score for label identity_hate is 0.9707.
Extracting fetures...
Time elapsed: 35.48916292190552 seconds. 
CV score for label toxic is 0.9664.
CV score for label severe_toxic is 0.9823.
CV score for label obscene is 0.9813.
CV score for label threat is 0.9808.
CV score for label insult is 0.9732.
CV score for label identity_hate is 0.9707.
Extracting fetures...
Time elapsed: 19.025423765182495 seconds. 
CV score for label toxic is 0.9691.
CV score for label severe_toxic is 0.9839.
CV score for label obscene is 0.9835.
CV score for label threat is 0.9822.
CV score for label insult is 0.9757.
CV score for label identity_hate is 0.9713.
Extracting fetures...
Time elapsed: 36.50981831550598 seconds. 
CV score for label toxic is 0.9688.
C

In [None]:
# TODO: once the best feature-extraction hyperparameters have been chosen, continue with model diagnostics:

Model diagnostics:
    - plot learning curves (`from sklearn.model_selection import learning_curve`)

In [None]:
def plot_l_curve(estimator, X, y, cv=5, n_jobs=4, title="Learning curve",
                 ylim=None, axes=None,
                 train_sizes=(0.05, 0.1, 0.2, 0.5, 0.75)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator
        Estimator handed to ``learning_curve``.
    X, y
        Features and target handed to ``learning_curve``.
    cv : default 5
        Passed to ``learning_curve`` as ``cv``.
    n_jobs : int, default 4
        Passed to ``learning_curve`` as ``n_jobs``.
    title : str, default "Learning curve"
        Title of the plot.
    ylim : tuple of (float, float), optional
        Lower and upper y-axis limits; left to matplotlib when ``None``.
    axes : matplotlib Axes, optional
        A single Axes to draw on. A new 5x5 inch figure is created when
        ``None``.
    train_sizes : sequence of float, default (0.05, 0.1, 0.2, 0.5, 0.75)
        Passed to ``learning_curve`` as ``train_sizes``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if axes is None:
        # subplots(1, 1) returns a single Axes object, not an array of them.
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes_abs, train_scores, valid_scores = learning_curve(
        estimator,
        X,
        y,
        train_sizes=list(train_sizes),
        cv=cv,
        n_jobs=n_jobs)

    # Each row holds the per-fold scores for one training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    axes.grid()
    # Shaded bands show one standard deviation around each mean curve.
    axes.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes_abs, valid_scores_mean - valid_scores_std,
                      valid_scores_mean + valid_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes_abs, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes_abs, valid_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="best")
    plt.show()
    
# NOTE(review): the original call used `estimator`, `X`, `y` and `cv`, none of
# which are defined in any visible cell. The arguments below reuse objects
# from earlier cells (same classifier settings and fold count as train_model,
# first label column) -- confirm this is the intended diagnostic.
plot_l_curve(LogisticRegression(class_weight='balanced'),
             x_train_transformed,
             y_train['toxic'],
             cv=3,
             n_jobs=4)