In [1]:
import numpy as np
import pandas as pd
import re
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV,StratifiedShuffleSplit
from sklearn.metrics import   roc_auc_score,multilabel_confusion_matrix
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.feature_selection import SelectKBest,mutual_info_classif,f_classif
    
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords

# Sample-submission CSV; loaded here but not referenced in the cells visible below — TODO confirm it is used later.
submission_file = pd.read_csv( '../.data/sample_submission.csv')

#### For now we only want a quick look at a baseline model, so we use nothing besides the `comment_text` column with TF-IDF features and a Logistic Regression with no tuning

- Not using the features created in 'EDA.ipynb'

In [3]:
def import_train_data(path='../.data/train_new_features.csv'):
    """Load the training CSV and split it into a text frame and a label frame.

    Parameters
    ----------
    path : str or path-like, default '../.data/train_new_features.csv'
        Location of the training CSV. The file is read with its ``id``
        column as the index and must also provide ``comment_text`` and the
        six label columns listed below. Any other columns are ignored.

    Returns
    -------
    Xtr : pandas.DataFrame
        Single-column frame holding ``comment_text``, indexed by ``id``.
    ytr : pandas.DataFrame
        The six label columns (``toxic`` ... ``identity_hate``), indexed by ``id``.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    Xtrain = pd.read_csv(path, index_col='id')
    # Double brackets keep Xtr a DataFrame (not a Series) so callers can
    # index it by column name.
    Xtr = Xtrain[['comment_text']]
    ytr = Xtrain[label_cols]
    return Xtr, ytr

def feature_extraction(Xtr):
    """Fit a word-level TF-IDF vectorizer on ``Xtr['comment_text']``.

    Prints a progress line and the elapsed wall-clock time of the fit.

    Returns
    -------
    train_features
        Output of ``fit_transform`` on the ``comment_text`` column.
    param : dict
        The vectorizer's ``get_params()`` dict, kept so the configuration can
        be logged next to the scores. Note that the fitted vectorizer itself
        is not returned.
    """
    tfidf_options = dict(
        sublinear_tf=True,
        use_idf=True,
        strip_accents='unicode',
        stop_words='english',
        min_df=20,             # drop terms seen in fewer than 20 documents
        ngram_range=(1, 1),    # unigrams only
        max_features=10000,
    )
    word_vectorizer = TfidfVectorizer(**tfidf_options)

    start = time.time()
    print('Extracting fetures...')
    comments = Xtr['comment_text']
    train_features = word_vectorizer.fit_transform(comments)
    print(f'Time elapsed: {time.time() - start} seconds. ')

    return train_features, word_vectorizer.get_params()

def train_model(Xtr,ytr,df_results,param_extractor):
    """Cross-validate one logistic regression per label and log the scores.

    Parameters
    ----------
    Xtr
        Feature matrix, passed unchanged to ``cross_val_score``.
    ytr : pandas.DataFrame
        One column per label; each column is scored independently.
    df_results : pandas.DataFrame
        Running results table. Its columns are matched *positionally* against
        ``[score for each ytr column..., param_extractor, classifier params]``,
        so the column order must follow that layout.
    param_extractor
        Feature-extractor parameters to store next to the scores (in this
        notebook, the dict returned by the vectorizer's ``get_params()``).

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df_results`` plus one appended row, with the
        index reset. ``df_results`` itself is not modified.
    """
    clf = LogisticRegression(class_weight=None) #'balanced'
    param_classifier = clf.get_params()
    scores = []

    for col in ytr.columns:
        # Mean ROC-AUC over 3 folds, rounded to 4 decimals for reporting.
        fold_scores = cross_val_score(clf,
                                      Xtr,
                                      ytr[col],
                                      cv=3,
                                      scoring='roc_auc')
        cv_score = np.round(np.mean(fold_scores),4)
        scores.append(cv_score)
        print(f'CV score for label {col} is {cv_score}.')

    scores.append(param_extractor)
    scores.append(param_classifier)

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate instead. Wrapping the dict in a list keeps the parameter
    # dicts as single cell values rather than expanding them.
    new_row = pd.DataFrame([dict(zip(df_results.columns,scores))])
    df_results = pd.concat([df_results, new_row], ignore_index=True)

    return df_results


# Load the comments and labels, then set up an empty results table with one
# column per label followed by two columns for the logged parameter dicts.
x_train, y_train = import_train_data()
list_cols = [*y_train.columns.to_list(), 'parameters_extraction', 'parameters_classifier']
score_df = pd.DataFrame(columns=list_cols)

# Vectorize the comment text once; the resulting matrix is reused for every label.
x_train_transformed, parameters_feat_ext = feature_extraction(x_train)

Extracting fetures...
Time elapsed: 6.094881296157837 seconds. 


In [4]:
# Adds one results row and rebinds score_df, so re-running this cell appends another row.
score_df = train_model(x_train_transformed,y_train,score_df,parameters_feat_ext)

CV score for label toxic is 0.9675.
CV score for label severe_toxic is 0.9851.
CV score for label obscene is 0.9832.
CV score for label threat is 0.9825.
CV score for label insult is 0.9744.
CV score for label identity_hate is 0.9732.


In [5]:
# Display the results table: one CV score per label plus the logged parameter dicts.
score_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,parameters_extraction,parameters_classifier
0,0.9675,0.9851,0.9832,0.9825,0.9744,0.9732,"{'analyzer': 'word', 'binary': False, 'decode_...","{'C': 1.0, 'class_weight': None, 'dual': False..."


In [None]:
# Xtest = pd.read_csv('../.data/test_new_features.csv',index_col='id')
# Xts = Xtest[['comment_text']]