In [1]:
import pandas as pd
import numpy as np
import datetime
import string
from collections import Counter
from scipy.sparse import hstack, csr_matrix

In [2]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk import ngrams
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb



In [3]:
# --- Competition data -------------------------------------------------------
# Paths are relative to the notebook's working directory.
df_train_initial = pd.read_csv('train.csv.zip')
df_test_initial = pd.read_csv('test.csv.zip')
df_sub = pd.read_csv('sample_submission.csv.zip')

# The label columns are taken to be every int64 column of the training frame.
initialcols = list(df_train_initial.columns[df_train_initial.dtypes == 'int64'])

# --- Bad-word lexicon -------------------------------------------------------
# One entry per line, no header; entries are lower-cased and de-duplicated.
badwords_short = pd.read_csv('badwords_short.txt',header=None)
badwords_short = badwords_short.rename(columns={0:'badwords_short'})
badwords_short['badwords_short'] = badwords_short['badwords_short'].str.lower()
badwords_short = badwords_short.drop_duplicates().reset_index(drop=True)

# regex=False: '*' has to be stripped as a literal character. Relying on the
# default is pandas-version dependent, and '*' on its own is not a valid regex.
_badwords_stripped = badwords_short['badwords_short'].str.replace('*','',regex=False).dropna()
# Drop entries that became empty after stripping: an empty string "occurs"
# len(message)+1 times in str.count and would flood the bad-word features.
badwords_short_set = set(_badwords_stripped[_badwords_stripped != ''])


In [4]:
# Shared tokenizer used by get_ngrams() and get_words(): keeps runs of word
# characters (regex \w+) and therefore drops punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')

def get_ngrams(message):
    """Return the word bigrams followed by the word trigrams of ``message``.

    The message is tokenized with the module-level ``tokenizer``; each n-gram
    is a tuple of consecutive tokens. Messages with fewer than ``n`` tokens
    simply contribute no n-grams of that size.

    Parameters
    ----------
    message : str
        Raw comment text.

    Returns
    -------
    list of tuple of str
        All 2-grams (in order of appearance) followed by all 3-grams.
    """
    tokens = tokenizer.tokenize(message)
    grams = []
    for n in (2, 3):
        # Only call ngrams() when there are enough tokens. The result is the
        # same (no n-grams), but it avoids the "generator 'ngrams' raised
        # StopIteration" problem recorded in this notebook's output, which is
        # a hard RuntimeError on Python 3.7+.
        if len(tokens) >= n:
            grams.extend(ngrams(tokens, n))
    return grams

def get_words(message):
    """Tokenize ``message`` into a list of word tokens.

    Uses the module-level ``tokenizer``, so punctuation and whitespace are
    not part of the returned tokens.
    """
    return tokenizer.tokenize(message)

def get_puncts(message):
    """Return the whitespace-separated chunks of ``message`` that are pure punctuation.

    A chunk qualifies when every one of its characters is in
    ``string.punctuation``; chunks are returned in order of appearance.
    """
    punctuation_chars = set(string.punctuation)
    puncts = []
    for chunk in message.split():
        # issuperset() is true exactly when every character of the chunk
        # is a punctuation character.
        if punctuation_chars.issuperset(chunk):
            puncts.append(chunk)
    return puncts

def get_badwords(message):
    """Return one feature token per occurrence of a lexicon word in ``message``.

    Matching is a case-insensitive substring count against every entry of the
    module-level ``badwords_short_set``. Each occurrence yields the token
    ``'found_in_badwords_short_' + word``.

    Parameters
    ----------
    message : str
        Raw comment text.

    Returns
    -------
    list of str
        Feature tokens, repeated once per (non-overlapping) occurrence.
    """
    lowered = message.lower()  # hoisted: identical for every lexicon word
    only_bad = []
    for word in badwords_short_set:
        # Skip unusable lexicon entries: a non-string (e.g. NaN) would make
        # str.count raise, and '' "occurs" len(message)+1 times.
        if not isinstance(word, str) or not word:
            continue
        only_bad.extend(['found_in_badwords_short_'+word] * lowered.count(word))
    return only_bad

# Module-level registries that make_model() fills in.

# Feature pipeline: fitted vectorizers, their train/test matrices, and the
# horizontally stacked matrices (keys 'train' / 'test').
proc = dict()
vec = dict()
vec_test = dict()
combined = dict()

# Per-label targets and fitted estimators.
y_train = dict()
y_test = dict()
model = dict()

# Per-label, per-model probabilities: hold-out run / submission run.
preds = dict()
preds_sub = dict()

In [5]:
def make_model(flags,test=True):
    """Fit an LR / XGBoost / GBC ensemble per label on stacked TF-IDF features.

    Parameters
    ----------
    flags : iterable of str
        Label column names of ``df_train_initial`` to model.
    test : bool, default True
        * True  -- hold out 33% of ``df_train_initial`` (random_state=42) and
          print the ROC AUC of every model and of their plain average.
        * False -- train on all of ``df_train_initial``, predict
          ``df_test_initial`` and write the averaged probability of each
          label into ``df_sub[label]``.

    Returns
    -------
    None
        Results are stored in the module-level dicts ``proc``, ``vec``,
        ``vec_test``, ``combined``, ``y_train``, ``y_test``, ``model`` and
        ``preds`` (hold-out run) or ``preds_sub`` (submission run).
    """
    flags = list(flags)  # iterated several times below

    if test:
        # A single split serves every label: the shuffle depends only on the
        # number of rows and random_state, so this matches splitting per column.
        X_train, X_test, labels_train, labels_test = train_test_split(
            df_train_initial.comment_text, df_train_initial[flags],
            test_size=0.33, random_state=42)
        for col in flags:
            y_train[col] = labels_train[col]
            y_test[col] = labels_test[col]
    else:
        X_train = df_train_initial.comment_text.copy()
        X_test = df_test_initial.comment_text.copy()
        for col in flags:
            y_train[col] = df_train_initial[col].copy()

    # (feature name, analyzer callable, min_df); the order fixes the column
    # layout of the stacked matrices.
    feature_specs = (
        ('words', get_words, 3),
        ('puncts', get_puncts, 2),
        ('ngrams', get_ngrams, 4),
        ('badwords', get_badwords, 1),
    )
    for name, analyzer, min_df in feature_specs:
        # NOTE(review): strip_accents presumably has no effect with a callable
        # analyzer -- confirm against the installed scikit-learn.
        proc[name] = TfidfVectorizer(analyzer=analyzer,min_df=min_df,strip_accents='unicode',sublinear_tf=1)
        vec[name] = proc[name].fit_transform(X_train)
        vec_test[name] = proc[name].transform(X_test)

    # CSR once up front, so each estimator does not convert the matrix again.
    combined['train'] = hstack([vec[name] for name, _, _ in feature_specs], format='csr')
    combined['test'] = hstack([vec_test[name] for name, _, _ in feature_specs], format='csr')

    n_test_rows = combined['test'].shape[0]
    store = preds if test else preds_sub

    for col in flags:
        target = y_train[col].tolist()
        model[col]={}

        model[col]['lr'] = LogisticRegression(solver='sag',C=3,max_iter=200,n_jobs=-1)
        model[col]['lr'].fit(combined['train'],target)

        model[col]['xgb'] = xgb.XGBClassifier(n_estimators=300, max_depth=5,objective= 'binary:logistic',
                                              scale_pos_weight=1, seed=27, base_score = .2)
        # NOTE(review): newer xgboost releases may expect eval_metric on the
        # constructor rather than fit() -- confirm against the installed version.
        model[col]['xgb'].fit(combined['train'],target,eval_metric='auc')

        model[col]['gbc'] = GradientBoostingClassifier()
        model[col]['gbc'].fit(combined['train'],target)

        store[col]={}
        # Reset the accumulator for every label; the hold-out branch used to
        # apply += to it before it was ever assigned.
        allpreds=np.zeros(n_test_rows)
        for i, fitted in model[col].items():
            store[col][i] = fitted.predict_proba(combined['test'])[:,1]
            if test:
                print(col,i,'model predictions:\n',roc_auc_score(y_test[col],store[col][i]))
            allpreds+=store[col][i]
        # Plain average over however many models were fitted.
        allpreds/=len(model[col])

        if test:
            print(col,'model predictions:\n',roc_auc_score(y_test[col],allpreds))
        else:
            df_sub[col] = allpreds
            print(col,'done')

In [6]:
# Submission run: fit on the full training set for every label column and
# fill df_sub with the averaged ensemble probabilities (prints '<label> done').
make_model(initialcols,test=False)



toxic done
severe_toxic done
obscene done
threat done
insult done
identity_hate done


In [11]:
# Use the logistic-regression probabilities alone for the submission,
# replacing whatever df_sub currently holds for each label.
for label in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    df_sub[label] = preds_sub[label]['lr']

In [90]:
import pickle

# Persist each training feature matrix held in `vec` as '<name>_vector.sav'.
# The context manager makes sure every file is flushed and closed, even if
# pickling fails part-way through.
for i in vec.keys():
    with open(i+'_vector.sav', 'wb') as fh:
        pickle.dump(vec[i], fh)

In [12]:
# Timestamped submission file. %H (24-hour clock) instead of %I: the 12-hour
# form carries no AM/PM marker, so e.g. 03:15 and 15:15 produced the same name.
df_sub.to_csv('df_sub_'+datetime.datetime.now().strftime('%Y%m%d%H%M')+'.csv',index=False)

In [None]:
# C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:7: DeprecationWarning: generator 'ngrams' raised StopIteration
# toxic lr model predictions:
#  0.973623915807
# toxic xgb model predictions:
#  0.957367570947
# toxic gbc model predictions:
#  0.920677283411
# toxic model predictions:
#  0.967328623644
# severe_toxic lr model predictions:
#  0.988066880563
# severe_toxic xgb model predictions:
#  0.981223988455
# severe_toxic gbc model predictions:
#  0.946132712332
# severe_toxic model predictions:
#  0.987947888331
# obscene lr model predictions:
#  0.98715018023
# obscene xgb model predictions:
#  0.983366581819
# obscene gbc model predictions:
#  0.966495202699
# obscene model predictions:
#  0.987547215406
# threat lr model predictions:
#  0.984074679767
# threat xgb model predictions:
#  0.965280067921
# threat gbc model predictions:
#  0.542049593889
# threat model predictions:
#  0.983671224789
# insult lr model predictions:
#  0.98025063686
# insult xgb model predictions:
#  0.972816999733
# insult gbc model predictions:
#  0.953063786142
# insult model predictions:
#  0.978857091119
# identity_hate lr model predictions:
#  0.977883100898
# identity_hate xgb model predictions:
#  0.970471305196
# identity_hate gbc model predictions:
#  0.876133030069
# identity_hate model predictions:
#  0.979015052878