In [None]:
import pandas as pd
import numpy as np
import datetime
import string
from collections import Counter
from scipy.sparse import hstack, csr_matrix
import spacy
import pickle 

In [None]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk import ngrams
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import SparsePCA, TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [None]:
# Training comments, test comments and the submission template, each read
# from a zipped CSV in the working directory.
df_train_initial, df_test_initial, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip')
)

# Names of the int64 columns of the training frame; these are the columns
# later passed to make_model as the labels to fit.
initialcols = [
    col_name
    for col_name, col_dtype in df_train_initial.dtypes.items()
    if col_dtype == 'int64'
]

# Bad-word list read from badwords_short.txt (no header row; presumably one
# entry per line -- verify against the file), lower-cased and de-duplicated.
badwords_short = pd.read_csv('badwords_short.txt',header=None)
badwords_short = badwords_short.rename(columns={0:'badwords_short'})
badwords_short['badwords_short'] = badwords_short['badwords_short'].str.lower()
badwords_short = badwords_short.drop_duplicates().reset_index(drop=True)

# Strip literal asterisks from every entry and keep the result as a set.
# regex=False is explicit because the default of `regex` differs between
# pandas versions and '*' on its own is not a valid regular expression.
badwords_short_set = set(
    badwords_short['badwords_short'].str.replace('*','',regex=False)
)


In [None]:
# Load the spaCy 'en_core_web_lg' pipeline once for the whole notebook.
# get_tag runs this pipeline directly; get_lemma, get_shape and get_oov only
# reuse its vocabulary (nlp.vocab) to build a bare tokenizer.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Shared NLTK tokenizer that keeps only runs of word characters (\w+);
# every get_* tokenizer except get_punct uses it to drop punctuation first.
regexp_tokenizer = RegexpTokenizer(r'\w+')

def get_lemma(message):
    """Return the ``lemma_`` string of every word token in ``message``.

    The message is reduced to its word-character runs, re-joined with single
    spaces and split by a bare spaCy ``Tokenizer`` built on ``nlp.vocab``.

    NOTE(review): the full ``nlp`` pipeline is not run here, so what
    ``lemma_`` contains depends on the installed spaCy version -- confirm.
    """
    words_only = ' '.join(regexp_tokenizer.tokenize(message))
    bare_tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
    return [token.lemma_ for token in bare_tokenizer(words_only)]

def get_shape(message):
    """Return the spaCy ``shape_`` string of every word token in ``message``.

    Punctuation is dropped first (only word-character runs are kept); the
    remaining words are split by a bare spaCy ``Tokenizer`` on ``nlp.vocab``.
    """
    words_only = ' '.join(regexp_tokenizer.tokenize(message))
    bare_tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)

    shapes = []
    for token in bare_tokenizer(words_only):
        shapes.append(token.shape_)
    return shapes

def get_tag(message):
    """Return the ``tag_`` of every word token in ``message`` as strings.

    Unlike the other tokenizers this one runs the ``nlp`` pipeline itself,
    with the 'textcat', 'parser' and 'ner' components disabled.
    """
    words_only = ' '.join(regexp_tokenizer.tokenize(message))
    doc = nlp(words_only, disable=['textcat', 'parser', 'ner'])

    tags = []
    for token in doc:
        tags.append(str(token.tag_))
    return tags

def get_oov(message):
    """Return the ``is_oov`` flag of every word token in ``message``.

    The result is a list with one ``is_oov`` value per token, in token
    order; tokens come from a bare spaCy ``Tokenizer`` on ``nlp.vocab``
    applied to the punctuation-free text.
    """
    words_only = ' '.join(regexp_tokenizer.tokenize(message))
    bare_tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
    return [token.is_oov for token in bare_tokenizer(words_only)]

def get_ngram(message):
    """Return the word trigrams of ``message``.

    Only word-character runs are kept before the trigrams are built. The
    value returned is whatever ``nltk.ngrams`` yields (an iterable of
    3-tuples of words), not a materialised list.
    """
    words_only = ' '.join(regexp_tokenizer.tokenize(message))
    word_sequence = words_only.split()
    return ngrams(word_sequence, 3)

def get_punct(message):
    """Return the whitespace-separated chunks of ``message`` that consist
    solely of ``string.punctuation`` characters, in their original order.
    """
    punctuation_chars = string.punctuation
    punct_chunks = []
    for chunk in message.split():
        if all(char in punctuation_chars for char in chunk):
            punct_chunks.append(chunk)
    return punct_chunks

# Module-level result stores shared by make_model and the cells below.

# label column -> fitted LogisticRegression (set by make_model)
model= {}
# label column -> training labels / held-out labels (held-out only when test=True)
y_train= {}
y_test = {}
# label column -> positive-class probabilities on the held-out split (test=True)
preds={}
# label column -> positive-class probabilities on the test set (test=False)
preds_sub={}
# feature name ('lemma', 'punct', ...) -> TfidfVectorizer
proc={}
# feature name -> TF-IDF matrix for the training / test comments
vec={}
vec_test={}
# 'train' / 'test' -> horizontally stacked feature matrices
combined={}

In [None]:
def make_model(flags,test=True):
    """Vectorize the comments and fit one logistic regression per label.

    Parameters
    ----------
    flags : iterable of str
        Label column names of ``df_train_initial``; one binary model is
        fitted per entry.
    test : bool, default True
        If true, 33% of ``df_train_initial`` is held out
        (``random_state=42``) and the ROC AUC on that hold-out is printed for
        every label. Otherwise the models are fitted on all of
        ``df_train_initial``, applied to ``df_test_initial`` and the
        positive-class probabilities are written into ``df_sub``.

    Returns
    -------
    None
        Results are stored in the module-level dicts ``proc``, ``vec``,
        ``vec_test``, ``combined``, ``model``, ``y_train`` and either
        ``y_test``/``preds`` (test mode) or ``preds_sub`` (submission mode).
    """
    # Materialise once: `flags` is walked twice below, which would silently
    # skip the model loop if a one-shot iterator were passed in.
    label_cols = list(flags)

    if test:
        # Split row positions once and reuse them for the text and for every
        # label. This selects the same rows as calling train_test_split per
        # label with the same random_state, and also works for empty `flags`.
        train_idx, test_idx = train_test_split(np.arange(len(df_train_initial)),
                                               test_size=0.33, random_state=42)
        X_train = df_train_initial.comment_text.iloc[train_idx]
        X_test = df_train_initial.comment_text.iloc[test_idx]
        for col in label_cols:
            y_train[col] = df_train_initial[col].iloc[train_idx]
            y_test[col] = df_train_initial[col].iloc[test_idx]
    else:
        X_train = df_train_initial.comment_text.copy()
        X_test = df_test_initial.comment_text.copy()
        for col in label_cols:
            y_train[col] = df_train_initial[col].copy()

    # (feature name, tokenizer, min_df) for each TF-IDF view of the comments.
    feature_specs = (
        ('lemma', get_lemma, 5),
        ('punct', get_punct, 2),
        ('shape', get_shape, 5),
        ('tag', get_tag, 1),
        ('ngram', get_ngram, 2),
        ('oov', get_oov, 1),
    )

    for name, tokenizer, min_df in feature_specs:
        proc[name] = TfidfVectorizer(tokenizer=tokenizer,min_df=min_df,
                                     strip_accents='unicode',sublinear_tf=True)
        vec[name] = proc[name].fit_transform(X_train)
        print('Fit vec:',name)
        vec_test[name] = proc[name].transform(X_test)
        print('Fit vec_test:',name)

    # Stack train and test in the same explicit feature order so the columns
    # line up regardless of how the global dicts were filled earlier. CSR is
    # requested once here instead of converting from COO on every fit.
    feature_names = [name for name, _, _ in feature_specs]
    combined['train'] = hstack([vec[name] for name in feature_names], format='csr')
    combined['test'] = hstack([vec_test[name] for name in feature_names], format='csr')

    for col in label_cols:

        model[col] = LogisticRegression(solver='sag',C=3,max_iter=250,n_jobs=-1)
        model[col].fit(combined['train'],y_train[col].tolist())

        # Column 1 of predict_proba is the probability of the positive class.
        positive_proba = model[col].predict_proba(combined['test'])[:,1]

        if test:
            preds[col] = positive_proba
            print(col,'model predictions:\n',roc_auc_score(y_test[col],preds[col]))
        else:
            preds_sub[col] = positive_proba
            df_sub[col] = preds_sub[col]
            print(col,'done')

In [None]:
# Reload the cached TF-IDF matrices written by the pickle.dump cell of this
# notebook, so the vectorizers do not have to be refitted.
# WARNING: pickle.load can execute arbitrary code -- only load .sav files
# that this notebook itself produced.
cached_feature_names = ('lemma', 'punct', 'shape', 'tag', 'ngram', 'oov')

for feature_name in cached_feature_names:
    # `with` closes each file handle as soon as the matrix is read.
    with open(feature_name+'_vector.sav', 'rb') as vector_file:
        vec[feature_name] = pickle.load(vector_file)

for feature_name in cached_feature_names:
    with open(feature_name+'_test_vector.sav', 'rb') as vector_file:
        vec_test[feature_name] = pickle.load(vector_file)

# Stack train and test in the same explicit order so their columns line up.
combined['train'] = hstack([vec[feature_name] for feature_name in cached_feature_names])
combined['test'] = hstack([vec_test[feature_name] for feature_name in cached_feature_names])

In [None]:
import os

from googleapiclient import discovery

# Row position -> {attribute name: summary score} for the scored comments.
allresp={}

# The API key must not live in the notebook: it is read from the environment.
# A key that was previously committed here should be treated as leaked and
# revoked.
API_KEY = os.environ.get('PERSPECTIVE_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the PERSPECTIVE_API_KEY environment variable '
                       'before running this cell.')

# Generates API client object dynamically based on service name and version.
service = discovery.build('commentanalyzer', 'v1alpha1', developerKey=API_KEY)

# Score only the first ten training comments; each one is a network request.
for j, comment_text in enumerate(df_train_initial.comment_text[0:10]):

    analyze_request = {
      'comment': { 'text': comment_text },
      'requestedAttributes': {'TOXICITY': {},
                              'OBSCENE': {},
                              'ATTACK_ON_AUTHOR': {},
                              'ATTACK_ON_COMMENTER': {},
                              'SEVERE_TOXICITY': {}}
    }
    response = service.comments().analyze(body=analyze_request).execute()
    # Keep only the summary score of each returned attribute.
    allresp[j] = {
        attribute: scores['summaryScore']['value']
        for attribute, scores in response['attributeScores'].items()
    }

allresp

# To inspect a full raw response:
# import json
# print (json.dumps(response, indent=2))


In [None]:
# Per-label feature selection followed by a refit on the reduced matrix.
# In the visible code model[col] and y_train[col] are only filled by
# make_model, so make_model must have run in this kernel before this cell.
modellrSFM={}   # label -> SelectFromModel wrapping the fitted model[col]
sfm={}          # label -> LogisticRegression refitted on selected features
sfm_train={}    # label -> reduced training matrix
sfm_test={}     # label -> reduced test matrix
for col in initialcols:
    # prefit=True reuses the already-fitted model; the '6*mean' threshold
    # keeps features whose importance is at least six times the mean.
    modellrSFM[col] = SelectFromModel(model[col],prefit=True,threshold='6*mean')
    sfm_train[col] = modellrSFM[col].transform(combined['train'])
    sfm_test[col] = modellrSFM[col].transform(combined['test'])
    sfm[col] = LogisticRegression(solver='sag',C=3,max_iter=250,n_jobs=-1)
    sfm[col].fit(sfm_train[col],y_train[col].tolist())
    print(col,'fit')
    preds_sub[col] = sfm[col].predict_proba(sfm_test[col])[:,1]
    print(col,'predict')
    df_sub[col] = preds_sub[col]

# %H (24-hour clock) instead of %I: with the 12-hour %I and no AM/PM marker,
# a morning and an afternoon run on the same day get the same file name and
# the later one overwrites the earlier.
df_sub.to_csv('df_sub_'+datetime.datetime.now().strftime('%Y%m%d%H%M')+'.csv',index=False)

In [None]:
# One gradient-boosted classifier per label on the stacked TF-IDF features.
# Relies on combined[...] and y_train[...] already being populated, and
# starts from a fresh preds_sub dict.
modelxgb = {}
preds_sub = {}
for col in initialcols:
    modelxgb[col] = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=5,
        objective='binary:logistic',
    )
    modelxgb[col].fit(combined['train'], y_train[col].tolist())

    # Column 1 of predict_proba is the positive-class probability; it is
    # stored and copied into the submission frame.
    preds_sub[col] = modelxgb[col].predict_proba(combined['test'])[:, 1]
    df_sub[col] = preds_sub[col]
    print(col,'done')

In [None]:
# Submission run (test=False): fit on all of df_train_initial, predict on
# df_test_initial and write the per-label probabilities into df_sub.
make_model(initialcols,test=False)

In [None]:
# Cache every train/test TF-IDF matrix on disk as <feature>_vector.sav and
# <feature>_test_vector.sav so a later session can reload them with pickle.
for i in vec.keys():
    # `with` guarantees each file is flushed and closed right after the dump.
    with open(i+'_vector.sav', 'wb') as train_file:
        pickle.dump(vec[i], train_file)
    with open(i+'_test_vector.sav', 'wb') as test_file:
        pickle.dump(vec_test[i], test_file)

In [None]:
# Write the submission with a minute-resolution timestamp in the file name.
# %H (24-hour clock) is used instead of %I: the 12-hour %I without an AM/PM
# marker gives morning and afternoon runs the same name, so one overwrites
# the other.
df_sub.to_csv('df_sub_'+datetime.datetime.now().strftime('%Y%m%d%H%M')+'.csv',index=False)

In [None]:
# Show the submission frame as this cell's output.
df_sub