In [45]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

import pickle

In [46]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_train.csv').fillna(' ')
test = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_test.csv').fillna(' ')

list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

In [47]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww ! he matches this background colour i am...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [48]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...
1,0000247867823ef7,= = from rfc = = the title is fine as it is imo
2,00013b17ad220c46,= = sources = = * zawe ashton on lapland —
3,00017563c3f7919a,: if you have a look back at the source the in...
4,00017695ad8997eb,i do not anonymously edit articles at all


# Clean Dataset

In [49]:
import re

cl_path = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))

# Apply TF-IDF Vectorizer

In [52]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Reduce Dimensions

# Train Models

# Applying Logistic Regression

In [93]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    max_features=30000)
char_vectorizer.fit(all_text)

train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [117]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# train_features = hstack([train_char_features, train_word_features])
# test_features = hstack([test_char_features, test_word_features])

train_features = train_word_features
test_features = test_word_features

In [120]:
losses = []
log_predictions = {'id': test['id']}
log_models = {}
for class_name in class_names:
    train_target = train[class_name]
    log_classifier = LogisticRegression(solver='sag')
    log_classifier.fit(train_features, train_target)
    
    print('Accuracy of logistic regression classifier on {} set: {:.5f}'.format(class_name,log_classifier.score(train_features, train_target)))
    
    cv_loss = np.mean(cross_val_score(log_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    log_models[class_name] = log_classifier
    log_predictions[class_name] = log_classifier.predict_proba(test_features)[:, 1]

Accuracy of logistic regression classifier on toxic set: 0.96310
CV score for class toxic is 0.9721319165264338
Accuracy of logistic regression classifier on severe_toxic set: 0.99130
CV score for class severe_toxic is 0.9850896749529038
Accuracy of logistic regression classifier on obscene set: 0.97991
CV score for class obscene is 0.9849032116241393
Accuracy of logistic regression classifier on threat set: 0.99738
CV score for class threat is 0.9865984459684927
Accuracy of logistic regression classifier on insult set: 0.97396
CV score for class insult is 0.9780563967444179
Accuracy of logistic regression classifier on identity_hate set: 0.99262
CV score for class identity_hate is 0.9755372847120424


In [121]:
len(log_models['toxic'].coef_[0])

20000

In [89]:
log_models

{'toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'severe_toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'obscene': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'threat': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
      

In [123]:
#pickle the models
# Save Model as a pickle Using joblib
import pickle
from sklearn.externals import joblib
  
# Save the model as a pickle in a file 
joblib.dump(log_models, 'Logistic_Regression_models.p')
pickle.dump(train_char_features, open("train_char_features_vectorizer.p", "wb"))
pickle.dump(test_char_features, open("test_char_features_vectorizer.p", "wb"))
pickle.dump(word_vectorizer.fit(all_text), open("log_word_vectorizer.p", "wb"))

  
# Load the model from the file 
# pickled_models = joblib.load('models.p')  

In [78]:
import os
import gensim
# Setup nltk corpora path and Google Word2Vec location
google_vec_file = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/notebooks/GoogleNews-vectors-negative300.bin.gz'

In [79]:
model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

In [81]:
model.most_similar('king' ,topn=4)

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204219460487366)]

In [80]:
model.n_similarity(['king', 'man'], ['queen', 'woman'])

0.6641711

In [22]:
submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('Logistic-Submission.csv', index=False)

In [23]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999919,0.228113,0.999591,0.062074,0.983955,0.300168
1,0000247867823ef7,0.003444,0.001623,0.001824,0.000229,0.003144,0.00178
2,00013b17ad220c46,0.015389,0.003518,0.011505,0.000575,0.004145,0.001607
3,00017563c3f7919a,0.002364,0.001378,0.001815,0.00055,0.002073,0.00041
4,00017695ad8997eb,0.011967,0.001581,0.004789,0.000618,0.003887,0.000912


In [27]:
test.loc[test['id'].isin(['0000247867823ef7'])]


Unnamed: 0,id,comment_text
1,0000247867823ef7,= = from rfc = = the title is fine as it is imo


In [8]:
def _train_model(train_x, test_features):
    predictions = {'id': test['id']}
    for class_name in class_names:
        train_target = train[class_name]
        classifier = LogisticRegression(solver='sag')
        classifier.fit(train_X, train_y)
        predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    return predictions

def train_folds(X, y, fold_count, test_features):
    fold_size = len(X) // fold_count
    all_predections = []
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]
    
        print("In fold #", fold_id)
        all_predections.append(_train_model(train_x, train_y))
    return all_predections

In [9]:
train_features.shape[0]

159571

In [44]:
test_features

<153164x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 131127897 stored elements in COOrdinate format>

In [43]:
# train_folds(train_features, test_features, train_features.shape[0])

# ExtraTreeClassifier

In [101]:
from sklearn.ensemble import ExtraTreesClassifier

losses = []
etc_predictions = {'id': test['id']}
etc_models = {}
for class_name in class_names:
    train_target = train[class_name]
    etc_classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_loss = np.mean(cross_val_score(etc_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    etc_classifier.fit(train_features, train_target)
    etc_models[class_name] = classifier
    etc_predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

KeyboardInterrupt: 

In [91]:
etc_models

{'toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'severe_toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'obscene': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='sag',
           tol=0.0001, verbose=0, warm_start=False),
 'threat': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
      

In [38]:
#pickle the models
# Save Model as a pickle Using joblib
# Save the model as a pickle in a file 
joblib.dump(etc_models, 'etc_models.p') 
  
# Load the model from the file 
pickled_models = joblib.load('etc_models.p')  
  


In [45]:
pickled_models['toxic'].fit(train_features, train_target)
predictions['toxic'] = pickled_models['toxic'].predict_proba(test_features)[:, 1]

In [46]:
toxic = pickled_models['toxic']

In [48]:
cv_loss = np.mean(cross_val_score(toxic, train_features, train_target, cv=3, scoring='roc_auc'))
print('CV score for toxic class is {}'.format(cv_loss))

CV score for toxic class is 0.9134684084154682


In [56]:
predictions.keys()

dict_keys(['id', 'toxic'])