In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

import re
import string
import pickle

import os
import gensim

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_train.csv').fillna(' ')
test = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_test.csv').fillna(' ')

In [3]:
#remove non-ascii characters
def remove_non_ascii(text):
    text  = "".join([char for char in text if char in string.printable])
    text = re.sub('[0-9]+', '', text)
    return text

train['comment_text'] = train['comment_text'].apply(lambda x: remove_non_ascii(x))
test['comment_text'] = test['comment_text'].apply(lambda x: remove_non_ascii(x))

In [4]:
list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

In [5]:
len(train)

159571

In [6]:
toxic = list_sentences_train.loc[train['toxic'] == 1]
toxic

6              cocksucker before you piss around on my work
12        hey what is it @ | talk what is it an exclusiv...
16        bye ! do not look come or think of comming bac...
42        you are gay or antisemmitian ? archangel white...
43                 fuck your filthy mother in the ass dry !
44        i am sorry i am sorry i screwed around with so...
51        get fucked up get fuckeeed up got a drink that...
55        stupid peace of shit stop deleting my stuff as...
56        = tony sidaway is obviously a fistfuckee he lo...
58        my band page deletion you thought i was gone d...
59        why cannot you believe how fat artie is ? did ...
65        all of my edits are good cunts like you who re...
86        would you both shut up you do not run wikipedi...
105             a pair of jew - hating weiner nazi schmucks
151       sorry puck but no one ever said dick was numbe...
159       unblock me or i will get my lawyers on to you ...
168       you should be fired you are a 

In [7]:
toxic = list_sentences_train.loc[train['toxic'] == 1]
severe_toxic = list_sentences_train.loc[train['toxic'] == 1]
obscene = list_sentences_train.loc[train['toxic'] == 1]
threat = list_sentences_train.loc[train['toxic'] == 1]
insult = list_sentences_train.loc[train['toxic'] == 1]
identity_hate = list_sentences_train.loc[train['toxic'] == 1]

# Clean Dataset

In [8]:

cl_path = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"iâ€™m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
toxic_text = []
severe_toxic_text = []
obscene_text = []
threat_text = []
insult_text = []
identity_hate_text = []

for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))
    
for text in toxic:
    toxic_text.append(clean_word(text))

for text in severe_toxic_text:
    severe_toxic_text.append(clean_word(text))

for text in obscene_text:
    obscene_text.append(clean_word(text))

for text in threat:
    threat_text.append(clean_word(text))

for text in insult:
    insult_text.append(clean_word(text))

for text in identity_hate:
    identity_hate_text.append(clean_word(text))

# Apply TF-IDF Vectorizer

In [9]:
#used for EFC
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
word_vectorizer.fit(all_text)

train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [10]:
#used for logit
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
char_vectorizer.fit(all_text)

train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# Apply CountVectorizers

In [11]:
#used for logit
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
count_vec_fit = count_vectorizer.fit(all_text)

train_count_features = count_vectorizer.transform(train_text)
test_count_features = count_vectorizer.transform(test_text)

In [12]:
train_count_features.toarray().sum(axis=0)
count_df = pd.DataFrame(count_vec_fit.get_feature_names())
count_df['counts'] = train_count_features.toarray().sum(axis=0)

# CountVectorizers (for each individual topic/feature)

In [None]:
#toxic
toxic_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
toxic_vec_fit = toxic_vectorizer.fit(all_text)

toxic_count_features = toxic_vectorizer.transform(toxic_text)

In [None]:
toxic_count_df = pd.DataFrame(toxic_vec_fit.get_feature_names())
toxic_count_df['counts'] = toxic_count_features.toarray().sum(axis=0)

In [None]:
#severe_toxic
severe_toxic_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
severe_toxic_vec_fit = severe_toxic_vectorizer.fit(all_text)

severe_toxic_count_features = severe_toxic_vectorizer.transform(severe_toxic_text)

In [None]:
severe_toxic_count_df = pd.DataFrame(severe_toxic_vec_fit.get_feature_names())
severe_toxic_count_df['counts'] = severe_toxic_count_features.toarray().sum(axis=0)

In [None]:
#obscene
obscene_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
obscene_vec_fit = obscene_vectorizer.fit(all_text)

obscene_count_features = obscene_vectorizer.transform(obscene_text)

In [None]:
obscene_count_df = pd.DataFrame(obscene_vec_fit.get_feature_names())
obscene_count_df['counts'] = obscene_count_features.toarray().sum(axis=0)

In [None]:
#threat
threat_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
threat_vec_fit = threat_vectorizer.fit(all_text)

threat_count_features = threat_vectorizer.transform(threat_text)

In [None]:
threat_count_df = pd.DataFrame(threat_vec_fit.get_feature_names())
threat_count_df['counts'] = threat_count_features.toarray().sum(axis=0)

In [None]:
#insult
insult_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
insult_vec_fit = insult_vectorizer.fit(all_text)

insult_count_features = insult_vectorizer.transform(insult_text)

In [None]:
insult_count_df = pd.DataFrame(insult_vec_fit.get_feature_names())
insult_count_df['counts'] = insult_count_features.toarray().sum(axis=0)

In [None]:
#identity_hate
identity_hate_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
identity_hate_vec_fit = identity_hate_vectorizer.fit(all_text)

identity_hate_count_features = identity_hate_vectorizer.transform(identity_hate_text)

In [None]:
identity_hate_count_df = pd.DataFrame(identity_hate_vec_fit.get_feature_names())
identity_hate_count_df['counts'] = identity_hate_count_features.toarray().sum(axis=0)

In [13]:
def create_df_of_toxic_category(category_string):
    '''category string must match train dataframe column name exactly ''' 
    filtered_sentences = list_sentences_train.loc[train[category_string] == 1]
    category_vectorizer = CountVectorizer(
        strip_accents='unicode',
        analyzer='word',
        ngram_range=(1, 1),
        stop_words='english',
        max_features=30000)
    category_vec_fit = category_vectorizer.fit(filtered_sentences)

    category_count_features = category_vectorizer.transform(filtered_sentences)
    category_count_df = pd.DataFrame(category_vec_fit.get_feature_names())
    category_count_df['counts'] = category_count_features.toarray().sum(axis=0)
    category_count_df.rename(columns = {0: 'word'})
    return category_count_df, filtered_sentences

In [14]:
class_specific_vocab_dict = dict()
class_specific_sentences = dict()
for class_name in class_names:
    class_specific_vocab_dict[class_name] = \
        create_df_of_toxic_category(class_name)[0].sort_values('counts', ascending = False)
    class_specific_sentences[class_name] =\
            create_df_of_toxic_category(class_name)[1]

In [15]:
toxic_string = ''
for line in class_specific_sentences['toxic']:
    toxic_string+=' ' + line

In [None]:
word_vectorizer.get_feature_names()

In [None]:
list(word_vectorizer.transform([toxic_string]).toarray()[0] == word_vectorizer.transform([toxic_string]).toarray().max()).index(1)

In [None]:
word_vectorizer.get_feature_names()[10704]

In [None]:
toxic_sentences_transformed = word_vectorizer.transform(class_specific_sentences['toxic']).toarray()

In [None]:
toxic_sentences_transformed

In [None]:
toxic_sentences_transformed.sum(axis=0).max()

In [None]:
class_specific_vocab_dict['identity_hate'].head()

In [None]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
toxic_df = create_df_of_toxic_category('toxic')

In [None]:
toxic_df.sort_values('counts',ascending = False)

In [None]:
toxic_df.rename(columns = {0: 'word'})

In [None]:
word_vectorizer.transform

# Reduce Dimensions

# Train Models

# Applying Logistic Regression

In [16]:
#this is kept for heroku purposes
train_features = train_word_features
test_features = test_word_features

In [17]:
losses = []
log_predictions = {'id': test['id']}
log_models = {}
for class_name in class_names:
    train_target = train[class_name]
    log_classifier = LogisticRegression(solver='sag')
    log_classifier.fit(train_features, train_target)
    
    print('Accuracy of logistic regression classifier on {} set: {:.5f}'.format(class_name,log_classifier.score(train_features, train_target)))
    
    cv_loss = np.mean(cross_val_score(log_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    log_models[class_name] = log_classifier
    log_predictions[class_name] = log_classifier.predict_proba(test_features)[:, 1]
    
    

Accuracy of logistic regression classifier on toxic set: 0.96202
CV score for class toxic is 0.9702816984672603
Accuracy of logistic regression classifier on severe_toxic set: 0.99104
CV score for class severe_toxic is 0.9857997215697615
Accuracy of logistic regression classifier on obscene set: 0.98003
CV score for class obscene is 0.9859225066889902
Accuracy of logistic regression classifier on threat set: 0.99729
CV score for class threat is 0.9823470849859306
Accuracy of logistic regression classifier on insult set: 0.97318
CV score for class insult is 0.9769461122942081
Accuracy of logistic regression classifier on identity_hate set: 0.99240
CV score for class identity_hate is 0.9749884558642069


In [19]:
import scikitplot as skplt
import matplotlib.pyplot as plt

y_true = losses# ground truth labels
y_probas = log_predictions# predicted probabilities generated by sklearn classifier
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()



IndexError: too many indices for array

In [None]:
log_predictions

In [None]:
#pickle the models
# Save Model as a pickle Using joblib
import pickle
from sklearn.externals import joblib
  
# Save the model as a pickle in a file 
joblib.dump(log_models, 'Logistic_Regression_models.p')
pickle.dump(train_char_features, open("train_char_features_vectorizer.p", "wb"))
pickle.dump(test_char_features, open("test_char_features_vectorizer.p", "wb"))
pickle.dump(word_vectorizer.fit(all_text), open("log_word_vectorizer.p", "wb"))

  
# Load the model from the file 
# pickled_models = joblib.load('models.p')

# ExtraTreeClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

losses = []
etc_predictions = {'id': test['id']}
etc_models = {}
for class_name in class_names:
    train_target = train[class_name]
    etc_classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_loss = np.mean(cross_val_score(etc_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    etc_classifier.fit(train_features, train_target)
    etc_models[class_name] = etc_classifier
    etc_predictions[class_name] = etc_classifier.predict_proba(test_features)[:, 1]

In [None]:
#pickle the models
# Save Model as a pickle Using joblib
# Save the model as a pickle in a file 
joblib.dump(etc_models, 'etc_models.p') 
  
# Load the model from the file 
pickled_models = joblib.load('etc_models.p')  
  


In [None]:
pickled_models['toxic'].fit(train_features, train_target)
predictions['toxic'] = pickled_models['toxic'].predict_proba(test_features)[:, 1]

In [None]:
toxic = pickled_models['toxic']

In [None]:
cv_loss = np.mean(cross_val_score(toxic, train_features, train_target, cv=3, scoring='roc_auc'))
print('CV score for toxic class is {}'.format(cv_loss))

In [None]:
predictions.keys()

# Additional Code (Unused)

In [None]:
# def _train_model(train_x, test_features):
#     predictions = {'id': test['id']}
#     for class_name in class_names:
#         train_target = train[class_name]
#         classifier = LogisticRegression(solver='sag')
#         classifier.fit(train_X, train_y)
#         predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
#     return predictions

# def train_folds(X, y, fold_count, test_features):
#     fold_size = len(X) // fold_count
#     all_predections = []
#     for fold_id in range(0, fold_count):
#         fold_start = fold_size * fold_id
#         fold_end = fold_start + fold_size

#         if fold_id == fold_size - 1:
#             fold_end = len(X)

#         train_x = np.concatenate([X[:fold_start], X[fold_end:]])
#         train_y = np.concatenate([y[:fold_start], y[fold_end:]])

#         val_x = X[fold_start:fold_end]
#         val_y = y[fold_start:fold_end]
    
#         print("In fold #", fold_id)
#         all_predections.append(_train_model(train_x, train_y))
#     return all_predections

# train_folds(train_features, test_features, train_features.shape[0])

In [None]:
# submission = pd.DataFrame.from_dict(predictions)
# submission.to_csv('Logistic-Submission.csv', index=False)

In [None]:
# Setup nltk corpora path and Google Word2Vec location
google_vec_file = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/notebooks/GoogleNews-vectors-negative300.bin.gz'

In [None]:
model.most_similar('king' ,topn=4)

In [None]:
model.n_similarity(['king', 'man'], ['queen', 'woman'])