In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

import pickle

# Initiate Class Names and Open Files

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_train.csv').fillna(' ')
test = pickle.load(open('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/chat_logs/callofduty_chat.p','rb'))

In [3]:
ar = np.array(range(0,len(test)))

In [4]:
test['id'] = ar

In [18]:
test = test[0:100]

In [6]:
list_sentences_train = train['comment_text']
list_sentences_test = test['message']

In [7]:
all_text = pd.concat([list_sentences_train, list_sentences_test])

# Clean Dataset

In [8]:
import re

cl_path = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"iâ€™m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))

# Apply TF-IDF Vectorizer

In [9]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=50000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

# Train Models

# ExtraTreeClassifier

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

losses = []
predictions = {'id': test['id']}
models = {}
for class_name in class_names:
    train_target = train[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)

    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    classifier.fit(train_features, train_target)
    models[class_name] = classifier
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

CV score for class toxic is 0.9533489165216545
CV score for class severe_toxic is 0.9407712280477519
CV score for class obscene is 0.9780868191286545
CV score for class threat is 0.8594060404950042
CV score for class insult is 0.9618170899384982
CV score for class identity_hate is 0.9042010302886488


In [69]:
test_features.toarray()

array([[0.08864541, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06752337, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06550807, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06367916, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.08214958, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07227823, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [63]:
check = [['message','id','toxic','severe_toxic','obscene','threat','insult','identity_hate']]

for item in zip(*predictions.values()):
    check.append([list_sentences_test[item[0]],item[0],item[1],item[2],item[3],item[4],item[5],item[6]])

In [64]:
x = pd.DataFrame(check)

In [65]:
new_header = x.iloc[0] #grab the first row for the header
x = x[1:] #take the data less the header row
x.columns = new_header #set the header row as the df header

In [70]:
x.shape

(1208, 8)

In [48]:
x.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [71]:
models

{'toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'severe_toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'obscene': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_node

In [17]:
predictions.keys()

dict_keys(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [72]:
#pickle the models
# Save Model as a pickle Using joblib
import pickle
from sklearn.externals import joblib

# Save the model as a pickle in a file 
joblib.dump(models, 'twitch_models.p') 

# Load the model from the file 
pickled_models = joblib.load('twitch_models.p')  
  


In [None]:
pickled_models['toxic'].fit(train_features, train_target)
predictions['toxic'] = pickled_models['toxic'].predict_proba(test_features)[:, 1]

In [None]:
toxic = pickled_models['toxic']

In [None]:
cv_loss = np.mean(cross_val_score(toxic, train_features, train_target, cv=3, scoring='roc_auc'))
print('CV score for toxic class is {}'.format(cv_loss))