In [35]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

import pickle

In [21]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_train.csv').fillna(' ')
test = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_test.csv').fillna(' ')

list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

# Clean Dataset

In [2]:
import re

cl_path = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))

In [26]:
all_text.head(20)

0     explanation why the edits made under my userna...
1     d aww ! he matches this background colour i am...
2     hey man i am really not trying to edit war it ...
3     more i cannot make any real suggestions on imp...
4     you sir are my hero any chance you remember wh...
5     congratulations from me as well use the tools ...
6          cocksucker before you piss around on my work
7     your vandalism to the matt shirvington article...
8     sorry if the word nonsense was offensive to yo...
9     alignment on this subject and which are contra...
10    fair use rationale for thanks for uploading i ...
11    bbq be a man and lets discuss it - maybe over ...
12    hey what is it @ | talk what is it an exclusiv...
13    before you start throwing accusations and warn...
14    oh and the girl above started her arguments wi...
15    juelz santanas age in juelz santana was years ...
16    bye ! do not look come or think of comming bac...
17    redirect talk : voydan pop georgiev - cher

# Apply TF-IDF Vectorizer

In [3]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000)
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [4]:
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# Train Models

# Applying Logistic Regression

In [5]:
# from sklearn.model_selection import cross_val_score, cross_val_predict

# train_features = hstack([train_char_features, train_word_features])
# test_features = hstack([test_char_features, test_word_features])

# losses = []
# predictions = {'id': test['id']}
# for class_name in class_names:
#     train_target = train[class_name]
#     classifier = LogisticRegression(solver='sag')
#     classifier.fit(train_features, train_target)Zip 
#     predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

In [22]:
submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('Logistic-Submission.csv', index=False)

In [23]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999919,0.228113,0.999591,0.062074,0.983955,0.300168
1,0000247867823ef7,0.003444,0.001623,0.001824,0.000229,0.003144,0.00178
2,00013b17ad220c46,0.015389,0.003518,0.011505,0.000575,0.004145,0.001607
3,00017563c3f7919a,0.002364,0.001378,0.001815,0.00055,0.002073,0.00041
4,00017695ad8997eb,0.011967,0.001581,0.004789,0.000618,0.003887,0.000912


In [27]:
test.loc[test['id'].isin(['0000247867823ef7'])]


Unnamed: 0,id,comment_text
1,0000247867823ef7,= = from rfc = = the title is fine as it is imo


In [18]:
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0


In [8]:
def _train_model(train_x, test_features):
    predictions = {'id': test['id']}
    for class_name in class_names:
        train_target = train[class_name]
        classifier = LogisticRegression(solver='sag')
        classifier.fit(train_X, train_y)
        predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    return predictions

def train_folds(X, y, fold_count, test_features):
    fold_size = len(X) // fold_count
    all_predections = []
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]
    
        print("In fold #", fold_id)
        all_predections.append(_train_model(train_x, train_y))
    return all_predections

In [9]:
train_features.shape[0]

159571

In [44]:
test_features

<153164x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 131127897 stored elements in COOrdinate format>

In [43]:
# train_folds(train_features, test_features, train_features.shape[0])

# ExtraTreeClassifier

In [32]:
# from sklearn.ensemble import ExtraTreesClassifier

# losses = []
# predictions = {'id': test['id']}
# models = {}
# for class_name in class_names:
#     train_target = train[class_name]
#     classifier = ExtraTreesClassifier(n_estimators=30)
    
#     cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
#     losses.append(cv_loss)
#     print('CV score for class {} is {}'.format(class_name, cv_loss))
    
#     classifier.fit(train_features, train_target)
#     models[class_name] = classifier
#     predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

CV score for class toxic is 0.9551495044926335
CV score for class severe_toxic is 0.9497141653645179
CV score for class obscene is 0.9809746042919114
CV score for class threat is 0.8694049037691535
CV score for class insult is 0.9622948947904963
CV score for class identity_hate is 0.9146643199604466


In [34]:
models

{'toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'severe_toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'obscene': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_node

In [38]:
#pickle the models
# Save Model as a pickle Using joblib
import pickle
from sklearn.externals import joblib
  
# Save the model as a pickle in a file 
joblib.dump(models, 'models.p') 
  
# Load the model from the file 
pickled_models = joblib.load('models.p')  
  


In [45]:
pickled_models['toxic'].fit(train_features, train_target)
predictions['toxic'] = pickled_models['toxic'].predict_proba(test_features)[:, 1]

In [46]:
toxic = pickled_models['toxic']

In [None]:
cv_loss = np.mean(cross_val_score(toxic, train_features, train_target, cv=3, scoring='roc_auc'))
print('CV score for toxic class is {}'.format(cv_loss))