In [59]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

import re
import string
import pickle

import os
import gensim

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_train.csv').fillna(' ')
test = pd.read_csv('/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleaned_test.csv').fillna(' ')

In [3]:
#remove non-ascii characters
def remove_non_ascii(text):
    text  = "".join([char for char in text if char in string.printable])
    text = re.sub('[0-9]+', '', text)
    return text

train['comment_text'] = train['comment_text'].apply(lambda x: remove_non_ascii(x))
test['comment_text'] = test['comment_text'].apply(lambda x: remove_non_ascii(x))

In [4]:
list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

In [5]:
len(train)

159571

In [7]:
toxic = list_sentences_train.loc[train['toxic'] == 1]
severe_toxic = list_sentences_train.loc[train['toxic'] == 1]
obscene = list_sentences_train.loc[train['toxic'] == 1]
threat = list_sentences_train.loc[train['toxic'] == 1]
insult = list_sentences_train.loc[train['toxic'] == 1]
identity_hate = list_sentences_train.loc[train['toxic'] == 1]

# Clean Dataset

In [9]:

cl_path = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/toxic_comment_classification/jigsaw-toxic-comment-classification-challenge/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
toxic_text = []
severe_toxic_text = []
obscene_text = []
threat_text = []
insult_text = []
identity_hate_text = []

for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))
    
for text in toxic:
    toxic_text.append(clean_word(text))

for text in severe_toxic_text:
    severe_toxic_text.append(clean_word(text))

for text in obscene_text:
    obscene_text.append(clean_word(text))

for text in threat:
    threat_text.append(clean_word(text))

for text in insult:
    insult_text.append(clean_word(text))

for text in identity_hate:
    identity_hate_text.append(clean_word(text))

# Apply TF-IDF Vectorizer

In [15]:
#used for EFC
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
word_vectorizer.fit(all_text)

train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [64]:
#ties the TF-IDF keys to the values
for item in zip(list(word_vectorizer.vocabulary_.keys())[:10],list(word_vectorizer.vocabulary_.values())[:10]):
    print(item)

('explanation', 9455)
('edits', 8445)
('username', 28238)
('hardcore', 11961)
('metallica', 16749)
('fan', 9724)
('reverted', 22598)
('vandalisms', 28368)
('just', 14495)
('closure', 4861)


In [10]:
#used for logit
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
char_vectorizer.fit(all_text)

train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [73]:
char_vectorizer.get_params

<bound method BaseEstimator.get_params of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)>

# Apply CountVectorizers

In [None]:
#used for logit
count_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
count_vec_fit = count_vectorizer.fit(all_text)

train_count_features = count_vectorizer.transform(train_text)
test_count_features = count_vectorizer.transform(test_text)

In [None]:
train_count_features.toarray().sum(axis=0)
count_df = pd.DataFrame(count_vec_fit.get_feature_names())
count_df['counts'] = train_count_features.toarray().sum(axis=0)

# CountVectorizers (for each individual topic/feature)

In [None]:
#toxic
toxic_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
toxic_vec_fit = toxic_vectorizer.fit(all_text)

toxic_count_features = toxic_vectorizer.transform(toxic_text)

In [None]:
toxic_count_df = pd.DataFrame(toxic_vec_fit.get_feature_names())
toxic_count_df['counts'] = toxic_count_features.toarray().sum(axis=0)

In [None]:
#severe_toxic
severe_toxic_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
severe_toxic_vec_fit = severe_toxic_vectorizer.fit(all_text)

severe_toxic_count_features = severe_toxic_vectorizer.transform(severe_toxic_text)

In [None]:
severe_toxic_count_df = pd.DataFrame(severe_toxic_vec_fit.get_feature_names())
severe_toxic_count_df['counts'] = severe_toxic_count_features.toarray().sum(axis=0)

In [None]:
#obscene
obscene_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
obscene_vec_fit = obscene_vectorizer.fit(all_text)

obscene_count_features = obscene_vectorizer.transform(obscene_text)

In [None]:
obscene_count_df = pd.DataFrame(obscene_vec_fit.get_feature_names())
obscene_count_df['counts'] = obscene_count_features.toarray().sum(axis=0)

In [None]:
#threat
threat_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
threat_vec_fit = threat_vectorizer.fit(all_text)

threat_count_features = threat_vectorizer.transform(threat_text)

In [None]:
threat_count_df = pd.DataFrame(threat_vec_fit.get_feature_names())
threat_count_df['counts'] = threat_count_features.toarray().sum(axis=0)

In [None]:
#insult
insult_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
insult_vec_fit = insult_vectorizer.fit(all_text)

insult_count_features = insult_vectorizer.transform(insult_text)

In [None]:
insult_count_df = pd.DataFrame(insult_vec_fit.get_feature_names())
insult_count_df['counts'] = insult_count_features.toarray().sum(axis=0)

In [None]:
#identity_hate
identity_hate_vectorizer = CountVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=30000)
identity_hate_vec_fit = identity_hate_vectorizer.fit(all_text)

identity_hate_count_features = identity_hate_vectorizer.transform(identity_hate_text)

In [None]:
identity_hate_count_df = pd.DataFrame(identity_hate_vec_fit.get_feature_names())
identity_hate_count_df['counts'] = identity_hate_count_features.toarray().sum(axis=0)

In [11]:
def create_df_of_toxic_category(category_string):
    '''category string must match train dataframe column name exactly ''' 
    filtered_sentences = list_sentences_train.loc[train[category_string] == 1]
    category_vectorizer = CountVectorizer(
        strip_accents='unicode',
        analyzer='word',
        ngram_range=(1, 1),
        stop_words='english',
        max_features=30000)
    category_vec_fit = category_vectorizer.fit(filtered_sentences)

    category_count_features = category_vectorizer.transform(filtered_sentences)
    category_count_df = pd.DataFrame(category_vec_fit.get_feature_names())
    category_count_df['counts'] = category_count_features.toarray().sum(axis=0)
    category_count_df.rename(columns = {0: 'word'})
    return category_count_df, filtered_sentences

In [12]:
class_specific_vocab_dict = dict()
class_specific_sentences = dict()
for class_name in class_names:
    class_specific_vocab_dict[class_name] = \
        create_df_of_toxic_category(class_name)[0].sort_values('counts', ascending = False)
    class_specific_sentences[class_name] =\
            create_df_of_toxic_category(class_name)[1]

In [13]:
toxic_string = ''
for line in class_specific_sentences['toxic']:
    toxic_string+=' ' + line

In [17]:
word_vectorizer.get_feature_names()

['_',
 '__',
 '___',
 '____',
 '_____',
 '______',
 '__toc__',
 '_id',
 '_l',
 '_noticeboard',
 'aa',
 'aaa',
 'aaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'aad',
 'aah',
 'aahahahahahahjaahahahahahahaahh',
 'aaliyah',
 'aan',
 'aap',
 'aardvark',
 'aaron',
 'aas',
 'aave',
 'ab',
 'aba',
 'aback',
 'abad',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abba',
 'abbas',
 'abbey',
 'abbot',
 'abbott',
 'abbrev',
 'abbreviate',
 'abbreviated',
 'abbreviation',
 'abbreviations',
 'abby',
 'abc',
 'abd',
 'abdel',
 'abdelkader',
 'abducted',
 'abduction',
 'abdul',
 'abdullah',
 'abe',
 'abeed',
 'abel',
 'aber',
 'abf',
 'abhira',
 'abhiras',
 'abhishek',
 'abhor',
 'abhorrent',
 'abi',
 'abide',
 'abiding',
 'abigail',
 'abilities',
 'ability',
 'abiogenesis',
 'abit',
 'abject',
 'abkhazia',
 'able',
 'abnormal',
 'aboard',
 'abode',
 'abolish',
 'abolished',
 'abolition',
 'abolitionist',
 'abomination',
 'aboriginal',
 'aborigines',
 'aborted',
 'abortion',
 'abo

In [16]:
list(word_vectorizer.transform([toxic_string]).toarray()[0] == word_vectorizer.transform([toxic_string]).toarray().max()).index(1)

10704

In [18]:
word_vectorizer.get_feature_names()[10704]

'fucksex'

In [19]:
toxic_sentences_transformed = word_vectorizer.transform(class_specific_sentences['toxic']).toarray()

In [20]:
toxic_sentences_transformed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
toxic_sentences_transformed.sum(axis=0).max()

743.4650972457656

In [22]:
class_specific_vocab_dict['identity_hate'].head()

Unnamed: 0,0,counts
4226,nigger,2969
2225,fat,1322
3328,jew,1315
2535,gay,918
2443,fuck,880


In [23]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [24]:
toxic_df = create_df_of_toxic_category('toxic')

In [25]:
toxic_df.sort_values('counts',ascending = False)

AttributeError: 'tuple' object has no attribute 'sort_values'

In [26]:
toxic_df.rename(columns = {0: 'word'})

AttributeError: 'tuple' object has no attribute 'rename'

In [27]:
word_vectorizer.transform

<bound method TfidfVectorizer.transform of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)>

# Reduce Dimensions

# Train Models

In [None]:
from sklearn.model_selection import train_test_split
X_break, X_test, y_break, y_test = train_test_split(no_fraud_result, result_df['isFraud'], \
                                                    test_size=0.2, stratify=result_df['isFraud'],
                                                    random_state=42)
#random_state generates a set kind of deck for each random state. 41 will always be the same kind of split

# X_train, X_val, y_train, y_val = train_test_split(X_break, y_break, \
#                                                     test_size=0.25, stratify=y_break,
#                                                     random_state=42)

In [57]:
train['toxic']

0         0
1         0
2         0
3         0
4         0
5         0
6         1
7         0
8         0
9         0
10        0
11        0
12        1
13        0
14        0
15        0
16        1
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
159541    1
159542    0
159543    0
159544    0
159545    0
159546    1
159547    0
159548    0
159549    0
159550    0
159551    0
159552    0
159553    0
159554    1
159555    0
159556    0
159557    0
159558    0
159559    0
159560    0
159561    0
159562    0
159563    0
159564    0
159565    0
159566    0
159567    0
159568    0
159569    0
159570    0
Name: toxic, Length: 159571, dtype: int64

In [50]:
train_features

<159571x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 3645794 stored elements in Compressed Sparse Row format>

# Applying Logistic Regression

In [58]:
#this is kept for heroku purposes
train_features = train_word_features
test_features = test_word_features

In [56]:
losses

[0.971021009852267,
 0.9857997393949715,
 0.9858932794065138,
 0.9834772645391503,
 0.9769462290631367,
 0.9748322041596338]

In [30]:
losses = []
log_predictions = {}
log_predictions = {'id': test['id']}
log_models = {}
for class_name in class_names:
    train_target = train[class_name]
    log_classifier = LogisticRegression(solver='sag')
    log_classifier.fit(train_features, train_target)
    
    print('Accuracy of logistic regression classifier on {} set: {:.5f}'.format(class_name,log_classifier.score(train_features, train_target)))
    
    cv_loss = np.mean(cross_val_score(log_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    log_models[class_name] = log_classifier
    log_predictions[class_name] = log_classifier.predict_proba(test_features)[:, 1]


Accuracy of logistic regression classifier on toxic set: 0.96202
CV score for class toxic is 0.9702816821525904
Accuracy of logistic regression classifier on severe_toxic set: 0.99104
CV score for class severe_toxic is 0.9857997691866469
Accuracy of logistic regression classifier on obscene set: 0.98003
CV score for class obscene is 0.9859225536450635
Accuracy of logistic regression classifier on threat set: 0.99729
CV score for class threat is 0.9823473609706234
Accuracy of logistic regression classifier on insult set: 0.97317
CV score for class insult is 0.9769462152486753
Accuracy of logistic regression classifier on identity_hate set: 0.99240
CV score for class identity_hate is 0.9749884284130234


In [39]:
# Run CV with 5 folds (logit)
from sklearn import linear_model

from sklearn.model_selection import GridSearchCV

losses = []
log_predictions = {}
log_predictions = {'id': test['id']}
log_models = {}
penalty = ['l2']
C = np.logspace(0, 4, 10)
param_grid = dict(C=C, penalty=penalty)

for class_name in class_names:
    train_target = train[class_name]
    logistic = LogisticRegression(solver='sag')
    logistic_grid = GridSearchCV(logistic, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    logistic_grid.fit(train_features, train_target)
#     log_classifier = LogisticRegression(solver='sag')
#     log_classifier.fit(train_features, train_target)
    
    print('Accuracy of logistic regression classifier on {} set: {:.5f}'.format(class_name,logistic_grid.score(train_features, train_target)))
    
    cv_loss = np.mean(cross_val_score(logistic_grid, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    log_models[class_name] = log_classifier
    log_predictions[class_name] = logistic_grid.predict_proba(test_features)[:, 1]

Accuracy of logistic regression classifier on toxic set: 0.98966
CV score for class toxic is 0.971021009852267
Accuracy of logistic regression classifier on severe_toxic set: 0.99242
CV score for class severe_toxic is 0.9857997393949715
Accuracy of logistic regression classifier on obscene set: 0.99357
CV score for class obscene is 0.9858932794065138
Accuracy of logistic regression classifier on threat set: 0.99838
CV score for class threat is 0.9834772645391503
Accuracy of logistic regression classifier on insult set: 0.98785
CV score for class insult is 0.9769462290631367
Accuracy of logistic regression classifier on identity_hate set: 0.99028
CV score for class identity_hate is 0.9748322041596338


In [52]:
log_predictions['toxic']

array([0.99833293, 0.00356479, 0.02498918, ..., 0.00510029, 0.01680948,
       0.9965585 ])

In [49]:
log_models['toxic']

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=42)
# fit a model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(trainX, trainy)
# predict probabilities
probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(testy, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(testy, probs)
# plot no skill
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
pyplot.plot(fpr, tpr, marker='.')
# show the plot
pyplot.show()

TypeError: len() of unsized object

In [45]:
import matplotlib.pyplot as plt

In [46]:
model_list = [log_models['toxic'],
              log_models['severe_toxic'],
              log_models['obscene'],
              log_models['threat'],
              log_models['insult'],
              log_models['identity_hate']
#               logistic_grid.best_estimator_, 
#               svm_grid.best_estimator_, 
#               gnb_best,
#               rf_random.best_estimator_
#               knn_grid.best_estimator_,
#               CV_rfc.best_estimator_,
#              dc
             ]

model_name = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] 
 
# Plot ROC curve for all my models
fig, ax = plt.subplots(figsize=(10,8))
for i, model in enumerate(model_list):
    y_pred = list(model.predict_proba(X_val)[:,1])
    fpr, tpr, threshold = metrics.roc_curve(y_val, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label = (model_name[i] + ' AUC = %0.4f' % roc_auc),linewidth=1.0)

plt.legend(loc = 'lower right')
plt.title('Receiver Operating Characteristic')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

NameError: name 'gnb_best' is not defined

In [None]:
#pickle the models
# Save Model as a pickle Using joblib
import pickle
from sklearn.externals import joblib
  
# Save the model as a pickle in a file 
joblib.dump(log_models, 'Logistic_Regression_models.p')
pickle.dump(train_char_features, open("train_char_features_vectorizer.p", "wb"))
pickle.dump(test_char_features, open("test_char_features_vectorizer.p", "wb"))
pickle.dump(word_vectorizer.fit(all_text), open("log_word_vectorizer.p", "wb"))

  
# Load the model from the file 
# pickled_models = joblib.load('models.p')

# ExtraTreeClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

losses = []
etc_predictions = {'id': test['id']}
etc_models = {}
for class_name in class_names:
    train_target = train[class_name]
    etc_classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_loss = np.mean(cross_val_score(etc_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    etc_classifier.fit(train_features, train_target)
    etc_models[class_name] = etc_classifier
    etc_predictions[class_name] = etc_classifier.predict_proba(test_features)[:, 1]

In [None]:
#pickle the models
# Save Model as a pickle Using joblib
# Save the model as a pickle in a file 
joblib.dump(etc_models, 'etc_models.p') 
  
# Load the model from the file 
pickled_models = joblib.load('etc_models.p')  
  


In [None]:
pickled_models['toxic'].fit(train_features, train_target)
predictions['toxic'] = pickled_models['toxic'].predict_proba(test_features)[:, 1]

# Additional Code (Unused)

In [None]:
# def _train_model(train_x, test_features):
#     predictions = {'id': test['id']}
#     for class_name in class_names:
#         train_target = train[class_name]
#         classifier = LogisticRegression(solver='sag')
#         classifier.fit(train_X, train_y)
#         predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
#     return predictions

# def train_folds(X, y, fold_count, test_features):
#     fold_size = len(X) // fold_count
#     all_predections = []
#     for fold_id in range(0, fold_count):
#         fold_start = fold_size * fold_id
#         fold_end = fold_start + fold_size

#         if fold_id == fold_size - 1:
#             fold_end = len(X)

#         train_x = np.concatenate([X[:fold_start], X[fold_end:]])
#         train_y = np.concatenate([y[:fold_start], y[fold_end:]])

#         val_x = X[fold_start:fold_end]
#         val_y = y[fold_start:fold_end]
    
#         print("In fold #", fold_id)
#         all_predections.append(_train_model(train_x, train_y))
#     return all_predections

# train_folds(train_features, test_features, train_features.shape[0])

In [None]:
# submission = pd.DataFrame.from_dict(predictions)
# submission.to_csv('Logistic-Submission.csv', index=False)

In [None]:
# Setup nltk corpora path and Google Word2Vec location
google_vec_file = '/Users/randy/Documents/GitHub/Twitch_Chat_Harassment/notebooks/GoogleNews-vectors-negative300.bin.gz'

In [None]:
model.most_similar('king' ,topn=4)

In [None]:
model.n_similarity(['king', 'man'], ['queen', 'woman'])