In [None]:
# Nikita Bawane, Ritu Gangwal, Utkarsh Ujwal

# Importing libraries
import pandas as pd

#Basic
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display, HTML

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

! pip install -U gensim
import gensim
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB

import string 
punctuation_set = set(string.punctuation)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from wordcloud import WordCloud
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score
from statistics import mean

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

! pip install plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout



### Loading the data

In [None]:
# Load the training and test splits.
# NOTE(review): these absolute paths are specific to one machine; point them at your
# local copy of the data before running.
train = pd.read_csv(r'C:\Users\ritu2\Desktop\UIC MSBA\Sem 2\Text Analytics\Project\Code\train.csv')
test = pd.read_csv(r'C:\Users\ritu2\Desktop\UIC MSBA\Sem 2\Text Analytics\Project\Code\test.csv')

# Only the last expression of a cell is rendered, so a bare `test.head()` in the middle
# of the cell was silently discarded. Display both previews explicitly.
display(test.head())
display(train.head())

**##########################################################################**

We see that there are 6 levels of toxicity in the training dataset:

Toxic, Severe toxic, Obscene, Threat, Insult, Identity Hate

These seem to be in the increasing level of toxicity


Let's look at some random comments from each toxicity class.


In [None]:
toxic_label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for toxicity in toxic_label:
    # Draw one index *label* among the rows flagged for this class.
    rand_text = np.random.choice(train[train[toxicity]==1].index,size=1)[0]
    # The drawn value is a label, so look it up with .loc (not positional .iloc), and
    # address the text column by name instead of by a hard-coded position.
    print('Let us see a comment of {} text\n'.format(toxicity), train.loc[rand_text, 'comment_text'],'\n')

No. of toxic and non-toxic comments

In [None]:
# Numeric columns of the training frame (the toxicity flags at this point in the notebook).
numeric_cols = train.select_dtypes(include=np.number)

# Number of comments flagged per class.
count_labels = numeric_cols.sum(axis=0)

# A comment is toxic when any flag is non-zero. The vectorized check replaces a
# row-wise Python `any(...)` call and gives the same count much faster.
total_toxic_comments = (numeric_cols != 0).any(axis=1).sum()

print('Total number of Toxic texts = {}, out of {}'.format(total_toxic_comments, train.shape[0]))
print('Total number of Non-Toxic comments: {}, out of {}'.format(train.shape[0] - total_toxic_comments, train.shape[0]))
count_labels

Plotting the graph for toxic comments

In [None]:
# Bar chart of how often each toxicity class occurs, drawn on an explicit Axes.
dist_plot, label_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=count_labels.index, y=count_labels, ax=label_ax)
label_ax.set_xlabel('Toxicity Class')
label_ax.set_ylabel('Occurance');

We observe that the dataset is imbalanced. We have only considered those comments where the flag for at least one toxicity class is set to 1, and still we observe many more Toxic comments than Severe toxic or Threat.

Further exploration shows that the label toxic has the most observations in the training dataset, while threat has the least.


In [None]:
# Add a 'none' column: 0 when every toxicity flag is 0, otherwise 1.
# Compute it from the toxicity flags only, so re-running this cell does not feed the
# 'none' column back into its own definition.
base_labels = [label for label in toxic_label if label != 'none']
train['none'] = (train[base_labels].max(axis=1) == 1).astype(int)

# Register the new column once; a bare append would add a duplicate on every re-run.
if 'none' not in toxic_label:
    toxic_label.append('none')

# Only the last expression of a cell is rendered, so show the preview explicitly.
display(train.head(10))
toxic_label

**Let's check the correlation between various toxic comments**

In [None]:
# Pairwise correlation between every pair of label columns.
# One dict per row label, keyed by column label.
rows = []
for row_label in toxic_label:
    row_values = {}
    for col_label in toxic_label:
        row_values[col_label] = train[row_label].corr(train[col_label])
    rows.append(row_values)

train_corr = pd.DataFrame(rows, index=toxic_label)
train_corr

### Let's check the correlation of these new features we have created and see if these assumptions even hold true in any case.

In [None]:
# Heatmap of the label-to-label correlation matrix (train_corr).
# The colour scale is fixed to [-1, 1] and centred on 0; each cell is annotated with its value.
# (seaborn is already imported in the first cell; the re-import below is redundant but harmless.)
import seaborn as sns
corr_feature = sns.heatmap(train_corr, vmin=-1, vmax=1, center=0.0, annot=True)

The correlation matrix shows interesting things :

'toxic' is clearly correlated with 'obscene' and 'insult' (0.68 and 0.65)

'toxic' and 'severe_toxic' have only a 0.31 correlation factor

'insult' and 'obscene' have a correlation factor of 0.74

From my point of view, there are several combinations that are worth digging into :

'toxic' <-> 'severe_toxic'. The semantic of these two categories seems to show some kind of graduation between them

'toxic' <-> 'insult' and 'toxic' <-> 'obscene'

'insult' <-> 'obscene'

### Let's check some assumptions which we have about the toxic comments we witness on social media and see if they have any correlation with the "Toxicity"

In [None]:
# Work on a copy so the engineered columns do not leak into `train`.
df_1 = train.copy()

# Total length of the comment text (in characters).
df_1['total_length'] = df_1['comment_text'].apply(len)

# Angry writers tend to use capitals.
# Note: despite its name, CAPITAL_WORDS counts upper-case *characters*, not words.
df_1['CAPITAL_WORDS'] = df_1['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))

# Share of upper-case characters. Vectorized division: an empty comment yields NaN
# instead of raising ZeroDivisionError, and there is no slow row-wise apply.
df_1['CAPS_LENGTH'] = df_1['CAPITAL_WORDS'] / df_1['total_length']

# Toxic comments generally contain exclamation marks (a way to depict emotion).
df_1['exclamation_marks'] = df_1['comment_text'].apply(lambda comment: comment.count('!'))

# People who are angry and spread toxicity generally do not engage in civil discussion or ask questions.
df_1['question_marks'] = df_1['comment_text'].apply(lambda comment: comment.count('?'))

# Angry and toxic people don't really care about proper grammar, and hence punctuation.
df_1['punctuation'] = df_1['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))

# Angry and toxic people generally tend to repeat words.
df_1['unique_words'] = df_1['comment_text'].apply(lambda comment: len(set(comment.split())))

In [None]:
# Engineered features (rows) correlated against each toxicity label (columns).
features = (
    'total_length',
    'CAPITAL_WORDS',
    'CAPS_LENGTH',
    'exclamation_marks',
    'question_marks',
    'punctuation',
    'unique_words',
)
labels = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

rows = []
for feature_name in features:
    feature_values = df_1[feature_name]
    rows.append({label_name: feature_values.corr(df_1[label_name]) for label_name in labels})

df_1_corr = pd.DataFrame(rows, index=features)
df_1_corr

In [None]:
# Heatmap of the feature-vs-label correlation matrix (df_1_corr).
# The colour scale is clipped to [-0.1, 0.1]; the annotations still show the actual values.
# (seaborn is already imported in the first cell; the re-import below is redundant but harmless.)
import seaborn as sns
corr_extrafeature = sns.heatmap(df_1_corr, vmin= -0.1, vmax= 0.1, center=0.0,annot= True)

From the above heatmap we can conclude that a few of our assumptions are kind of accurate:

People do use CAPITAL letter words when writing a toxic comment, as we observe a positive correlation.

People don't really use proper grammar (eg punctuation) or unique words while being toxic.

People tend to use exclamation marks (!!!) when angry to show the severity of their emotions.

**Checking if the Toxic comments are longer or the Non-Toxic ones**

In [None]:
# Character count of every comment, stored next to the text.
train['comment_length'] = train['comment_text'].map(len)
train.head()

Below is the plot showing the comment length frequency. As noticed, most of the comments are short, with only a few comments longer than 1000 characters.

In [None]:
# Histogram (20 bins, no KDE curve) of comment lengths in characters.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm against the installed version.
sns.distplot(train['comment_length'], kde=False, bins=20, color="steelblue")

In [None]:
# Side-by-side histograms of log10(character length) for toxic vs non-toxic comments.
sns.set(palette='rocket')
# NOTE: `plot` rebinds the name imported from plotly.offline in the first cell.
plot, plotobj = plt.subplots(1,2,sharex=True, figsize=(15,6))

# distplot selects its target Axes through the `ax` keyword; `plotobj=` is not an
# accepted argument and raises TypeError.
_=sns.distplot(np.log10(train.loc[train.toxic==1,'comment_length']),kde=False, bins=15,ax=plotobj[0])
_=plotobj[0].set_xlabel('log (Character length of comments)')
_=plotobj[0].set_ylabel('Count')
_=plotobj[0].set_title('Toxic comments')

_=sns.distplot(np.log10(train.loc[train.toxic==0,'comment_length']),kde=False, bins=15,ax=plotobj[1])
_=plotobj[1].set_xlabel('log (Character length of comments)')
_=plotobj[1].set_title('Non-Toxic comments')

print('Mean character length of toxic comments: {}'.format(train.loc[train.toxic==1,'comment_length'].mean()))
print('Mean character length of clean comments: {}'.format(train.loc[train.toxic==0,'comment_length'].mean()))

In [None]:
# word clouds
def W_Cloud(token):
    """Draw a word cloud from every training comment flagged with `token`.

    `token` is the name of a 0/1 label column of the module-level `train` frame.
    All comments whose flag equals 1 are joined with single spaces and passed to
    WordCloud; the figure is shown immediately and nothing is returned.
    """
    flagged_rows = train[train[token] == 1]
    flagged_text = flagged_rows.comment_text
    corpus = pd.Series(flagged_text).str.cat(sep=' ')

    wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(corpus)

    plt.figure(figsize=(15, 10))
    recoloured = wordcloud.recolor(colormap="Blues")
    plt.imshow(recoloured, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Most common words assosiated with {token} comment", size=20)
    plt.show()

# Word cloud for the comments flagged as identity_hate.
W_Cloud('identity_hate')

## Let's do Topic Modelling using LDA and plot the t-SNE to view the result in 2D

We won't distinguish between the various categories of toxicity. We'll mark comments from any of the six categories. To make it a little more interesting, our target variable will show the number of categories observed for each comment. For example, a comment classified as toxic, severe_toxic and obscene gets a 3. I'll color the t-SNE plot to highlight comments falling in 1 or more categories.

I took 20% of the data to reduce the clutter of the picture and speed things up. Running the whole train set shows similar results.

In [None]:
trainX = train['comment_text']

# Target = number of toxicity categories a comment falls into (0..6).
# Sum only the six label columns: summing the whole frame would also add the 'none'
# flag and 'comment_length', and fails on the text columns in recent pandas.
category_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
target = train[category_cols].sum(axis=1).values

# Draw one stratified 20% sample. Only one split is ever used, so generate just one,
# and fix the seed so the sample (and the plots built on it) is reproducible.
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.20, random_state=42)
train_index, test_index = next(sss.split(trainX, target))
train_text = trainX.iloc[train_index]
train_tgt = target[train_index]

**Now we can count the word features and run LDA**

In [None]:
# TF-IDF unigram features (capped at `maxfeats` terms) for the sampled comments,
# followed by a 16-topic LDA fitted on those features.
maxfeats = 5000
word_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word',
                                  ngram_range=(1, 1), max_features=maxfeats)
# Learn the vocabulary and vectorize the same texts in one call.
train_features = word_vectorizer.fit_transform(train_text)

# Despite its name, `classifier` is the LDA topic model.
classifier = LatentDirichletAllocation(
    n_components=16,
    learning_method= 'batch',
    n_jobs=3,
    verbose=1,
)
train_lda = classifier.fit_transform(train_features, train_tgt)
train_lda.shape

We can represent the topic vectors in 2-d with t-SNE. Finally we plot the results

In [None]:
## t-SNE (t-distributed stochastic neighbour embedding)
# Project the per-comment topic vectors (train_lda) down to 2 dimensions for plotting.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` — confirm
# against the installed version.

tsne_obj = TSNE(n_components=2, perplexity=8, n_iter=1000, verbose=1, angle=0.5) # perplexity is related to the number of nearest neighbours used in other manifold learning algorithms
train_tsne = tsne_obj.fit_transform(train_lda)
# Split the embedding into its two coordinates.
x_tsne = train_tsne[:, 0]
y_tsne = train_tsne[:, 1]

In [None]:
init_notebook_mode(connected=False)

# Build one frame holding, per sampled comment, its text, its target value and its
# 2-D t-SNE coordinates, then split it into comments with target 0 (nonToxic) and
# target > 0 (Toxic).
# NOTE: `plot` rebinds the name imported from plotly.offline in the first cell.
plot = pd.DataFrame({'comment':train_text, 'class':train_tgt, 'x_axis': x_tsne, 'y_axis':y_tsne})
nonToxic = plot[plot['class'] == 0]
Toxic = plot[plot['class'] > 0]

In [None]:
# Background trace: non-toxic comments as faint grey circles; hover text is the comment.
nonToxic_Plot = Scatter(
    x = nonToxic['x_axis'],
    y = nonToxic['y_axis'],
    mode = 'markers',
    marker = dict(
      size=7,
      color='lightgray',
      symbol='circle',
      line = dict(width = 0,
        color='gray'),
      opacity = 0.3
     ),
    text=nonToxic['comment']
)

# Foreground trace: toxic comments as triangles coloured by their 'class' value.
# The colour must come from the `Toxic` frame; `notnices` is not defined anywhere
# in this notebook and raised NameError.
Toxic_Plot = Scatter(
    x = Toxic['x_axis'],
    y = Toxic['y_axis'],
    mode = 'markers',
    marker = dict(
      size=8,
      color=Toxic['class'],
      symbol='triangle-up',
      line = dict(width = 0,
        color='Darkred'),
      opacity = 0.6
     ),
    text=Toxic['comment']
)

data=[nonToxic_Plot, Toxic_Plot]

# Shared layout: no legend, grid on, no zero/axis lines.
layout = Layout(
    title = 'I see you Haters!!!!',
    showlegend=False,
    xaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=False,
        showline=False,
    ),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=False,
        showline=False,
     )
)


In [None]:
import plotly
from plotly import version
# Print the installed plotly version string; printing `version` itself only shows
# the module object.
print(plotly.__version__)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from IPython.display import HTML
# Assemble the two scatter traces and the layout, then embed the figure as HTML.
LDA_Plot = Figure(data=data, layout=layout)
HTML(LDA_Plot.to_html())

## Text Preprocessing

In [None]:
# Shared preprocessing resources used by preprocess_comment():
# NLTK's English stop-word list, kept as a set
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# tokenizer that keeps only runs of ASCII letters
tokenizer = RegexpTokenizer(r'([a-zA-Z]+)') 

def get_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with `pos_tag`, and the first letter of the
    resulting tag selects adjective, noun, verb or adverb. Any other tag falls
    back to noun.
    """
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "N":
        return wordnet.NOUN
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN


def preprocess_comment(input_str):
    """Clean one comment and return its lemmas joined by single spaces.

    Steps: strip dotted digit groups (such as IP addresses), lower-case, tokenize
    into alphabetic words, drop stop words and words of two letters or fewer, then
    lemmatize each remaining word with the part of speech given by get_pos().
    """
    without_numbers = re.sub(r"(\d{1,3}\.){1,3}\d{1,3}", "", input_str)
    lemmas = []
    for token in tokenizer.tokenize(without_numbers.lower()):
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos=get_pos(token)))
    return ' '.join(lemmas)

In [None]:
# An example
print('Original text:\n {}'.format(train['comment_text'][20]))
print('After cleaning:\n {}'.format(preprocess_comment(train['comment_text'][20])))

In [None]:
# Clean every training comment; the function is passed directly, no lambda wrapper.
train['cleaned_comment_text'] = train['comment_text'].apply(preprocess_comment)
train.head()

In [None]:
# Clean every test comment with the same pipeline used for the training set.
test['cleaned_comment_text'] = test['comment_text'].apply(preprocess_comment)
test.head()

Applying sentiment analysis - TextBlob

In [None]:
from textblob import TextBlob
#Create a function to get the polarity
def getPolarity(text):
  """Return the TextBlob polarity score of `text`."""
  blob = TextBlob(text)
  return blob.polarity

def getSentiment(score):
  """Map a polarity score to 'Negative' (< 0), 'Neutral' (== 0) or 'Positive' (otherwise)."""
  if score < 0:
    return 'Negative'
  if score == 0:
    return 'Neutral'
  return 'Positive'

# Polarity score of every cleaned comment, and the sentiment label derived from it.
train['TextBlob_Score'] = train['cleaned_comment_text'].apply(getPolarity)
# The score-to-label helper defined in this cell is getSentiment; `getAnalysis` does
# not exist in this notebook and raised NameError.
train['TextBlob_Sentiment'] = train['TextBlob_Score'].apply(getSentiment)
train.head()

VADER Sentiment Analysis

In [None]:
# Vader sentiment analysis
# A single analyzer instance is created once and reused for every comment.
SentAnaylzer = SentimentIntensityAnalyzer()
def getVaderScore(sent):
  """Return the "compound" entry of VADER's polarity scores for `sent`."""
  return SentAnaylzer.polarity_scores(sent)["compound"]

def getVaderSentiment(score):
    """Map a compound score to 'Positive' (>= 0.05), 'Negative' (<= -0.05) or 'Neutral'."""
    if score >= 0.05:
        return 'Positive'
    if score <= -0.05:
        return 'Negative'
    return 'Neutral'

# VADER compound score of every cleaned comment, and the sentiment label derived from it.
train['Vader_Score'] = train['cleaned_comment_text'].apply(getVaderScore)
train['Vader_Sentiment'] = train['Vader_Score'].apply(getVaderSentiment)
train.head()

In [None]:
def toxic(sentiment):
    """Return 1 when the sentiment label is exactly 'Negative', otherwise 0."""
    return 1 if sentiment == 'Negative' else 0

# Turn each sentiment label into a 0/1 toxicity guess (1 only for 'Negative').
train['TextBlob_Toxic'] = train['TextBlob_Sentiment'].apply(toxic)
train['Vader_Toxic'] = train['Vader_Sentiment'].apply(toxic)
train.head(10)

In [None]:
# Calculating accuracies of both methods
# accuracy of textblob: share of comments whose TextBlob toxicity guess equals the
# 'none' flag (1 = at least one toxicity label set).
# A single vectorized comparison replaces the per-row Python loop; it is aligned on
# the index, so it does not depend on the frame having a default 0..n-1 index.
accuracy1 = int((train['none'] == train['TextBlob_Toxic']).sum())
Accuracy1 = (accuracy1 / len(train))*100
print('Accuracy of TextBlob: = {}'.format(Accuracy1))

In [None]:
# accuracy of Vader: share of comments whose VADER toxicity guess equals the 'none'
# flag (1 = at least one toxicity label set).
# A single vectorized comparison replaces the per-row Python loop; it is aligned on
# the index, so it does not depend on the frame having a default 0..n-1 index.
accuracy2 = int((train['none'] == train['Vader_Toxic']).sum())
Accuracy2 = (accuracy2 / len(train))*100
print('Accuracy of Vader: = {}'.format(Accuracy2))

Since accuracy of Vader (73.58%) < TextBlob (77.37%), we will continue with TextBlob

Making a new training set with rows where textblob sentiment = 1

In [None]:
# Keep only the comments TextBlob flagged as negative, then drop the helper columns
# produced by the two sentiment analyses.
sentiment_columns = ['Vader_Score','Vader_Sentiment','Vader_Toxic','TextBlob_Score','TextBlob_Sentiment','TextBlob_Toxic']
train_new = train.loc[train['TextBlob_Toxic'] == 1].drop(columns=sentiment_columns)
train_new

In [None]:
# Size of the sentiment-filtered training set (the original run reported 37072 rows).
train_new.shape

Modelling and Evaluation

In [None]:
# Baseline classifiers, mostly with default parameters.
# max_iter is raised to 10000 for the two linear models.
clf1 = MultinomialNB()
clf2 = LogisticRegression(max_iter=10000)
clf3 = LinearSVC(max_iter=10000, dual=False)
# clf4 is created here, but every call that would use it below is commented out.
clf4 = RandomForestClassifier()

Feature Vectorization - Count vectorizer

In [None]:
# CountVectorizer over unigrams and bigrams of the cleaned text.
# NOTE(review): max_features=30 keeps only 30 terms, and max_df=200 (an absolute
# document count) drops any term found in more than 200 comments — confirm that
# such a small, rare-term vocabulary is intended.
count_vectorizer = CountVectorizer(max_features=30,ngram_range=(1, 2),
                                   analyzer='word',strip_accents='unicode',
                                   min_df=10, max_df = 200)  

# Fit the vocabulary on the filtered training set only, then reuse it for the test set.
X_train_count = count_vectorizer.fit_transform(train_new['cleaned_comment_text'])
X_test_count = count_vectorizer.transform(test['cleaned_comment_text'])

In [None]:
# TFIDF Vectorizer
tfidf_vector = TfidfVectorizer(
    ngram_range=(1, 2),          # Consider both unigrams and bi-grams
    analyzer='word',
    strip_accents='unicode',
    use_idf=True,                # must be a real bool: scikit-learn's parameter validation rejects the int 1
    min_df=10)

# Fit the vocabulary and IDF weights on the filtered training set only, then reuse
# them for the test set.
X_train_tfidf = tfidf_vector.fit_transform(train_new['cleaned_comment_text'])
X_test_tfidf = tfidf_vector.transform(test['cleaned_comment_text'])

In [None]:
# getting top 20 words
def get_topn_word_counts(vectorizer, feature_counts, n=10):
    """Return the `n` features with the largest column totals.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing `get_feature_names_out()` (newer
        scikit-learn) or `get_feature_names()` (older releases).
    feature_counts : document-term matrix (sparse or dense), one column per feature.
    n : number of rows to return.

    Returns
    -------
    pandas.DataFrame with columns 'token' and 'count', sorted by descending total.
    Ties keep the vectorizer's feature order.
    """
    # Sum each column without densifying the whole matrix; `.A` materializes every
    # cell and is not available on all sparse containers.
    count_docs = np.asarray(feature_counts.sum(axis=0)).ravel()

    # `get_feature_names` was removed from newer scikit-learn; prefer its replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        count_feature_names = vectorizer.get_feature_names_out()
    else:
        count_feature_names = vectorizer.get_feature_names()

    top_words_counts = sorted(zip(count_feature_names, count_docs), key=lambda x: x[1], reverse=True)
    word_count = pd.DataFrame(top_words_counts, columns=['token', 'count'])[:n]
    return word_count

# Top 20 features of the count-vectorized training set, by total count.
top_20_wordcounts_count = get_topn_word_counts(count_vectorizer, X_train_count, n=20)
top_20_wordcounts_count

In [None]:
# Bar chart of the top 20 count-vector features (token on x, total count on y).
twentywordplot = plt.figure(figsize=(20,10))
_ = sns.barplot(x="token", y="count", data=top_20_wordcounts_count)
_=plt.title('Top 20 words in the corpus for count vector')

In [None]:
# Top 20 TF-IDF features; here the 'count' column holds summed TF-IDF weights.
top_20_wordcounts_tfidf = get_topn_word_counts(tfidf_vector, X_train_tfidf, n=20)
# This bare expression is not the last one in the cell, so the table is not rendered.
top_20_wordcounts_tfidf

# Bar chart of the top 20 TF-IDF features.
fig = plt.figure(figsize=(20,10))
_ = sns.barplot(x="token", y="count", data=top_20_wordcounts_tfidf)
_=plt.title('Top 20 words in the corpus for tfidf')

Checking on training data - Count vectorizer

In [None]:
# With CountVectorizer and TFIDF
# The six target columns; cross_validation_score() and score() fit one binary model per label.
test_labels = ["toxic", "severe_toxic", "obscene","threat", "insult", "identity_hate"]

def cross_validation_score(classifier, X_train, y_train):
    """Cross-validate `classifier` separately on every label in `test_labels`.

    Parameters
    ----------
    classifier : scikit-learn estimator.
    X_train : feature matrix.
    y_train : frame indexable by label name; `y_train[label]` is the binary target.

    Returns
    -------
    list of `[model_name, label, mean_recall, mean_f1, mean_accuracy]`, one entry per
    label, each mean taken over 10 folds.
    """
    # Local import: cross_validate is not among the notebook's top-level imports.
    from sklearn.model_selection import cross_validate

    name = classifier.__class__.__name__
    methods = []
    for label in test_labels:
        # One 10-fold run scores all three metrics on the same fitted models, instead
        # of refitting every fold three times (once per metric).
        cv_result = cross_validate(classifier, X_train, y_train[label], cv=10,
                                   scoring=('recall', 'f1', 'accuracy'))
        methods.append([name, label,
                        cv_result['test_recall'].mean(),
                        cv_result['test_f1'].mean(),
                        cv_result['test_accuracy'].mean()])
    return methods

In [None]:
# Cross-validated recall, F1 and accuracy with CountVectorizer features for three
# baseline models (the RandomForest run, clf4, is commented out).
methods1_cv_count = pd.DataFrame(cross_validation_score(clf1, X_train_count, train_new))
methods2_cv_count = pd.DataFrame(cross_validation_score(clf2, X_train_count, train_new))
methods3_cv_count = pd.DataFrame(cross_validation_score(clf3, X_train_count, train_new))
#methods4_cv_count = pd.DataFrame(cross_validation_score(clf4, X_train_count, train_new))

In [None]:
# Stack the per-model results into one summary table for CountVectorizer.
methods_cv_count = pd.concat([methods1_cv_count, methods2_cv_count, methods3_cv_count]) #, methods4_cv_count])
methods_cv_count.columns = ['Model', 'Label', 'Recall', 'F1','Accuracy']
# reset_index() keeps the old per-model row numbers in an 'index' column, which is
# why the display below selects the five named columns explicitly.
meth_cv_count = methods_cv_count.reset_index()
meth_cv_count[['Model', 'Label', 'Recall', 'F1','Accuracy']]

Feature Vectorization - TFIDF

In [None]:
# Cross-validated recall, F1 and accuracy with TF-IDF features for three baseline
# models (the RandomForest run, clf4, is commented out).
methods1_cv_tfidf = pd.DataFrame(cross_validation_score(clf1, X_train_tfidf, train_new))
methods2_cv_tfidf = pd.DataFrame(cross_validation_score(clf2, X_train_tfidf, train_new))
methods3_cv_tfidf = pd.DataFrame(cross_validation_score(clf3, X_train_tfidf, train_new))
#methods4_cv_tfidf = pd.DataFrame(cross_validation_score(clf4, X_train_tfidf, train_new))

In [None]:
# Stack the per-model results into one summary table for TF-IDF.
methods_cv = pd.concat([methods1_cv_tfidf, methods2_cv_tfidf, methods3_cv_tfidf]) #, methods4_cv_tfidf])
methods_cv.columns = ['Model', 'Label', 'Recall', 'F1','Accuracy']
# reset_index() keeps the old per-model row numbers in an 'index' column, which is
# why the display below selects the five named columns explicitly.
meth_cv_tfidf = methods_cv.reset_index()
meth_cv_tfidf[['Model', 'Label', 'Recall', 'F1','Accuracy']]

On test data:

In [None]:
# Read the test labels.
# NOTE(review): absolute, machine-specific path — adjust to your local copy of the data.
# score() treats a label value of -1 as "not scored" and excludes those rows.
test_y = pd.read_csv(r'C:\Users\ritu2\Desktop\UIC MSBA\Sem 2\Text Analytics\Project\Code\test_labels.csv')
test_y.head()
#len(test_y)  =  153164

In [None]:
def score(classifier, X_train, y_train, X_test, y_test):
    """Fit `classifier` per label and evaluate it on the scored part of the test set.

    Parameters
    ----------
    classifier : scikit-learn estimator; it is refitted once per label.
    X_train, X_test : feature matrices for the training and test sets.
    y_train : frame indexable by label name with the training targets.
    y_test : frame indexable by label name with the test targets; rows whose label
        value is -1 are excluded from every metric.

    Returns
    -------
    list of `[model_name, label, recall, f1, accuracy, confusion_matrix]`, one entry
    per label in `test_labels`. Recall and F1 use `average="weighted"`.
    """
    methods = []
    name = classifier.__class__.__name__

    for label in test_labels:
        classifier.fit(X_train, y_train[label])
        predicted = classifier.predict(X_test)

        # Build the "scored rows" mask once, from the `y_test` argument (not from a
        # module-level frame), and apply it to both truth and predictions.
        scored = (y_test[label] != -1).values
        y_true = y_test[label].values[scored]
        y_pred = predicted[scored]

        recall = recall_score(y_true, y_pred, average="weighted")
        f1 = f1_score(y_true, y_pred, average="weighted")
        accuracy = accuracy_score(y_true, y_pred)
        conf_mat = confusion_matrix(y_true, y_pred)
        methods.append([name, label, recall, f1, accuracy, conf_mat])
    return methods

Testing for count vectorizer

In [None]:
# Test-set recall, F1, accuracy and confusion matrix with CountVectorizer features
# for three models (the RandomForest run, clf4, is commented out).
method1 = pd.DataFrame(score(clf1, X_train_count, train_new, X_test_count, test_y))
method2 = pd.DataFrame(score(clf2, X_train_count, train_new, X_test_count, test_y))
method3 = pd.DataFrame(score(clf3, X_train_count, train_new, X_test_count, test_y))
#method4 = pd.DataFrame(score(clf4, X_train_count, train_new, X_test_count, test_y))

In [None]:
# Stack the per-model test results into one table and show the scalar metrics
# (the confusion matrices stay in `methods` but are left out of the display).
methods = pd.concat([method1, method2, method3]) #, method4])
methods.columns = ['Model', 'Label', 'Recall', 'F1', 'Accuracy','Confusion_Matrix']
meth = methods.reset_index()
meth[['Model', 'Label', 'Recall', 'F1', 'Accuracy']]

In [None]:
# Box plot of per-label test *accuracy* for each model (the plotted column is
# 'Accuracy', not F1), with the individual label scores overlaid as points.
ax = sns.boxplot(x='Model', y='Accuracy', data=methods, palette="Blues")
sns.stripplot(x='Model', y='Accuracy', data=methods, size=8, jitter=True, edgecolor="gray", linewidth=2, palette="Blues")
# Tilt the model names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=20)
plt.show()

Testing for TFIDF

In [None]:
# Test-set recall, F1, accuracy and confusion matrix with TF-IDF features for the
# three baseline models (the RandomForest run, clf4, is commented out).
method1t = pd.DataFrame(score(clf1, X_train_tfidf, train_new, X_test_tfidf, test_y))
method2t = pd.DataFrame(score(clf2, X_train_tfidf, train_new, X_test_tfidf, test_y))
method3t = pd.DataFrame(score(clf3, X_train_tfidf, train_new, X_test_tfidf, test_y))
#method4t = pd.DataFrame(score(clf4, X_train_tfidf, train_new, X_test_tfidf, test_y))

# Stack the per-model results and show the scalar metrics only.
methods_t = pd.concat([method1t, method2t, method3t]) #, method4t])
methods_t.columns = ['Model', 'Label', 'Recall', 'F1', 'Accuracy','Confusion_Matrix']
meth_t = methods_t.reset_index()
meth_t[['Model', 'Label', 'Recall', 'F1', 'Accuracy']]

## Let's Explore another embedding algorithm "Word2Vec"

In [None]:
traindata_copy = train_new.copy()
split = 0.7

# Cut the frame once: the first 70% of rows train, the remaining 30% validate.
# Starting the validation slice at (1 - split) made it overlap the training rows,
# so the model was evaluated on data it had already seen.
split_at = int(split*len(traindata_copy))
d_train = traindata_copy[:split_at]
d_val = traindata_copy[split_at:]

# Hand-written English stop-word set (including contractions) used by tokenize() below.
# NOTE: this rebinds the module-level `stop_words` that the text-preprocessing cell
# built from NLTK, so preprocess_comment() also uses this set if it is called after
# this cell has run.
stop_words = set(['all', "she'll", "don't", 'being', 'over', 'through', 
'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should',
"he'd", 'to', 'only', "there's", 'those', 'under', 'ours', 'has', 
"haven't", 'do', 'them', 'his', "they'll", 'very', "who's", "they'd", 
'cannot', "you've", 'they', 'not', 'during', 'yourself', 'him', 'nor', 
"we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', 
"mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'some', 
'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', 
"wasn't", 'does', "shouldn't", 'above', 'between', 'be', 'we', 'who', 
"you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 
'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 
'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', 
"how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 
'themselves', 'was', 'until', 'more', 'himself', "where's", "i've", 'with', 
"didn't", "what's", 'but', 'herself', 'than', "here's", 'he', 'me', 
"they're", 'myself', 'these', "hasn't", 'below', 'ought', 'theirs', 'my', 
"wouldn't", "we'd", 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 
'at', 'have', 'in', 'any', 'if', 'again', 'no', 'that', 'when', 'same', 
'how', 'other', 'which', 'you', "shan't", 'our', 'after', "let's", 'most', 
'such', 'on', "he'll", 'a', 'off', 'i', "she'd", 'yours', "you'll", 'so', 
"we're", "she's", 'the', "that's", 'having', 'once'])

def performance(y_true, pred, ann=True):
    """Plot the ROC curve for binary predictions; optionally annotate accuracy and AUC.

    `pred` is a two-column array of class scores; column 1 is the positive-class
    score and is thresholded at 0.5 for the accuracy. Nothing is returned.
    """
    positive_scores = pred[:,1]
    acc = accuracy_score(y_true, positive_scores>0.5)
    auc = roc_auc_score(y_true, positive_scores)
    fpr, tpr, thr = roc_curve(y_true, positive_scores)

    plotfig, plotobj = plt.subplots(nrows=1, ncols=1, figsize=(16, 7))
    plt.plot(fpr, tpr, color='royalblue', linewidth="3")
    plt.xlabel("False positive rate (FPR)")
    plt.ylabel("True positive rate (TPR)")

    if not ann:
        return
    # Both annotations are placed at fixed data coordinates on the single axes.
    plotobj.annotate("Accuracy: %0.2f" % acc, (0.2,0.7), size=14)
    plotobj.annotate("AUC: %0.2f" % auc, (0.2,0.6), size=14)

def tokenize(docs, stop_word_set=None):
    """Split each document into lower-cased word tokens with stop words removed.

    Parameters
    ----------
    docs : iterable of str.
    stop_word_set : optional collection of words to drop. Defaults to the
        module-level `stop_words`.

    Returns
    -------
    list of token lists, one per document, in input order.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    # Raw string, so `\W` is read as a regex class rather than a string escape.
    pattern = re.compile(r'[\W_]+', re.UNICODE)
    sentences = []
    for d in docs:
        # Split on any whitespace: splitting on single spaces only fused words
        # separated by newlines/tabs once the non-word characters were stripped.
        cleaned = (pattern.sub('', w) for w in d.lower().split())
        # Drop tokens that are empty after stripping (pure punctuation); otherwise ''
        # becomes a vocabulary entry and inflates sentence lengths.
        # NOTE: punctuation is stripped before the stop-word check, so contractions
        # such as "don't" are compared as "dont".
        sentences.append([w for w in cleaned if w and w not in stop_word_set])
    return sentences

def w2v_feautrevec(model, sentences):
    """Average the word vectors of every tokenized sentence.

    Parameters
    ----------
    model : trained Word2Vec model (or any object exposing `vector_size` and word
        lookup, directly or through a `.wv` attribute).
    sentences : list of token lists.

    Returns
    -------
    numpy array of shape (len(sentences), model.vector_size). Each row is the sum of
    the vectors of the in-vocabulary tokens divided by the total number of tokens in
    the sentence. A sentence with no tokens yields a row of NaN, which remove_nan()
    then detects.
    """
    # gensim 4 models are not subscriptable; word lookup goes through `.wv`, which
    # also exists on older models. Fall back to the object itself when there is no `.wv`.
    vectors = getattr(model, 'wv', model)

    f = np.zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        if len(s) == 0:
            # Same NaN row the 0/0 division produced, without the runtime warning.
            f[i, :] = np.nan
            continue
        for w in s:
            try:
                vec = vectors[w]
            except KeyError:
                # Out-of-vocabulary word: contributes nothing to the sum.
                continue
            f[i, :] = f[i, :] + vec
        f[i, :] = f[i, :] / len(s)
    return f

def remove_nan(features):
    """Return the indices of rows whose sum is NaN.

    Despite the name, nothing is removed here: the caller deletes the returned rows.
    """
    return [row_idx for row_idx in range(len(features)) if np.isnan(features[row_idx].sum())]

In [None]:
# Tokenize the raw training comments and train a 500-dimensional Word2Vec model on them.
sentences = tokenize(d_train.comment_text)

# gensim 4 renamed Word2Vec's `size` argument to `vector_size`; choose the keyword
# that matches the installed major version so the cell runs on both.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = Word2Vec(sentences, window=5, min_count=6, sample=1e-3, workers=2, **{size_kwarg: 500})
model.init_sims(replace=True)

# One averaged word vector per comment.
features = w2v_feautrevec(model, sentences)

# Drop comments whose feature row is NaN, and the matching labels, so X and y stay aligned.
rows_to_delete = remove_nan(features)
features = np.delete(features, rows_to_delete, 0)

modelw2v = RandomForestClassifier(n_estimators=600, n_jobs=-1, max_features="log2")
modelw2v.fit(features, d_train.toxic.drop(d_train.index[rows_to_delete]))

In [None]:
# Featurize the validation comments with the Word2Vec model trained above.
validation_sent = tokenize(d_val.comment_text)
# The averaging helper defined in this notebook is w2v_feautrevec; `featurize_w2v`
# does not exist and raised NameError.
features_val = w2v_feautrevec(model, validation_sent)

# Drop validation rows whose feature vector is NaN.
deleterows = remove_nan(features_val)
features_val = np.delete(features_val, deleterows, 0)

In [None]:
# Score the validation set with the random forest trained on Word2Vec features.
# That model is bound to `modelw2v`; `model3` is not defined in this notebook.
pred3 = modelw2v.predict_proba(features_val)
# Drop the labels of the rows removed from features_val so both stay aligned.
performance(d_val.toxic.drop(d_val.index[deleterows]), pred3)