In [7]:
# TO-DO
# bs4, remove non-alpha-apostrophe chars
# tfidf- max_df covers some domain words, min_df, ngram_range
# Compare to results with different/no data cleaning
# What is the accuracy of guessing all zeros? 49%
# Try other vectorizers/classifiers
# Derive features to show other corellations: length, num_exclaim, percent_caps, logged_in
# POS tagging features... number of verbs, nouns, etc

# use mean/median score for non-unanimous data set

import nltk
import pandas as pd

# Read and merge two files into df
comments=pd.read_csv('toxicity_annotated_comments_unanimous.tsv', sep='\t')
scores=pd.read_csv('toxicity_annotations_unanimous.tsv', sep='\t')
uniqueScores = scores[["rev_id", "toxicity_score", "toxicity"]].groupby("rev_id", as_index=False).first()
df = pd.merge(comments, uniqueScores, on="rev_id")

df['length'] = df.comment.str.len()


In [8]:
# Remove HTML elements and 'NEWLINE_TOKEN'
from bs4 import BeautifulSoup
df['cleaned_comments'] = df.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())

df['cleaned_comment'] = df.cleaned_comments.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

In [9]:
# Remove non-(alpha|whitespace|apostrophe) chars, change to lowercase
import re

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: re.sub("[^a-zA-Z\s']", '', x))
df['cleaned_comment'] = df.cleaned_comment.apply(str.lower)

In [10]:
#Remove rows with blank comments
df = df[df['cleaned_comment'].str.len()>0]

In [11]:
# Get percentage of zeroes
all_scores = df.toxicity_score
num_zeroes = df.toxicity_score[df.toxicity_score==0]
print(len(num_zeroes)/len(all_scores))

0.8896956157497906


In [12]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.3, random_state=666)
all_words_train = train_set.cleaned_comment
all_words_test = test_set.cleaned_comment

In [13]:
# TF-IDF Vectorizer (Try other alternatives as well)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 3),  \
                             stop_words='english',  strip_accents='unicode',  norm='l2')

X_train = vectorizer.fit_transform(all_words_train)
X_test = vectorizer.transform(all_words_test)


In [14]:
# Classifier, Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
NBclassifier = MultinomialNB()
NBclassifier.fit(X_train, train_set.toxicity_score)

#resultNB = NBclassifier.predict(X_test)

# count = 0
# for i,x,y in zip(resultNB, test_set.toxicity_score):
#     if x == y: count+=1
# print(count/len(resultNB))
    
print(NBclassifier.score(X_test, test_set.toxicity_score))

0.893953488372093


In [15]:
# Classifier, Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
bernNBclassifier = BernoulliNB()
bernNBclassifier.fit(X_train, train_set.toxicity_score)

#resultBernoulliNB = bernNBclassifier.predict(X_test)

print(bernNBclassifier.score(X_test, test_set.toxicity_score))

0.9051162790697674


In [16]:
# Classifier, linear model
from sklearn import linear_model
sgd_clsf = linear_model.SGDClassifier(max_iter=90)
sgd_clsf.fit(X_train, train_set.toxicity_score)

#resultSGD = sgd_clsf.predict(X_test)

print(sgd_clsf.score(X_test, test_set.toxicity_score))

0.9283720930232559


In [18]:
# Classifier, Linear SVC
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
linSVCclsf = LinearSVC()
linSVCclsf.fit(X_train, train_set.toxicity_score)

#result_linearSVC= linSVCclsf.predict(X_test)

print(linSVCclsf.score(X_test, test_set.toxicity_score))

0.9330232558139535


In [19]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100)

forest = forest.fit(X_train, train_set.toxicity_score)

resultForest = forest.predict(X_test)

print(forest.score(X_test, test_set.toxicity_score))



0.9227906976744186


In [None]:
from sklearn.ensemble import VotingClassifier
all_clsf = VotingClassifier(estimators=[('multiNB', NBclassifier), ('randForest', forest), ('linSVC', linSVCclsf),\
                                       ('linModel', sgd_clsf)])
all_clsf.fit(X_train, train_set.toxicity_score)
print(all_clsf.score(X_test, test_set.toxicity_score))