In [2]:
import nltk
import pandas as pd

# Read and merge two files into df
comments = pd.read_csv('toxicity_annotated_comments_unanimous.tsv', sep='\t')
scores = pd.read_csv('toxicity_annotations_unanimous.tsv', sep='\t')
uniqueScores = scores[["rev_id", "toxicity_score", "toxicity"]].groupby("rev_id", as_index=False).first()
df = pd.merge(comments, uniqueScores, on="rev_id")

df['number_exclamation'] = df.comment.apply(lambda x: x.count('!'))

In [3]:
# Remove HTML elements and 'NEWLINE_TOKEN'
from bs4 import BeautifulSoup
df['cleaned_comment'] = df.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

In [4]:
# Remove non-(alpha|whitespace|apostrophe) chars, change to lowercase
import re

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: re.sub("[^a-zA-Z\s']", '', x))

def count_capitals(someString):
    count = 0
    for char in someString:
        if char.isupper():
            count += 1
    return count

df['percent_caps'] = df.cleaned_comment.apply(count_capitals)
df['cleaned_comment'] = df.cleaned_comment.apply(str.lower)

In [5]:
#Remove rows with blank comments
df = df[df['cleaned_comment'].str.len()>0]

In [6]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.3, random_state=666)
all_words_train = train_set.cleaned_comment
all_words_test = test_set.cleaned_comment

In [7]:
# Get percentage of zeroes in test set, (a general baseline to beat)
all_scores = test_set.toxicity_score
num_zeroes = test_set.toxicity_score[test_set.toxicity_score==0]
print(len(num_zeroes)/len(all_scores))

0.8911627906976745


In [8]:
# Feature Engineering - Number of Exclamation symbols
from sklearn.base import TransformerMixin

class Exclamation_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[df.iloc[i].number_exclamation] for i, e in enumerate(X)]

exclam_vec = Exclamation_Featurizer()

In [9]:
# Feature Engineering - Percent Capital Letters

class Capital_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[df.iloc[i].percent_caps] for i, e in enumerate(X)]

capital_vec = Capital_Featurizer()

In [10]:
# Feature Engineering - Length of Comment

class Length_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[len(i)] for i in X]

len_vec = Length_Featurizer()

In [11]:
# Feature Engineering - Logged In

class Logged_In(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[1] if df.iloc[i].logged_in==True else [0] for i, e in enumerate(X)]

logged_in_vec = Logged_In()

In [12]:
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=.4, ngram_range=(1, 3),  \
                             stop_words='english',  strip_accents='unicode',  norm='l2', max_features=5000)


In [13]:
# Bag of Words Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words="english", min_df=2, max_df=.5, ngram_range=(1,3))

In [14]:
from sklearn.pipeline import FeatureUnion

combined_features = FeatureUnion([("bagOwords", count_vectorizer), ("tfidf", tfidf_vectorizer), ('lenvec', len_vec),\
                                 ("loggedIn", logged_in_vec), ("numExclams", exclam_vec), ("percentCaps", capital_vec)])

X_train = combined_features.fit_transform(all_words_train)
X_test = combined_features.transform(all_words_test)

In [15]:
# Classifier, Logistic Regression
from sklearn.linear_model import LogisticRegression

logRegClsfr = LogisticRegression()

In [16]:
# Classifier, MLP
from sklearn.neural_network import MLPClassifier

nnClsfr = MLPClassifier(alpha = 1)

In [17]:
# Classifier, Decision Tree
from sklearn import tree

dtreeClsfr = tree.DecisionTreeClassifier()

In [18]:
# Classifier, Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

grboostClsfr = GradientBoostingClassifier()

In [19]:
# Classifier, Linear SVC
from sklearn.svm import LinearSVC

linSVCclsf = LinearSVC()

In [20]:
# Classifier, Random Forest
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000)

In [22]:
from sklearn.ensemble import VotingClassifier
all_clsf = VotingClassifier(estimators=[('randForest', forest), ('linSVC', linSVCclsf),\
                                        ('logreg', logRegClsfr), ('nn', nnClsfr),\
                                        ('dtree', dtreeClsfr), ('grboost', grboostClsfr)])
all_clsf.fit(X_train, train_set.toxicity_score)
print('Accuracy of predictions on test_set is ', all_clsf.score(X_test, test_set.toxicity_score))

Accuracy of predictions on test_set is  0.9376744186046512


  if diff:
