In [1]:
# POS tagging features... number of verbs, nouns, etc

# use mean/median score for non-unanimous data set

# Features to engineer: percent alpha 
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

import nltk
import pandas as pd

# Read and merge two files into df
comments = pd.read_csv('toxicity_annotated_comments_unanimous.tsv', sep='\t')
scores = pd.read_csv('toxicity_annotations_unanimous.tsv', sep='\t')
uniqueScores = scores[["rev_id", "toxicity_score", "toxicity"]].groupby("rev_id", as_index=False).first()
df = pd.merge(comments, uniqueScores, on="rev_id")

# df['length'] = df.comment.str.len()
df['number_exclamation'] = df.comment.apply(lambda x: x.count('!'))

In [None]:
# Open portion of non-unanimous data into df2
# Predict for non-unanimous data with models trained by unanimous data

comments2 = pd.read_csv('toxicity_annotated_comments.tsv', sep='\t')
# Mean scores rounded to nearest whole value
mean_scores = pd.read_csv('toxicity_annotations.tsv', sep='\t').groupby('rev_id', as_index=False)['toxicity_score'].mean().round()
df2 = pd.merge(comments2, mean_scores, on='rev_id')

In [2]:
# Remove HTML elements and 'NEWLINE_TOKEN'
from bs4 import BeautifulSoup
df['cleaned_comment'] = df.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

In [3]:
# Remove non-(alpha|whitespace|apostrophe) chars, change to lowercase
import re

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: re.sub("[^a-zA-Z\s']", '', x))

def count_capitals(someString):
    count = 0
    for char in someString:
        if char.isupper():
            count += 1
    return count

df['percent_caps'] = df.cleaned_comment.apply(count_capitals)
df['cleaned_comment'] = df.cleaned_comment.apply(str.lower)

In [4]:
#Remove rows with blank comments
df = df[df['cleaned_comment'].str.len()>0]

In [5]:
# Get percentage of zeroes
all_scores = df.toxicity_score
num_zeroes = df.toxicity_score[df.toxicity_score==0]
print(len(num_zeroes)/len(all_scores))

0.8896956157497906


In [13]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.3, random_state=666)
all_words_train = train_set.cleaned_comment
all_words_test = test_set.cleaned_comment

In [None]:
all_words_train = df.cleaned_comment
df2 = df2[:5000]
all_words_test = df2.comment

In [None]:
# Get percentage of zeroes
all_scores = df2.toxicity_score
num_zeroes = df2.toxicity_score[df2.toxicity_score==0]
print(len(num_zeroes)/len(all_scores))

In [6]:
# Feature Engineering - Number of Exclamation symbols
from sklearn.base import TransformerMixin

class Exclamation_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[df.iloc[i].number_exclamation] for i, e in enumerate(X)]

exclam_vec = Exclamation_Featurizer()

In [7]:
# Feature Engineering - Percent Capital Letters

class Capital_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[df.iloc[i].percent_caps] for i, e in enumerate(X)]

capital_vec = Capital_Featurizer()

In [8]:
# Feature Engineering - Length of Comment

class Length_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[len(i)] for i in X]

len_vec = Length_Featurizer()

In [9]:
# Feature Engineering - Logged In

class Logged_In(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[1] if df.iloc[i].logged_in==True else [0] for i, e in enumerate(X)]

logged_in_vec = Logged_In()

In [10]:
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=.4, ngram_range=(1, 3),  \
                             stop_words='english',  strip_accents='unicode',  norm='l2', max_features=5000)

# print(tfidf_vectorizer.get_feature_names())

In [11]:
# Bag of Words Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words="english", min_df=2, max_df=.5, ngram_range=(1,3))

In [14]:
from sklearn.pipeline import FeatureUnion

combined_features = FeatureUnion([("bagOwords", count_vectorizer), ("tfidf", tfidf_vectorizer), ('lenvec', len_vec),\
                                 ("loggedIn", logged_in_vec), ("numExclams", exclam_vec), ("percentCaps", capital_vec)])

X_train = combined_features.fit_transform(all_words_train)
X_test = combined_features.transform(all_words_test)

In [16]:
# Classifier, Logistic Regression
from sklearn.linear_model import LogisticRegression

logRegClsfr = LogisticRegression()
logRegClsfr.fit(X_train, train_set.toxicity_score)

# print(logRegClsfr.predict(tfidf_vectorizer.transform(([['go fuck yurself']]))))
print(logRegClsfr.score(X_test, test_set.toxicity_score))

0.9255813953488372


In [17]:
# Classifier, K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

knborsClsfr = KNeighborsClassifier(n_neighbors=5, weights='distance')
knborsClsfr.fit(X_train, train_set.toxicity_score)

print(knborsClsfr.score(X_test, test_set.toxicity_score))

0.892093023255814


In [18]:
# Classifier, SVC
from sklearn.svm import SVC

svcClsfr = SVC()
svcClsfr.fit(X_train, train_set.toxicity_score)

print(svcClsfr.score(X_test, test_set.toxicity_score))

0.8958139534883721


In [None]:
from sklearn.svm import NuSVC

In [19]:

from sklearn.neural_network import MLPClassifier

nnClsfr = MLPClassifier(alpha = 1)
nnClsfr.fit(X_train, train_set.toxicity_score)

print(nnClsfr.score(X_test, test_set.toxicity_score))

0.9172093023255814


In [20]:
from sklearn import tree

dtreeClsfr = tree.DecisionTreeClassifier()
dtreeClsfr.fit(X_train, train_set.toxicity_score)

print(dtreeClsfr.score(X_test, test_set.toxicity_score))

0.9255813953488372


In [21]:

from sklearn.ensemble import GradientBoostingClassifier

grboostClsfr = GradientBoostingClassifier()
grboostClsfr.fit(X_train, train_set.toxicity_score)

print(grboostClsfr.score(X_test, test_set.toxicity_score))

0.9386046511627907


In [22]:
# Classifier, Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
NBclassifier = MultinomialNB()
NBclassifier.fit(X_train, train_set.toxicity_score)

#resultNB = NBclassifier.predict(X_test)
    
print(NBclassifier.score(X_test, test_set.toxicity_score))

0.8362790697674418


In [23]:
# Classifier, Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
bernNBclassifier = BernoulliNB()
bernNBclassifier.fit(X_train, train_set.toxicity_score)

#resultBernoulliNB = bernNBclassifier.predict(X_test)

print(bernNBclassifier.score(X_test, test_set.toxicity_score))

0.9051162790697674


In [24]:
# Classifier, Linear Model
from sklearn import linear_model
sgd_clsf = linear_model.SGDClassifier(max_iter=1000)
sgd_clsf.fit(X_train, train_set.toxicity_score)

#resultSGD = sgd_clsf.predict(X_test)

print(sgd_clsf.score(X_test, test_set.toxicity_score))

0.8958139534883721


In [25]:
# Classifier, Linear SVC
from sklearn.svm import LinearSVC

linSVCclsf = LinearSVC()
linSVCclsf.fit(X_train, train_set.toxicity_score)

#result_linearSVC= linSVCclsf.predict(X_test)

print(linSVCclsf.score(X_test, test_set.toxicity_score))

0.933953488372093


In [26]:

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000)

forest = forest.fit(X_train, train_set.toxicity_score)

#resultForest = forest.predict(X_test)

print(forest.score(X_test, test_set.toxicity_score))

0.933953488372093


In [28]:
from sklearn.ensemble import VotingClassifier
all_clsf = VotingClassifier(estimators=[('multiNB', NBclassifier), ('randForest', forest), ('linSVC', linSVCclsf),\
                                       ('linModel', sgd_clsf), ('logreg', logRegClsfr), ('knbors', knborsClsfr),\
                                       ('svc', svcClsfr), ('nn', nnClsfr), ('dtree', dtreeClsfr), ('grboost', grboostClsfr)])
all_clsf.fit(X_train, train_set.toxicity_score)
print(all_clsf.score(X_test, test_set.toxicity_score))

0.9283720930232559


  if diff:


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
sc = StandardScaler(with_mean=False)

X_train_other = sc.fit_transform(X_train.toarray(), train_set.toxicity_score)
X_test_other = sc.transform(X_test)

classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(X_train_other, train_set.toxicity_score)

print(classifier.score(X_test, test_set.toxicity_score))
