In [1]:
# bs4, remove non-alpha-apostrophe chars
# tfidf- max_df covers some domain words, min_df, ngram_range
# Compare to results with different/no data cleaning
# What is the accuracy of guessing all zeros? 49-full 89-unanimous
# Try other vectorizers/classifiers
# Derive features to show other corellations: length, num_exclaim, percent_caps, logged_in
# POS tagging features... number of verbs, nouns, etc

# use mean/median score for non-unanimous data set

# Features to engineer: length, percent caps, percent alpha, number exclaims, logged_in, 
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

import nltk
import pandas as pd

# Read and merge two files into df
comments = pd.read_csv('toxicity_annotated_comments_unanimous.tsv', sep='\t')
scores = pd.read_csv('toxicity_annotations_unanimous.tsv', sep='\t')
uniqueScores = scores[["rev_id", "toxicity_score", "toxicity"]].groupby("rev_id", as_index=False).first()
df = pd.merge(comments, uniqueScores, on="rev_id")

# df['length'] = df.comment.str.len()
print(df.columns)

Index(['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split',
       'toxicity_score', 'toxicity'],
      dtype='object')


In [2]:
# Open portion of non-unanimous data into df2
# Predict for non-unanimous data with models trained by unanimous data

comments2 = pd.read_csv('toxicity_annotated_comments.tsv', sep='\t')
# Mean scores rounded to nearest whole value
mean_scores = pd.read_csv('toxicity_annotations.tsv', sep='\t').groupby('rev_id', as_index=False)['toxicity_score'].mean().round()
df2 = pd.merge(comments2, mean_scores, on='rev_id')

In [3]:
# Remove HTML elements and 'NEWLINE_TOKEN'
from bs4 import BeautifulSoup
df['cleaned_comment'] = df.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

In [4]:
# Remove non-(alpha|whitespace|apostrophe) chars, change to lowercase
import re

df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: re.sub("[^a-zA-Z\s']", '', x))
df['cleaned_comment'] = df.cleaned_comment.apply(str.lower)

In [5]:
#Remove rows with blank comments
df = df[df['cleaned_comment'].str.len()>0]

In [6]:
# Get percentage of zeroes
all_scores = df2.toxicity_score
num_zeroes = df2.toxicity_score[df2.toxicity_score==0]
print(len(num_zeroes)/len(all_scores))

0.6958531117317736


In [6]:
# from sklearn.model_selection import train_test_split

# train_set, test_set = train_test_split(df, test_size=0.3, random_state=666)
# all_words_train = train_set.cleaned_comment
# all_words_test = test_set.cleaned_comment

In [7]:
all_words_train = df.cleaned_comment
df2 = df2[:1000]
all_words_test = df2.comment

In [8]:
# Transformer Mixin - Feature Engineering - Derived Features - Length of Comment
from sklearn.base import TransformerMixin

class Length_Featurizer(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[len(i)] for i in X]

len_vec = Length_Featurizer()

# len_features = len_vec.transform(all_words_train)

In [12]:
# Transformer Mixin - Feature Engineering - Derived Features - Logged In
# Logged_In.transform needs the entire row, not just comment
import numpy as np
class Logged_In(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[1] if df.iloc[i].logged_in==True else [0] for i, e in enumerate(X)]

logged_in_vec = Logged_In()

In [13]:
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=.5, ngram_range=(1, 3),  \
                             stop_words='english',  strip_accents='unicode',  norm='l2')

# print(tfidf_vectorizer.get_feature_names())

In [14]:
# Bag of Words Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words="english", min_df=2, max_df=.5, ngram_range=(1,3))

In [15]:
from sklearn.pipeline import FeatureUnion

combined_features = FeatureUnion([("bagOwords", count_vectorizer), ("tfidf", tfidf_vectorizer), ('lenvec', len_vec),\
                                 ("loggedIn", logged_in_vec)])

X_train = combined_features.fit_transform(all_words_train)
X_test = combined_features.transform(all_words_test)

In [16]:
# Classifier, Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
NBclassifier = MultinomialNB()
NBclassifier.fit(X_train, df.toxicity_score)

#resultNB = NBclassifier.predict(X_test)
    
print(NBclassifier.score(X_test, df2.toxicity_score))

0.72


In [17]:
# Classifier, Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
bernNBclassifier = BernoulliNB()
bernNBclassifier.fit(X_train, df.toxicity_score)

#resultBernoulliNB = bernNBclassifier.predict(X_test)

print(bernNBclassifier.score(X_test, df2.toxicity_score))

0.631


In [23]:
# Classifier, linear model
from sklearn import linear_model
sgd_clsf = linear_model.SGDClassifier(max_iter=1000)
sgd_clsf.fit(X_train, df.toxicity_score)

#resultSGD = sgd_clsf.predict(X_test)

print(sgd_clsf.score(X_test, df2.toxicity_score))

0.704


In [24]:
# Classifier, Linear SVC
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
linSVCclsf = LinearSVC()
linSVCclsf.fit(X_train, df.toxicity_score)

#result_linearSVC= linSVCclsf.predict(X_test)

print(linSVCclsf.score(X_test, df2.toxicity_score))

0.67


In [25]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000)

forest = forest.fit(X_train, df.toxicity_score)

#resultForest = forest.predict(X_test)

print(forest.score(X_test, df2.toxicity_score))

0.745


In [26]:
from sklearn.ensemble import VotingClassifier
all_clsf = VotingClassifier(estimators=[('multiNB', NBclassifier), ('randForest', forest), ('linSVC', linSVCclsf),\
                                       ('linModel', sgd_clsf)])
all_clsf.fit(X_train, df.toxicity_score)
print(all_clsf.score(X_test, df2.toxicity_score))

0.723


  if diff:
