In [1]:
# Sentiment Analysis of Wikipedia Comments
# Comments are classified (model trained) based on Wikipedia Editor Comments, unanimously judged by those Editors
#   to be 'toxic' OR 'not toxic'
# Mostly outdated word embeddings/vectorizations based on tf-idf and count vectorizer

import nltk
import pandas as pd

# Merge comments with unanimous score file. Merge the first entry of scores grouped on 'review_id' with comments.
comments = pd.read_csv('res/toxicity_annotated_comments_unanimous.tsv', sep='\t')
scores = pd.read_csv('res/toxicity_annotations_unanimous.tsv', sep='\t')
uniqueScores = scores[['review_id', 'toxicity']].groupby('review_id', as_index=False).first()
df = pd.merge(comments, uniqueScores, on="review_id")

df = df[['review_id','comment', 'toxicity']]

In [2]:
# Downsample 'not toxic' comments
# Equalize number of 'toxic' AND 'not toxic' comments for training

df = df.groupby('toxicity', as_index=False).head(171)

In [3]:
# Open non-unanimously classified comments into df2, (we will predict the sentiment of these, 'toxic' OR 'not-toxic')
# 'Toxic' OR 'not-toxic' of non-unanimous comments is determined by averaging and rounding to 'zero' OR 'one'

comments2 = pd.read_csv('res/toxicity_annotated_comments.tsv', sep='\t')

scores2 = \
pd.read_csv('res/toxicity_annotations.tsv', sep='\t').groupby('review_id', as_index=False)['toxicity'].mean().round()

df2 = pd.merge(comments2, scores2, on='review_id')

df2 = df2[['review_id','comment','toxicity']]

In [4]:
df2 = df2.head(500)

In [5]:
# Remove HTML elements and 'NEWLINE_TOKEN'
from bs4 import BeautifulSoup

df['cleaned_comment'] = df.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())
df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

df2['cleaned_comment'] = df2.comment.apply(lambda x: BeautifulSoup(x, 'html5lib').get_text())
df2['cleaned_comment'] = df2.cleaned_comment.apply(lambda x: x.replace('NEWLINE_TOKEN', ''))

In [6]:
# Test results using this or not
# Remove non-(alpha|whitespace|apostrophe) chars, change to lowercase
# import re

# df['cleaned_comment'] = df.cleaned_comment.apply(lambda x: re.sub("[^a-zA-Z\s']", '', x))

In [7]:
# from sklearn.model_selection import train_test_split

# train_set, test_set = train_test_split(df, test_size=0.3, random_state=666)
all_words_train = df.cleaned_comment
all_words_test = df2.cleaned_comment

In [8]:
# TF-IDF Vectorizer (note: max_features changed from 4500 to 1000)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(stop_words='english', min_df=1, max_df=.4, norm='l2', ngram_range=(1,2), max_features=1000)

In [9]:
# Bag of Words Vectorizer (note: max_features changed from 1000 to 100)
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(stop_words="english", min_df=2, max_df=.6, ngram_range=(1,2), max_features=100)

In [10]:
from sklearn.pipeline import FeatureUnion

combined_features = FeatureUnion([("bagOwords", count_vec), ("tfidf", tfidf_vec)])

X_train = combined_features.fit_transform(all_words_train)
X_test = combined_features.transform(all_words_test)

In [11]:
from sklearn.linear_model import LogisticRegression

logRegClsfr = LogisticRegression(fit_intercept=True, penalty='l2', solver='newton-cg')

params = {
    #'penalty': ('l1', 'l2')
    #'fit_intercept': (True, False)
    #'solver' : ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')
}

logRegClsfr.fit(X_train, df.toxicity)
print(logRegClsfr.score(X_test, df2.toxicity))

0.748


In [12]:
from sklearn.linear_model import SGDClassifier

sgdClsfr = SGDClassifier(loss='modified_huber', penalty='l2', fit_intercept=True, max_iter=1000)

params = {
    
    #"loss": ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
    #"penalty": ('none', 'l2', 'l1', 'elasticnet'),
    #"fit_intercept": (True,False),
    #"max_iter": (5,10,20,50,150,500,1000)
}

sgdClsfr.fit(X_train, df.toxicity)

print(sgdClsfr.score(X_test, df2.toxicity))

0.744


In [13]:
from sklearn.svm import SVC

svcClsfr = SVC(C=1.5, kernel='linear', degree=3, shrinking=True, probability=True, decision_function_shape='ovr')

# C=1.0, kernel=’rbf’, degree=3, gamma=’auto’, coef0=0.0, shrinking=True, probability=False, tol=0.001,
# cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=’ovr’, random_state=None
params = {
    #'C': (1.5,3.0,4.0),
    #'kernel': ('linear', 'sigmoid', 'rbf', 'poly', 'precomputed')
    #'degree': (3,1,2,4,5)
    #'shrinking': (True, False),
    #'probability': (True, False)
    #'decision_function_shape': ('ovr','ovo')
}

svcClsfr.fit(X_train, df.toxicity)
print(svcClsfr.score(X_test, df2.toxicity))

0.756


In [14]:
from sklearn.neural_network import MLPClassifier

nnClsfr = MLPClassifier(alpha = 1, hidden_layer_sizes=(100,20), max_iter=300)

# hidden_layer_sizes=(100, ), activation=’relu’, solver=’adam’, alpha=0.0001, batch_size=’auto’, 
# learning_rate=’constant’, learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, 
# tol=0.0001, verbose=False, warm_start=False, 
# momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, 
# beta_2=0.999, epsilon=1e-08)
params = {
    #'hidden_layer_sizes': ((100,),(200,),(100,20)),
    #'max_iter': (100,200,300)
}

nnClsfr.fit(X_train, df.toxicity)
print(nnClsfr.score(X_test, df2.toxicity))

0.744




In [15]:
# GridSearchCV, for iterating over hyper-parameters of classifiers
# Use this for tuning each model

# from sklearn.model_selection import GridSearchCV

# gridEst = GridSearchCV(nnClsfr, params).fit(X_train, train_set.toxicity_score)

# print(gridEst.best_params_)
# print(gridEst.score(X_test, test_set.toxicity_score))

In [16]:
from sklearn import tree

dtreeClsfr = tree.DecisionTreeClassifier()
dtreeClsfr.fit(X_train, df.toxicity)

print(dtreeClsfr.score(X_test, df2.toxicity))

0.68


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

grboostClsfr = GradientBoostingClassifier()
grboostClsfr.fit(X_train, df.toxicity)

print(grboostClsfr.score(X_test, df2.toxicity))

0.768


In [18]:
from sklearn.naive_bayes import MultinomialNB
NBclassifier = MultinomialNB()

NBclassifier.fit(X_train, df.toxicity)

# resultNB = NBclassifier.predict(X_test)
    
print(NBclassifier.score(X_test, df2.toxicity))

0.904


In [19]:
# Classifier, Linear Model
from sklearn import linear_model
sgd_clsf = linear_model.SGDClassifier(max_iter=1000)
sgd_clsf.fit(X_train, df.toxicity)

#resultSGD = sgd_clsf.predict(X_test)

print(sgd_clsf.score(X_test, df2.toxicity))

0.744


In [20]:
from sklearn.svm import LinearSVC

linSVCclsf = LinearSVC()

# (penalty=’l2’, loss=’squared_hinge’, dual=True, tol=0.0001, C=1.0, 
# multi_class=’ovr’, fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, 
# random_state=None, max_iter=1000)
params = {
    
}

linSVCclsf.fit(X_train, df.toxicity)
print(linSVCclsf.score(X_test, df2.toxicity))

0.758


In [21]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=400, max_features=.1)

params = {
    #'n_estimators': (10,25,50,100,200,400),
    #'max_features': (.1,.2,.4,.6,.8,.9)
}

forest.fit(X_train, df.toxicity)
print(forest.score(X_test, df2.toxicity))

0.744


In [22]:
from sklearn.ensemble import VotingClassifier
all_clsf = VotingClassifier(estimators=[('multiNB', NBclassifier), ('randForest', forest), ('linSVC', linSVCclsf),\
                                       ('linModel', sgd_clsf), ('logreg', logRegClsfr),\
                                       ('svc', svcClsfr), ('nn', nnClsfr), ('dtree', dtreeClsfr), \
                                        ('grboost', grboostClsfr)])
all_clsf.fit(X_train, df.toxicity)
print(all_clsf.score(X_test, df2.toxicity))

0.764


  if diff:
