In [1]:
%config IPCompleter.greedy = True
#System
import time
import re
import itertools
import string
#processing
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.preprocessing import MultiLabelBinarizer

#feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn_pandas import DataFrameMapper, cross_val_score

#classifiers
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#Evaluation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score
from sklearn.model_selection import train_test_split

#Report
import matplotlib.pyplot as plt   
import seaborn as sns


In [2]:
#load data
dataFrameTraining = pd.read_csv("data/Toxic_train_set.csv")
dataFrameTesting = pd.read_csv("data/Toxic_test_set.csv")

dftr = dataFrameTraining
dfte = dataFrameTesting

all_categories = ["obscene", "threat", "insult", "hate", "Intolerant"]

In [3]:
data = pd.concat([dftr, dfte], ignore_index=True )

In [4]:
stopWords = nltk.corpus.stopwords.words('english')
new_stop_words = ['?','!',',','.',';','&','>','<',')','(','/','\'s','\'\'','``']
stopWords.extend(new_stop_words)
new_stop_words_1 = ['I','thi','He','We','hi','everi','like','boy','march']
stopWords.extend(new_stop_words_1)

ps = nltk.PorterStemmer()

In [5]:
#for countvectorizer
def comment_clean_cv(comment):
    comment = "".join([word.lower() for word in comment if word not in string.punctuation])
    tokens = re.split('\W+', comment)
    comment = [ps.stem(word) for word in tokens if word not in stopWords]
    return comment

In [7]:
data['comment_length'] = data['Comments'].apply(lambda x: len(x) - x.count(" "))

In [8]:
def polarityReturn(text):    
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None

In [9]:
data['sentiment'] = data['Comments'].apply(polarityReturn)

In [10]:
from textblob import TextBlob
def polarityReturn(text):    
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None
# adding new feature of sentiment polarity rating
data['sentiment'] = data['Comments'].apply(polarityReturn)


In [11]:
def getLabelList(data):
    categories = []
    for i in range(len(data)):            
        tempList = []
        for category in all_categories:        
            if data[category][i] == 1:
                tempList.append(category)   
        #print(tempList, i)
                #print(data[category][i], category, i)            
        categories.append(tempList)
    return categories

In [12]:
# making labels single list
train_categories = tuple(getLabelList(dftr))
test_categories = tuple(getLabelList(dfte))

In [13]:
labels = train_categories + test_categories

In [43]:
train_documents = tuple(data['Comments'])
def tf_idf(docs):
    tfidf = TfidfVectorizer(tokenizer=comment_clean_cv, max_features=105, use_idf=True, sublinear_tf=True)
    tfidf.fit(docs)
    return tfidf
representer = tf_idf(train_documents)

In [46]:
mlb = MultiLabelBinarizer()
target = mlb.fit_transform(labels)

In [48]:
dfmFeatures = DataFrameMapper([    
    ('Comments', representer),
    ('comment_length', None),
    ('sentiment', None)
])
features = dfmFeatures.fit_transform(data)

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.3, train_size=0.7, random_state=13)

In [51]:
classifiers = [
    ('DecisionTreeClassifier', OneVsRestClassifier(DecisionTreeClassifier())),
    ('LinearSVC', OneVsRestClassifier(LinearSVC(random_state=23))),        
    ('LogisticRegression', OneVsRestClassifier(LogisticRegression())),    
    ('LogisticRegressionCV', OneVsRestClassifier(LogisticRegressionCV())),
    ('SGDClassifier', OneVsRestClassifier(SGDClassifier())),
    ('Perceptron', OneVsRestClassifier(Perceptron())),
    ('RidgeClassifierCV', OneVsRestClassifier(RidgeClassifierCV())),
    ('RandomForestClassifier', OneVsRestClassifier(RandomForestClassifier(n_estimators=100, n_jobs=10))),        
    ('AdaBoostClassifier', OneVsRestClassifier(AdaBoostClassifier())),    
    ('ExtraTreesClassifier', OneVsRestClassifier(ExtraTreesClassifier())),        
    ('KNeighborsClassifier', OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5))),    
    ('MLPClassifier', OneVsRestClassifier(MLPClassifier())),    
]

In [52]:
print(("{clf_name:<30}: {score:<5}  in {train_time:>5} /  {test_time}")
      .format(clf_name="Classifier", score="score", train_time="train", test_time="test"))
print("-" * 80)
cls_dict = {}
for clf_name, classifier in classifiers:
    t0 = time.time()
    y_pred = classifier.fit(X_train, Y_train)
    cls_dict.update({clf_name: y_pred})
    t1 = time.time()
    
    preds = classifier.predict(X_test)
    preds[preds >= 0.5] = 1
    preds[preds < 0.5] = 0    
    t2 = time.time()

    acc = accuracy_score(y_true=Y_test, y_pred=preds)
    f1 = fbeta_score(y_true=Y_test, y_pred=preds, beta=1, average="weighted")
    print(("{clf_name:<30}: {acc:0.2f}% {f1:0.2f}% in {train_time:0.2f}s"
           " train / {test_time:0.2f}s test")
          .format(clf_name=clf_name,
                  acc=(acc * 100),
                  f1=(f1 * 100),
                  train_time=t1 - t0,
                  test_time=t2 - t1))       

Classifier                    : score  in train /  test
--------------------------------------------------------------------------------
DecisionTreeClassifier        : 80.33% 92.87% in 0.02s train / 0.00s test
LinearSVC                     : 75.41% 86.62% in 0.07s train / 0.00s test
LogisticRegression            : 49.18% 75.74% in 0.02s train / 0.00s test
LogisticRegressionCV          : 85.25% 93.39% in 5.73s train / 0.00s test
SGDClassifier                 : 0.00% 45.19% in 0.02s train / 0.00s test
Perceptron                    : 0.00% 29.37% in 0.02s train / 0.00s test
RidgeClassifierCV             : 88.52% 94.45% in 0.05s train / 0.00s test


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RandomForestClassifier        : 78.69% 91.22% in 1.63s train / 0.57s test
AdaBoostClassifier            : 93.44% 97.71% in 0.68s train / 0.04s test
ExtraTreesClassifier          : 75.41% 90.02% in 0.11s train / 0.01s test
KNeighborsClassifier          : 13.11% 38.87% in 0.01s train / 0.01s test
MLPClassifier                 : 3.28% 24.30% in 0.69s train / 0.00s test


  'precision', 'predicted', average, warn_for)


In [57]:
Y_test_predict = cls_dict['LinearSVC'].predict(X_test)

[precision, recall, F1, support] = \
precision_recall_fscore_support(Y_test, Y_test_predict, average='samples')
accuracy = accuracy_score(Y_test, Y_test_predict)
print("Accuracy: {}, Precision: {},Recall: {}, F1: {}".format(accuracy, precision, recall, F1))

Accuracy: 0.7540983606557377, Precision: 0.8114754098360656,Recall: 0.7950819672131147, F1: 0.7950819672131147


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
