In [217]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pandas as pd
import nltk
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import string
import re
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score

In [203]:
#load data
toxicWordsTrain = pd.read_csv("data/Toxic_train_set.csv");
toxicWordsTest = pd.read_csv("data/Toxic_test_set.csv");

list_classes = ["obscene", "threat", "insult", "hate", "Intolerant"]
x_train = toxicWordsTrain["Comments"]
y_train = toxicWordsTrain[list_classes].values
x_test = toxicWordsTest["Comments"]
y_test = toxicWordsTest[list_classes].values

In [204]:
stopWords = nltk.corpus.stopwords.words('english')
new_stop_words = ['?','!',',','.',';','&','>','<',')','(','/','\'s','\'\'','``']
stopWords.extend(new_stop_words)
new_stop_words_1 = ['I','thi','He','We','hi','everi','like','boy','march']
stopWords.extend(new_stop_words_1)

ps = nltk.PorterStemmer()

In [221]:
def comment_clean_ngram(comment):
    comment = "".join([word.lower() for word in comment if word not in string.punctuation])
    tokens = re.split('\W+', comment)
    comment = " ".join([ps.stem(word) for word in tokens if word not in stopWords])
    return comment

# dftr['Comments'] = dftr['Comments'].apply(lambda x: comment_clean_ngram(x))
# dftr.head()

In [255]:
vectUni = CountVectorizer()
tfidfUni = TfidfTransformer()

classifiers = [
    ('DecisionTreeClassifier', OneVsRestClassifier(DecisionTreeClassifier())),
    ('LinearSVC', OneVsRestClassifier(LinearSVC(random_state=23))),        
    ('LogisticRegression', OneVsRestClassifier(LogisticRegression())),    
    ('LogisticRegressionCV', OneVsRestClassifier(LogisticRegressionCV())),
    ('SGDClassifier', OneVsRestClassifier(SGDClassifier())),
    ('Perceptron', OneVsRestClassifier(Perceptron())),
    ('RidgeClassifierCV', OneVsRestClassifier(RidgeClassifierCV())),
    ('RandomForestClassifier', OneVsRestClassifier(RandomForestClassifier(n_estimators=100, n_jobs=10))),        
    ('AdaBoostClassifier', OneVsRestClassifier(AdaBoostClassifier())),    
    ('ExtraTreesClassifier', OneVsRestClassifier(ExtraTreesClassifier())),        
    ('KNeighborsClassifier', OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5))),    
    ('MLPClassifier', OneVsRestClassifier(MLPClassifier())),    
]

In [256]:
cls_dict = {}
i = 1
for clf_name, classifier in classifiers:           
    pipeline = Pipeline([
        ('vectorUnigram', vectUni),
        ('tfidf', tfidf),             
        ('classifier', OneVsRestClassifier(classifier))
    ])
    model = pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test) 
    cls_dict.update({clf_name: model})
    
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    f1 = fbeta_score(y_true=y_test, y_pred=y_pred, beta=1, average="weighted")
    print(("{clf_name:<30}: {acc:0.2f}% {f1:0.2f}%").format(clf_name=clf_name, acc=(acc * 100), f1=(f1 * 100)))    

DecisionTreeClassifier        : 51.61% 79.92%
LinearSVC                     : 64.52% 81.24%
LogisticRegression            : 29.03% 59.96%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LogisticRegressionCV          : 64.52% 78.95%
SGDClassifier                 : 66.13% 83.40%
Perceptron                    : 66.13% 84.59%
RidgeClassifierCV             : 59.68% 81.14%


  'precision', 'predicted', average, warn_for)


RandomForestClassifier        : 43.55% 69.64%
AdaBoostClassifier            : 62.90% 85.36%
ExtraTreesClassifier          : 51.61% 78.93%
KNeighborsClassifier          : 51.61% 76.57%




MLPClassifier                 : 56.45% 76.40%




In [190]:
# a,p,r,f1

Y_test_predict = cls_dict['DecisionTreeClassifier'].predict(x_test)

[precision, recall, F1, support] = \
precision_recall_fscore_support(y_test, Y_test_predict, average='samples')
accuracy = accuracy_score(y_test, Y_test_predict)
print("Accuracy: {}, Precision: {},Recall: {}, F1: {}".format(accuracy, precision, recall, F1))

Accuracy: 0.45161290322580644, Precision: 0.6344086021505375,Recall: 0.6774193548387096, F1: 0.6489247311827957


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [257]:
lvec = cls_dict['AdaBoostClassifier'].predict_proba(["I kill muslim you"])

for label, vec in zip(list_classes, lvec[0]):
    print("{}: {}".format(label, vec))
    

obscene: 0.3885015797442679
threat: 0.13239931710091143
insult: 0.3343813263502749
hate: 0.14982671710621948
Intolerant: 0.5877264708245531


In [194]:
for model, cls in zip(cls_dict, classifiers):    
    print(cls[0])
    #print(models[model])
    print(classification_report(y_test, models[model], target_names=list_classes))

DecisionTreeClassifier


KeyError: 'DecisionTreeClassifier'