In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
import re, string

In [18]:
# Load the labelled comments; train.csv is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [19]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [20]:
# Label columns of the dataset, in the order they appear in train.csv.
# 'threat' is included: the file carries it as a label column alongside the
# other five, and leaving it out silently dropped that target from y.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [21]:
# Features: keep comment_text as a one-column DataFrame (not a Series).
X = data.loc[:, ['comment_text']]
# Targets: one column per entry in label_cols.
y = data.loc[:, label_cols]

In [22]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Capturing pattern that matches a single punctuation/symbol character.
# re.escape keeps every ASCII punctuation character literal inside the class:
# interpolated raw, the "\]" pair in string.punctuation is parsed as an escaped
# bracket, so a backslash on its own was never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

In [24]:
def tokenize(s):
    """Split text on whitespace, emitting punctuation as separate tokens.

    Every character matched by the module-level ``re_tok`` pattern is padded
    with a space on each side before the string is split, so a word and the
    punctuation attached to it end up as distinct tokens.

    Args:
        s: the text to tokenize (a str).

    Returns:
        A list of str tokens; empty when ``s`` is empty or only whitespace.
    """
    # The replacement has to pad the captured character with spaces. A bare
    # r'\1' writes the match back unchanged, which made the substitution a
    # no-op and left punctuation glued to the neighbouring word.
    return re_tok.sub(r' \1 ', s).split()

In [25]:
# TF-IDF over unigrams and bigrams of the tokens produced by `tokenize`.
# The three on/off options are passed as real booleans: scikit-learn validates
# parameter types at fit time and recent releases reject the integer 1 for
# bool-typed options.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,             # ignore terms found in fewer than 3 documents
    max_df=0.9,           # ignore terms found in more than 90% of documents
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,    # scale term frequency as 1 + log(tf)
)

In [26]:
# Fit the vocabulary and IDF weights on the training comments only, then
# encode the held-out comments with that already-fitted vectorizer.
trn_vec_doc = vec.fit_transform(X_train.loc[:, 'comment_text'])
test_vec_doc = vec.transform(X_test.loc[:, 'comment_text'])

In [None]:
# Random forest for the 'toxic' label alone, trained on the TF-IDF matrix.
# n_jobs=-1 builds the trees on all available cores; the seed is still fixed
# through random_state, so this is meant to cut wall-clock time only.
toxic_rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)
toxic_rfc.fit(trn_vec_doc, y_train['toxic'])

toxic_pred = toxic_rfc.predict(test_vec_doc)

# First line printed is accuracy, second is F1, both on the held-out split.
print(accuracy_score(y_test['toxic'], toxic_pred),'\n',f1_score(y_test['toxic'], toxic_pred))

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test['toxic'], toxic_pred))