In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import string
import unicodedata
import emoji
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv('../data/train.csv')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

def use_emoji(text):

    text=emoji.demojize(str(text))
    text=re.sub(':','',text)
    text=re.sub(r'\:(.*?)\:','',text)
    
    return text

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    only_ascii=re.sub(' +', ' ',(only_ascii.decode('utf-8')))
    return only_ascii

In [5]:
data['comment_text'] = data['comment_text'].apply(clean_text)
data['comment_text'] = data['comment_text'].apply(use_emoji)
data['comment_text'] = data['comment_text'].apply(remove_accents)

In [6]:
# To find y values for converting problem into Binary
temp = data.sum(axis=1,numeric_only=True)

y = temp.values
y[y > 0] = 1
np.unique(y,return_counts=True)

(array([0, 1], dtype=int64), array([143346,  16225], dtype=int64))

In [7]:
# Right now we are only interrested in finding if a comment is toxic or not so we extract only the comment col and toxicity col
x = data["comment_text"].values

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [9]:
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(127656,) (127656,)
(31915,) (31915,)


In [10]:
np.unique(y_test,return_counts="true")

(array([0, 1], dtype=int64), array([28678,  3237], dtype=int64))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [20]:
class Model:
    
    def __init__(self):
        self.tfidf = TfidfVectorizer(sublinear_tf=True,min_df=4,norm='l2', ngram_range=(1, 2), stop_words='english')
        self.lsvc = LinearSVC(max_iter=5000)
    
    def fit(self,x,y):
        
        x_vec = self.tfidf.fit_transform(x)
        self.lsvc.fit(x_vec,y)
    
    def clean_text(self,text):
        '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.'''
        text = text.lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', ' ', text)
        text = re.sub('\w*\d\w*', '', text)

        return text

    def use_emoji(self,text):

        text=emoji.demojize(str(text))
        text=re.sub(':','',text)
        text=re.sub(r'\:(.*?)\:','',text)

        return text

    def remove_accents(self,input_str):
        
        nfkd_form = unicodedata.normalize('NFKD', input_str)
        only_ascii = nfkd_form.encode('ASCII', 'ignore')
        only_ascii=re.sub(' +', ' ',(only_ascii.decode('utf-8')))
        return only_ascii
    
    def text_preProcess(self,text):
        
        text = self.clean_text(text)
        text = self.use_emoji(text)
        text = self.remove_accents(text)
        
        return text
    
    def predict(self,comment):
        
        comment = self.text_preProcess(comment)
        
        # This step is required because tfidf.transform takes a list as an argument.
        temp = []
        temp.append(comment)
        
        xt_vec = self.tfidf.transform(temp)
        pred = self.lsvc.predict(xt_vec)
        
        return pred

In [21]:
model = Model()

In [22]:
model.fit(x_train,y_train)

In [23]:
y_pred = []

for test_comment in x_test:
    
    pred = model.predict(test_comment)
    y_pred.append(pred)

In [24]:
f1_score(y_test,y_pred)

0.7806987201660325

In [25]:
confusion_matrix(y_test,y_pred)

array([[28390,   288],
       [  980,  2257]], dtype=int64)

In [26]:
def predict(classifier,comment):
    
    pred = classifier.predict(comment)
    
    if pred == 1:
        return "Toxic"
    else:
        return "Non Toxic"

In [27]:
predict(model,"Can you please shut up?")

'Toxic'

In [28]:
predict(model,"Thanks a lot for explaining!")

'Non Toxic'

In [29]:
predict(model,"man you lack brain cells")

'Toxic'

In [30]:
predict(model,"I really liked your approach to the problem.")

'Non Toxic'

In [31]:
predict(model,"You're ugly")

'Toxic'

In [32]:
predict(model,"Kill yourself")

'Toxic'

From the above confusion matrix we can clearly see that a lot of Toxic comments are being classified as Non-Toxic. One of the reason for this could be because in our training sample we have more than 1.4 lakh samples of Non-toxic Class and only 16K samples of Toxic Class.

Let's see if we can improve our results by decreasing the samples of Non-Toxic Class.

In [33]:
# First filtering comments into non_toxic and toxic class.
non_toxic = x[y==0]
toxic = x[y==1]

In [34]:
# Randomly selecting 20000 non toxic comments.
ids = np.arange(non_toxic.shape[0])
np.random.shuffle(ids)
ids = ids[:20000]

non_toxic_selected = []

for i in ids:
    non_toxic_selected.append(non_toxic[i])

non_toxic_selected = np.asarray(non_toxic_selected)
print(non_toxic_selected.shape,toxic.shape)

(20000,) (16225,)


In [35]:
# Merging non toxic and toxic comments into one array
x_new = np.concatenate((non_toxic_selected,toxic))
print(x_new.shape)

(36225,)


In [36]:
y_new = np.zeros((36225,),dtype="int")
y_new[20000:] = 1

In [37]:
x_train2,x_test2,y_train2,y_test2 = train_test_split(x_new,y_new,test_size=0.2)

In [38]:
print(x_train2.shape,y_train2.shape)
print(x_test2.shape,y_test2.shape)

(28980,) (28980,)
(7245,) (7245,)


In [39]:
np.unique(y_test,return_counts="true")

(array([0, 1], dtype=int64), array([28678,  3237], dtype=int64))

In [40]:
model2 = Model()

In [41]:
model2.fit(x_train2,y_train2)

In [42]:
y_pred2 = []

for test_comment in x_test2:
    
    pred = model2.predict(test_comment)
    y_pred2.append(pred)

In [43]:
f1_score(y_test2,y_pred2)

0.8773151812569258

In [44]:
confusion_matrix(y_test2,y_pred2)

array([[3699,  267],
       [ 508, 2771]], dtype=int64)

As expected our predictions have improved.

What if we further decrease sample size of Non-Toxic Class? 

In [45]:
# Randomly selecting 17000 non toxic comments.
ids = np.arange(non_toxic.shape[0])
np.random.shuffle(ids)
ids = ids[:17000]

non_toxic_selected = []

for i in ids:
    non_toxic_selected.append(non_toxic[i])

non_toxic_selected = np.asarray(non_toxic_selected)
print(non_toxic_selected.shape,toxic.shape)

(17000,) (16225,)


In [46]:
# Merging non toxic and toxic comments into one array
x_new2 = np.concatenate((non_toxic_selected,toxic))
print(x_new2.shape)

(33225,)


In [47]:
y_new2 = np.zeros((33225,),dtype="int")
y_new2[17000:] = 1

In [48]:
x_train3,x_test3,y_train3,y_test3 = train_test_split(x_new2,y_new2,test_size=0.2)

In [49]:
print(x_train3.shape,y_train3.shape)
print(x_test3.shape,y_test3.shape)

(26580,) (26580,)
(6645,) (6645,)


In [50]:
model3 = Model()

In [51]:
model3.fit(x_train3,y_train3)

In [52]:
y_pred3 = []

for test_comment in x_test3:
    
    pred = model3.predict(test_comment)
    y_pred3.append(pred)

In [53]:
f1_score(y_test3,y_pred3)

0.8879759835677042

In [54]:
confusion_matrix(y_test3,y_pred3)

array([[3126,  278],
       [ 431, 2810]], dtype=int64)

Our score has improved by 1% but our precision has decreased i.e. chances of our model classifying a non-toxic comment as toxic have increased which we don't want. So we're gonna go with Model2

In [55]:
predict(model2,"Can you please shut up?")

'Toxic'

In [56]:
predict(model2,"Thanks a lot for explaining!")

'Non Toxic'

In [57]:
predict(model2,"you're an idiot")

'Toxic'

In [58]:
predict(model2,"man you lack brain cells")

'Toxic'

In [59]:
predict(model2,"I really liked your approach to the problem.")

'Non Toxic'

In [60]:
predict(model2,"You're ugly")

'Toxic'

In [61]:
predict(model2,'Kill yourself')

'Toxic'