In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
df= pd.read_csv('malign_comments_train.csv')

In [3]:
df.isnull().sum()

id                  0
comment_text        0
malignant           0
highly_malignant    0
rude                0
threat              0
abuse               0
loathe              0
dtype: int64

In [4]:
import re
import nltk
from nltk.corpus import  stopwords

stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [5]:
def text_cleaner(text):
    clean_text = re.sub(r'@[A-Za-z0-9]+','',text)
    clean_text = re.sub('#','',clean_text)
    clean_text = re.sub(r"'s\b",'',clean_text)
    clean_text = re.sub(r'[%$#@&}{]','',clean_text)
    clean_text = re.sub(r'[.,:;!]','',clean_text)
    letters_only = re.sub("[^a-zA-Z]",' ',clean_text)
    
    lower_case = letters_only.lower()
    tokens = [w for w in lower_case.split() if not w in stop_words]
    clean_text=''
    for i in tokens:
        clean_text = clean_text + lemmatizer.lemmatize(i)+ ' '
    return clean_text.strip()

In [9]:
cleaned_text=[]
for i in df['comment_text']:
    cleaned_text.append(text_cleaner(i))

In [10]:
df['cleaned_comments'] = cleaned_text

In [11]:
# 1. Convert text into vectors using TF-IDF
# 2. Instantiate MultinomialNB classifier
# 3. Split feature and label
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

tf_vec = TfidfVectorizer()

naive = MultinomialNB()

features = tf_vec.fit_transform(df['cleaned_comments'])
X = features
y = df['malignant']



In [12]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

naive.fit(X_train,Y_train)

y_pred= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred))

Final score = >  0.9202867671019979


In [13]:
X_train.shape,x_test.shape

((119678, 181680), (39893, 181680))

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     36069
           1       0.99      0.17      0.29      3824

    accuracy                           0.92     39893
   macro avg       0.95      0.59      0.62     39893
weighted avg       0.93      0.92      0.89     39893



In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [17]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn = Knn.predict(x_test)
predknn
print('accuracy_score',accuracy_score(predknn,y_test))
print(confusion_matrix(y_test,predknn))
print(classification_report(y_test,predknn))

accuracy_score 0.921264382222445
[[35767   302]
 [ 2839   985]]
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     36069
           1       0.77      0.26      0.39      3824

    accuracy                           0.92     39893
   macro avg       0.85      0.62      0.67     39893
weighted avg       0.91      0.92      0.90     39893



In [18]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500,max_features ='sqrt',max_depth = 8,criterion='entropy')

rf.fit(X_train,Y_train)
predrf = rf.predict(x_test)
predrf

print('predicted',predrf)
print('actual',y_test)

print(accuracy_score(y_test,predrf))
print(confusion_matrix(y_test,predrf))
print(classification_report(y_test,predrf))

predicted [0 0 0 ... 0 0 0]
actual 48581     0
76053     0
3088      0
87356     0
101968    0
         ..
35002     0
89323     0
55330     0
3203      0
115491    0
Name: malignant, Length: 39893, dtype: int64
0.9041435840874339
[[36069     0]
 [ 3824     0]]
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     36069
           1       0.00      0.00      0.00      3824

    accuracy                           0.90     39893
   macro avg       0.45      0.50      0.47     39893
weighted avg       0.82      0.90      0.86     39893



In [19]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct = dct.predict(x_test)
preddct
print('accuracy_score',accuracy_score(preddct,y_test))
print(confusion_matrix(y_test,preddct))
print(classification_report(y_test,preddct))

accuracy_score 0.9420951044042815
[[35018  1051]
 [ 1259  2565]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     36069
           1       0.71      0.67      0.69      3824

    accuracy                           0.94     39893
   macro avg       0.84      0.82      0.83     39893
weighted avg       0.94      0.94      0.94     39893



In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
dctscores=cross_val_score(dct,X=X_train,y=Y_train,cv=5,scoring='accuracy')
print(dctscores)
print(dctscores.mean()*100,dctscores.std()*100)

[0.93791778 0.93871156 0.94163603 0.94288699 0.94008774]
94.02480195080483 0.1827955766098464


In [20]:
features = tf_vec.fit_transform(df['cleaned_comments'])
X = features
y = df['malignant']


In [25]:
X_test = tf_vec.transform(df1['cleaned_comments'])


In [26]:
X.shape,X_test.shape

((159571, 181680), (153164, 181680))

In [28]:
y_pred= naive.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
y_test

48581     0
76053     0
3088      0
87356     0
101968    0
         ..
35002     0
89323     0
55330     0
3203      0
115491    0
Name: malignant, Length: 39893, dtype: int64

In [19]:
df.columns

Index(['id', 'comment_text', 'malignant', 'highly_malignant', 'rude', 'threat',
       'abuse', 'loathe', 'cleaned_comments'],
      dtype='object')

## Highly Malignant 

In [22]:
y = df['highly_malignant']

In [23]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

naive.fit(X_train,Y_train)

y_pred2= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred2))

Final score = >  0.9899731782518236


In [24]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn2 = Knn.predict(x_test)
predknn2
print('accuracy_score',accuracy_score(predknn2,y_test))
print(confusion_matrix(y_test,predknn2))
print(classification_report(y_test,predknn2))

accuracy_score 0.987892612739077
[[39327   167]
 [  316    83]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     39494
           1       0.33      0.21      0.26       399

    accuracy                           0.99     39893
   macro avg       0.66      0.60      0.62     39893
weighted avg       0.99      0.99      0.99     39893



In [25]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct2 = dct.predict(x_test)
preddct2
print('accuracy_score',accuracy_score(preddct2,y_test))
print(confusion_matrix(y_test,preddct2))
print(classification_report(y_test,preddct2))

accuracy_score 0.9877923445215953
[[39299   195]
 [  292   107]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     39494
           1       0.35      0.27      0.31       399

    accuracy                           0.99     39893
   macro avg       0.67      0.63      0.65     39893
weighted avg       0.99      0.99      0.99     39893



## Rude

In [26]:
y = df['rude']

In [27]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)
  
naive.fit(X_train,Y_train)

y_pred3= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred3))

Final score = >  0.9520467249893465


In [28]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn3 = Knn.predict(x_test)

print('accuracy_score',accuracy_score(predknn3,y_test))
print(confusion_matrix(y_test,predknn3))
print(classification_report(y_test,predknn3))

accuracy_score 0.9552302408943925
[[37533   248]
 [ 1538   574]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     37781
           1       0.70      0.27      0.39      2112

    accuracy                           0.96     39893
   macro avg       0.83      0.63      0.68     39893
weighted avg       0.95      0.96      0.95     39893



In [29]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct3 = dct.predict(x_test)
preddct3
print('accuracy_score',accuracy_score(preddct3,y_test))
print(confusion_matrix(y_test,preddct3))
print(classification_report(y_test,preddct3))

accuracy_score 0.9740806657809641
[[37259   522]
 [  512  1600]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     37781
           1       0.75      0.76      0.76      2112

    accuracy                           0.97     39893
   macro avg       0.87      0.87      0.87     39893
weighted avg       0.97      0.97      0.97     39893



# Threat

In [30]:
y = df['threat']

In [31]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

naive.fit(X_train,Y_train)

y_pred4= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred4))

Final score = >  0.9969668864211767


In [32]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn4 = Knn.predict(x_test)
predknn4
print('accuracy_score',accuracy_score(predknn4,y_test))
print(confusion_matrix(y_test,predknn4))
print(classification_report(y_test,predknn4))

accuracy_score 0.9965908806056201
[[39747    26]
 [  110    10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39773
           1       0.28      0.08      0.13       120

    accuracy                           1.00     39893
   macro avg       0.64      0.54      0.56     39893
weighted avg       1.00      1.00      1.00     39893



In [33]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct4 = dct.predict(x_test)
preddct4
print('accuracy_score',accuracy_score(preddct4,y_test))
print(confusion_matrix(y_test,preddct4))
print(classification_report(y_test,preddct4))

accuracy_score 0.9966159476599905
[[39727    46]
 [   89    31]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39773
           1       0.40      0.26      0.31       120

    accuracy                           1.00     39893
   macro avg       0.70      0.63      0.66     39893
weighted avg       1.00      1.00      1.00     39893



# Abuse

In [34]:
y = df['abuse']

In [35]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

naive.fit(X_train,Y_train)

y_pred5= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred5))

Final score = >  0.9523725966961623


In [36]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn5= Knn.predict(x_test)
predknn5
print('accuracy_score',accuracy_score(predknn5,y_test))
print(confusion_matrix(y_test,predknn5))
print(classification_report(y_test,predknn5))

accuracy_score 0.9570350688090642
[[37713   211]
 [ 1503   466]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     37924
           1       0.69      0.24      0.35      1969

    accuracy                           0.96     39893
   macro avg       0.83      0.62      0.67     39893
weighted avg       0.95      0.96      0.95     39893



In [37]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct5 = dct.predict(x_test)
preddct5
print('accuracy_score',accuracy_score(preddct5,y_test))
print(confusion_matrix(y_test,preddct5))
print(classification_report(y_test,preddct5))

accuracy_score 0.9616223397588549
[[37180   744]
 [  787  1182]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     37924
           1       0.61      0.60      0.61      1969

    accuracy                           0.96     39893
   macro avg       0.80      0.79      0.79     39893
weighted avg       0.96      0.96      0.96     39893



# Loathe

In [38]:
 y = df['loathe']

In [39]:
# Train and predict
X_train,x_test,Y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

naive.fit(X_train,Y_train)

y_pred6= naive.predict(x_test)
print ('Final score = > ', accuracy_score(y_test,y_pred6))

Final score = >  0.9911763968616047


In [40]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=2 ,weights='distance',algorithm='auto')
Knn.fit(X_train,Y_train)
predknn6 = Knn.predict(x_test)
predknn6
print('accuracy_score',accuracy_score(predknn6,y_test))
print(confusion_matrix(y_test,predknn6))
print(classification_report(y_test,predknn6))

accuracy_score 0.9908505251547891
[[39482    60]
 [  305    46]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     39542
           1       0.43      0.13      0.20       351

    accuracy                           0.99     39893
   macro avg       0.71      0.56      0.60     39893
weighted avg       0.99      0.99      0.99     39893



In [41]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy')
dct.fit(X_train,Y_train)
preddct6 = dct.predict(x_test)
preddct6
print('accuracy_score',accuracy_score(preddct6,y_test))
print(confusion_matrix(y_test,preddct6))
print(classification_report(y_test,preddct6))

accuracy_score 0.9898729100343419
[[39383   159]
 [  245   106]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     39542
           1       0.40      0.30      0.34       351

    accuracy                           0.99     39893
   macro avg       0.70      0.65      0.67     39893
weighted avg       0.99      0.99      0.99     39893



In [42]:
df.columns

Index(['id', 'comment_text', 'malignant', 'highly_malignant', 'rude', 'threat',
       'abuse', 'loathe', 'cleaned_comments'],
      dtype='object')

In [43]:
list1=['MultinimialNB','Knn','DecisionTree']

In [44]:
list2 =[accuracy_score(y_test,y_pred),accuracy_score(predknn,y_test),accuracy_score(preddct,y_test)]

In [45]:
list3 =[accuracy_score(y_test,y_pred2),accuracy_score(predknn2,y_test),accuracy_score(preddct2,y_test)]

In [46]:
list4 =[accuracy_score(y_test,y_pred3),accuracy_score(predknn3,y_test),accuracy_score(preddct3,y_test)]

In [47]:
list5 =[accuracy_score(y_test,y_pred4),accuracy_score(predknn4,y_test),accuracy_score(preddct4,y_test)]

In [48]:
list6= [accuracy_score(y_test,y_pred5),accuracy_score(predknn5,y_test),accuracy_score(preddct5,y_test)]

In [49]:
list7 =[accuracy_score(y_test,y_pred6),accuracy_score(predknn6,y_test),accuracy_score(preddct6,y_test)]

In [50]:
model_performance=pd.DataFrame(list(zip(list1,list2,list3,list4,list5,list6,list7)),columns=['Model','malignment','highly_malignant','rude','threat','abuse','loathe'])

In [51]:
model_performance

Unnamed: 0,Model,malignment,highly_malignant,rude,threat,abuse,loathe
0,MultinimialNB,0.974808,0.991176,0.985812,0.991176,0.989372,0.991176
1,Knn,0.959291,0.985035,0.971048,0.990299,0.974783,0.990851
2,DecisionTree,0.901662,0.983782,0.938711,0.989271,0.943775,0.989873


In [30]:
# Now will predict our best performing model on test data.

In [52]:
df1= pd.read_csv('malign_comments_test.csv')

In [53]:
def text_cleaner(text):
    clean_text = re.sub(r'@[A-Za-z0-9]+','',text)
    clean_text = re.sub('#','',clean_text)
    clean_text = re.sub(r"'s\b",'',clean_text)
    clean_text = re.sub(r'[%$#@&}{]','',clean_text)
    clean_text = re.sub(r'[.,:;!]','',clean_text)
    letters_only = re.sub("[^a-zA-Z]",' ',clean_text)
    
    lower_case = letters_only.lower()
    tokens = [w for w in lower_case.split() if not w in stop_words]
    clean_text=''
    for i in tokens:
        clean_text = clean_text + lemmatizer.lemmatize(i)+ ' '
    return clean_text.strip()

In [54]:
cleaned_text1=[]
for i in df1['comment_text']:
    cleaned_text1.append(text_cleaner(i))
df1['cleaned_comments'] = cleaned_text1

In [55]:
X_test = tf_vec.transform(df1['cleaned_comments'])

In [56]:
X = features
y = df['malignant']

In [57]:
y_pred= naive.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# highly malignant

In [60]:
y = df['highly_malignant']

In [61]:
y_pred2= naive.predict(X_test)
y_pred2

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Rude

In [63]:
y = df['rude']

In [64]:
y_pred3= naive.predict(x_test)
y_pred3

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## threat

In [65]:
y=df['threat']

In [66]:
y_pred4= naive.predict(X_test)
y_pred4

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## abuse

In [67]:
y=df['abuse']

In [68]:
y_pred5 = naive.predict(X_test)
y_pred5

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## loathe

In [69]:
y=df['loathe']

In [70]:
y_pred6 = naive.predict(X_test)
y_pred6

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
df.columns

Index(['id', 'comment_text', 'malignant', 'highly_malignant', 'rude', 'threat',
       'abuse', 'loathe', 'cleaned_comments'],
      dtype='object')