# Importing required libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns

In [None]:
import numpy as np
import neattext as nt
import neattext.functions as nfx

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Loading the Data Set

In [None]:
# Read the training set from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Run neattext's noise scan on every comment.
df['comment_text'].apply(lambda comment: nt.TextFrame(comment).noise_scan())

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Store the stopwords found in each comment in their own column.
df['comment_text_noise'] = df['comment_text'].apply(
    lambda comment: nt.TextExtractor(comment).extract_stopwords()
)

In [None]:
# Stopwords extracted from each comment.
df.loc[:, 'comment_text_noise']

In [None]:
# Strip stopwords; the result goes into a new column so the raw comments are kept.
df['comment_textnew'] = df['comment_text'].map(nfx.remove_stopwords)

In [None]:
# Remove URLs, phone numbers, punctuation and special characters.
# Order matters: URLs and phone numbers are recognised by the punctuation they
# contain ("://", ".", "-", "(" ...), so they must be removed BEFORE punctuation
# and special characters are stripped. Run the other way round, the URL pattern
# has nothing left to match and mangled URL fragments stay in the text.
df['comment_textnew'] = df['comment_textnew'].apply(nfx.remove_urls)
df['comment_textnew'] = df['comment_textnew'].apply(nfx.remove_phone_numbers)
# With the structured tokens gone, drop punctuation and any remaining special characters.
df['comment_textnew'] = df['comment_textnew'].apply(nfx.remove_punctuations)
df['comment_textnew'] = df['comment_textnew'].apply(nfx.remove_special_characters)

# Data Analysis

In [None]:
# Class balance of the `malignant` label. The Series is passed as `x=` because
# seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['malignant'])

In [None]:
# Exact number of rows per value of `malignant`.
df.loc[:, 'malignant'].value_counts()

In [None]:
# Class balance of the `highly_malignant` label. The Series is passed as `x=`
# because seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['highly_malignant'])

In [None]:
# Exact number of rows per value of `highly_malignant`.
df.loc[:, 'highly_malignant'].value_counts()

In [None]:
# Class balance of the `rude` label. The Series is passed as `x=` because
# seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['rude'])

In [None]:
# Exact number of rows per value of `rude`.
df.loc[:, 'rude'].value_counts()

In [None]:
# Class balance of the `threat` label. The Series is passed as `x=` because
# seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['threat'])

In [None]:
# Exact number of rows per value of `threat`.
df.loc[:, 'threat'].value_counts()

In [None]:
# Class balance of the `abuse` label. The Series is passed as `x=` because
# seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['abuse'])

In [None]:
# Exact number of rows per value of `abuse`.
df.loc[:, 'abuse'].value_counts()

In [None]:
# Class balance of the `loathe` label. The Series is passed as `x=` because
# seaborn only accepts `data` positionally; `x` is keyword-only.
sns.countplot(x=df['loathe'])

In [None]:
# Exact number of rows per value of `loathe`.
df.loc[:, 'loathe'].value_counts()

#### Although this is a multi-label classification problem, each label is analysed separately here. Every label is imbalanced: zeros outnumber ones for each label.

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Word cloud (at most 15 words) built from the comments labelled `loathe`.
loathe = df.loc[df['loathe'] == 1, 'comment_text']
loathe_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(loathe))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(loathe_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud (at most 15 words) built from the comments labelled `malignant`.
malignant = df.loc[df['malignant'] == 1, 'comment_text']
malignant_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(malignant))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(malignant_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud (at most 15 words) built from the comments labelled `highly_malignant`.
highly_malig = df.loc[df['highly_malignant'] == 1, 'comment_text']
hmali_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(highly_malig))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(hmali_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud (at most 15 words) built from the comments labelled `threat`.
threat = df.loc[df['threat'] == 1, 'comment_text']
threat_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(threat))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(threat_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud for comments labelled `highly_malignant` but not `abuse`.
# Both conditions are combined into one boolean mask and applied in a single lookup.
highly_malig = df.loc[
    (df['highly_malignant'] == 1) & (df['abuse'] == 0),
    'comment_text',
]
hmali_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(highly_malig))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(hmali_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud for comments that carry every one of the six labels.
# All conditions are combined into one boolean mask and applied in a single lookup.
all_one = df.loc[
    (df['highly_malignant'] == 1)
    & (df['abuse'] == 1)
    & (df['threat'] == 1)
    & (df['rude'] == 1)
    & (df['malignant'] == 1)
    & (df['loathe'] == 1),
    'comment_text',
]
all_cloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(all_one))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(all_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud (at most 15 words) built from the comments labelled `rude`.
rude = df.loc[df['rude'] == 1, 'comment_text']
rudecloud = WordCloud(
    width=700,
    height=500,
    background_color='white',
    max_words=15,
).generate(' '.join(rude))

plt.figure(figsize=(10, 8), facecolor='r')
plt.imshow(rudecloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

#### The most frequent words for each label are displayed in the word clouds above, both per label and for comments where all labels are present

In [None]:
# Character count of each comment before (`length`) and after (`clean_length`) cleaning.
df['length'] = df['comment_text'].str.len()
df['clean_length'] = df['comment_textnew'].str.len()
df.head(n=5)

Comparing the length of each comment before and after removing the stopwords

In [None]:
# Work on the first 80,000 rows only.
df1 = df.iloc[:80000]

# Model Building

In [None]:
# Fit a TF-IDF vocabulary on the cleaned comments and vectorise them in one step.
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(raw_documents=df1['comment_textnew'])

In [None]:
# Target matrix: one column per label.
y = df1.loc[
    :,
    ['malignant', 'highly_malignant', 'rude', 'threat', 'abuse', 'loathe'],
]

In [None]:
# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, Y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# One-vs-rest wrapper around an SVC with default settings, fitted on the training split.
clf = OneVsRestClassifier(SVC())
clf.fit(x_train, Y_train)

In [None]:
# Label predictions for the held-out split.
p = clf.predict(X=x_test)

In [None]:
# Accuracy of the SVC predictions against the held-out labels.
print(accuracy_score(y_true=y_test, y_pred=p))

In [None]:
# Per-label precision / recall / F1 for the SVC predictions.
# scikit-learn expects (y_true, y_pred); with the arguments swapped, precision
# and recall are exchanged and `support` counts predictions instead of true labels.
print(classification_report(y_test, p))

In [None]:
from sklearn.metrics import hamming_loss

In [None]:
# Hamming loss of the SVC predictions.
print("Hamming_loss:", hamming_loss(y_true=y_test, y_pred=p))

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
# One confusion matrix per label for the SVC predictions.
multilabel_confusion_matrix(y_true=y_test, y_pred=p)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# One-vs-rest decision trees. The tree is seeded so the reported scores are
# reproducible across kernel restarts, matching the seeded boosting models below.
clfd = OneVsRestClassifier(DecisionTreeClassifier(random_state=0)).fit(x_train,Y_train)

In [None]:
# Evaluate the decision-tree model on the held-out split.
p1 = clfd.predict(X=x_test)
print("Accuracy score", accuracy_score(y_true=y_test, y_pred=p1))
print("Hamming_loss:", hamming_loss(y_true=y_test, y_pred=p1))

In [None]:
# One-vs-rest k-nearest-neighbours with default settings: fit, predict, score.
clfk = OneVsRestClassifier(KNeighborsClassifier())
clfk.fit(x_train, Y_train)
p2 = clfk.predict(X=x_test)
print("Accuracy score", accuracy_score(y_true=y_test, y_pred=p2))
print("Hamming_loss:", hamming_loss(y_true=y_test, y_pred=p2))

Accuracy scores of different algorithms :


 - SVC : 91.06 
 
 - DecisionTree : 87.25 
  
 - KNN : 88.73

# Binary relevance

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [None]:
# Binary-relevance wrapper around Gaussian naive Bayes: fit, predict, score.
# NOTE(review): GaussianNB works on dense arrays; presumably the wrapper densifies
# the TF-IDF matrix here — confirm memory use on the full vocabulary.
classifier = BinaryRelevance(classifier=GaussianNB())
classifier.fit(x_train, Y_train)
predictions = classifier.predict(x_test)

print("Accuracy = ", accuracy_score(y_true=y_test, y_pred=predictions))

# Label Powerset

In [None]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import LogisticRegression


# Label-powerset wrapper around logistic regression: fit, predict, score.
classifier = LabelPowerset(classifier=LogisticRegression())
classifier.fit(x_train, Y_train)
predictions = classifier.predict(x_test)
print("Accuracy = ", accuracy_score(y_true=y_test, y_pred=predictions))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Ensemble Techniques

In [None]:
# One-vs-rest random forest (100 trees): fit, predict, score.
# The forest is seeded like the gradient-boosting and AdaBoost models so the
# reported scores are reproducible across kernel restarts.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
clfrf = OneVsRestClassifier(rf_classifier).fit(x_train, Y_train)
prf = clfrf.predict(x_test)
# Metrics take (y_true, y_pred).
print("Accuracy score", accuracy_score(y_test, prf))
print("Hamming_loss:", hamming_loss(y_test, prf))

In [None]:
# Gradient-boosting base estimator: 100 stages, fixed seed.
grd_boost = GradientBoostingClassifier(
    n_estimators=100,
    random_state=0,
)

In [None]:
# One-vs-rest gradient boosting: fit, predict, score.
clfgrd = OneVsRestClassifier(grd_boost)
clfgrd.fit(x_train, Y_train)
pgrd = clfgrd.predict(X=x_test)
print("Accuracy score", accuracy_score(y_true=y_test, y_pred=pgrd))
print("Hamming_loss:", hamming_loss(y_true=y_test, y_pred=pgrd))

In [None]:
# AdaBoost base estimator: 100 rounds, fixed seed.
ada_boost = AdaBoostClassifier(
    n_estimators=100,
    random_state=0,
)

In [None]:
# One-vs-rest AdaBoost: fit, predict, score.
clfada = OneVsRestClassifier(ada_boost)
clfada.fit(x_train, Y_train)
pada = clfada.predict(X=x_test)
print("Accuracy score", accuracy_score(y_true=y_test, y_pred=pada))
print("Hamming_loss:", hamming_loss(y_true=y_test, y_pred=pada))

#### Each ensemble technique gives almost the same accuracy score

In [None]:
# Remaining rows (from position 80,000 onward), used to build a second model.
df2 = df.iloc[80000:]

In [None]:
# Fit a fresh TF-IDF vocabulary on the second slice (rebinds `tfidf`).
# NOTE(review): this vectorises the raw `comment_text`, whereas the first slice
# used the cleaned `comment_textnew` — confirm that is intentional.
tfidf = TfidfVectorizer()
features2 = tfidf.fit_transform(raw_documents=df2['comment_text'])

In [None]:
# Target matrix for the second slice: one column per label.
y = df2.loc[
    :,
    ['malignant', 'highly_malignant', 'rude', 'threat', 'abuse', 'loathe'],
]

In [None]:
# Hold out 33% of the second slice for evaluation, using the same seed as before.
x_train, x_test, Y_train, y_test = train_test_split(
    features2,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# One-vs-rest SVC with default settings, fitted on the second slice's training split.
clf = OneVsRestClassifier(SVC())
clf.fit(x_train, Y_train)

In [None]:
# Evaluate the second SVC model on its held-out split.
p = clf.predict(x_test)
# All metrics take (y_true, y_pred). Swapping them in classification_report
# exchanges precision and recall and counts `support` from the predictions.
print("accuracy score:", accuracy_score(y_test, p))
print(classification_report(y_test, p))
print("Hamming_loss:", hamming_loss(y_test, p))

In [None]:
import pickle

In [None]:
# Persist the fitted classifier to disk. The context manager closes the file
# handle even if pickling fails, so the file is always flushed and released.
# NOTE(review): the fitted TF-IDF vectorizer is not saved, so this file alone
# cannot score new raw text — confirm whether it should be persisted too.
malignant_comments = 'comments.pkl'
with open(malignant_comments, 'wb') as model_file:
    pickle.dump(clf, model_file)

# Importing test data

In [None]:
# Read the test set from the CSV file in the working directory.
dftest = pd.read_csv(filepath_or_buffer='testmalig.csv')

In [None]:
# Preview the first five test rows.
dftest.head(n=5)

In [None]:
# Number of missing values in each test column.
print(dftest.isna().sum())

In [None]:
# Stack the training slice and the test set (training rows first) so both go
# through the same preprocessing. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` gives the same result and keeps each frame's original index.
dfn = pd.concat([df1, dftest])

In [None]:
# Strip stopwords from every comment in the combined frame.
dfn['comment_textnew'] = dfn['comment_text'].map(nfx.remove_stopwords)

In [None]:
# Remove URLs, phone numbers, punctuation and special characters.
# Order matters: URLs and phone numbers are recognised by the punctuation they
# contain, so they are removed BEFORE punctuation and special characters are
# stripped. Run the other way round, the URL pattern has nothing left to match.
dfn['comment_textnew'] = dfn['comment_textnew'].apply(nfx.remove_urls)
dfn['comment_textnew'] = dfn['comment_textnew'].apply(nfx.remove_phone_numbers)
dfn['comment_textnew'] = dfn['comment_textnew'].apply(nfx.remove_punctuations)
dfn['comment_textnew'] = dfn['comment_textnew'].apply(nfx.remove_special_characters)

In [None]:
# Character count of each test comment before and after cleaning.
dftest['length'] = dftest['comment_text'].str.len()
# The cleaned text was computed on the combined frame `dfn`, not on `dftest`,
# so `dftest.comment_textnew` does not exist yet. The test rows are the tail of
# `dfn` (everything after the `df1` rows); copy them over positionally.
dftest['comment_textnew'] = dfn['comment_textnew'].iloc[len(df1):].to_numpy()
dftest['clean_length'] = dftest['comment_textnew'].str.len()
dftest.head()

In [None]:
# Label columns of the combined frame: one column per label.
y = dfn.loc[
    :,
    ['malignant', 'highly_malignant', 'rude', 'threat', 'abuse', 'loathe'],
]

In [None]:
# Fit one TF-IDF vocabulary over the combined (train + test) cleaned comments.
tfidf = TfidfVectorizer()
featur = tfidf.fit_transform(raw_documents=dfn['comment_textnew'])

In [None]:
# Labels for the training rows only (the first 80,000 rows of the combined frame).
y1 = y.iloc[:80000]

In [None]:
# Feature rows for the training data only (the first 80,000 rows).
f = featur[0:80000]

In [None]:
# Hold out 33% of the training rows for evaluation, using the same seed as before.
x_train, x_test, Y_train, y_test = train_test_split(
    f,
    y1,
    test_size=0.33,
    random_state=42,
)

In [None]:
# One-vs-rest SVC with default settings, fitted on the training split.
clf = OneVsRestClassifier(SVC())
clf.fit(x_train, Y_train)

In [None]:
# Predict labels for the test rows of the combined feature matrix.
# Training rows occupy positions [0, 80000), so the test rows start at 80000;
# starting at 80001 silently dropped the first test comment.
pred_test = clf.predict(featur[80000:])

In [None]:
# Wrap the predicted label matrix in a DataFrame for inspection.
pdtest = pd.DataFrame(data=pred_test)

In [None]:
# Preview the first five rows of predictions.
pdtest.head(n=5)