# Import Libraries

In [None]:
!pip install bs4

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
from collections import Counter
from imblearn.over_sampling import SMOTE

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression,SGDClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import SVC

In [None]:
# Load the training CSV into a DataFrame
TRAIN_CSV_PATH = '../input/jigsaw-toxic-comment-train-and-test/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

# Preview the first rows to confirm the load worked
df.head()

In this notebook we focus on whether a comment is toxic or not. We only need the `toxic` and `comment_text` columns, so we will drop the others.

In [None]:
# Drop the id and the label columns that this notebook does not use, keeping
# only the comment text and the binary `toxic` label.
# The drop is not in-place and uses errors='ignore' so the cell can be re-run
# safely: an in-place drop raises KeyError once the columns are already gone.
df = df.drop(columns=['id','severe_toxic','obscene','threat','insult','identity_hate'], errors='ignore')

In [None]:
# shape of the dataset as a (rows, columns) tuple
df.shape

Let's see whether our dataset is balanced or imbalanced.

In [None]:
# Bar chart of the number of comments per class.
# The column is passed as `x=` because newer seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as x.
sns.countplot(x=df['toxic'])

In [None]:
# Number of comments in each class (1 = toxic, 0 = non-toxic)
df['toxic'].value_counts()

From the above we can clearly see that our dataset is imbalanced. We will handle that later; first we do some data visualization, because we need to understand our data.

# Data Visualization

In [None]:
# Word count per comment: coerce to str, split on whitespace, count the pieces
def _count_words(text):
    return len(str(text).split())

df['Number_of_words'] = df['comment_text'].map(_count_words)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

From the above we can see that the longest comment has 1411 words and the average length is 67 words. The shortest comments have just 1 word; let's see what those comments are and how many of them there are.

In [None]:
# Count the comments made of exactly one whitespace-separated token
one_word_mask = df['Number_of_words'] == 1
print('Number of sentences having one word are', int(one_word_mask.sum()))

In [None]:
# Show the text of every comment that contains exactly one word
df[df['Number_of_words']==1]['comment_text']

So these are basically links, random words and numbers; none of the one-word comments carries any real meaning.

In [None]:
# Histogram of the number of words per comment.
plt.style.use('ggplot')
plt.figure(figsize=(12,6))
# histplot replaces distplot, which seaborn has deprecated; with kde=False it
# draws the same plain histogram over the same 200 bins.
sns.histplot(df['Number_of_words'], kde=False, color="red", bins=200)
plt.title("Frequency distribution of number of words for each text extracted", size=20)

Now let's see toxic and non-toxic comments

In [None]:
# toxic comments: text of the rows labelled 1, renumbered from 0
toxic_comments = df.loc[df['toxic'] == 1, 'comment_text'].reset_index(drop=True)
# print the first five examples
for idx in range(5):
    print(toxic_comments[idx])

In [None]:
# non toxic comments: text of the rows labelled 0, renumbered from 0
non_toxic_comments = df.loc[df['toxic'] == 0, 'comment_text'].reset_index(drop=True)
# print the first five examples
for idx in range(5):
    print(non_toxic_comments[idx])

### Number of characters in sentence

In [None]:
# Side-by-side histograms of comment length in characters, one panel per class.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,8))

# Left panel: toxic comments (label 1)
text_len=df[df['toxic']==1]['comment_text'].str.len()
ax1.hist(text_len,color='orange')
ax1.set_title('Toxic Comment')
ax1.set_xlabel('Number of characters')
ax1.set_ylabel('Number of comments')

# Right panel: non-toxic comments (label 0); title typo 'Commet' corrected
text_len=df[df['toxic']==0]['comment_text'].str.len()
ax2.hist(text_len,color='yellow')
ax2.set_title('Non-Toxic Comment')
ax2.set_xlabel('Number of characters')
ax2.set_ylabel('Number of comments')

fig.suptitle('Characters in Sentence')
plt.show()

### Number of words in each text

In [None]:
# Side-by-side histograms of comment length in words, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# (axis, class label, bar colour, panel title) for each panel
panels = [
    (ax1, 1, 'red', 'Toxic Comments'),
    (ax2, 0, 'b', 'Non-Toxic Comment'),
]
for axis, label, colour, title in panels:
    # whitespace-split each comment and count the resulting tokens
    text_len = df[df['toxic'] == label]['comment_text'].str.split().map(len)
    axis.hist(text_len, color=colour)
    axis.set_title(title)

fig.suptitle('Words in Sentence')
plt.show()

## Tri-gram

In [None]:
# toxic: join all toxic comments into one string, then count word trigrams
is_toxic = df['toxic'] == 1
toxic_text = ' '.join(df.loc[is_toxic, 'comment_text'].values)
toxic_text_trigrams = list(ngrams(toxic_text.split(), 3))
# 30 most frequent trigrams with their counts
Counter(toxic_text_trigrams).most_common(30)

In [None]:
# non-toxic: join all non-toxic comments into one string, then count word trigrams
is_non_toxic = df['toxic'] == 0
non_toxic_text = ' '.join(df.loc[is_non_toxic, 'comment_text'].values)
non_toxic_text_trigrams = list(ngrams(non_toxic_text.split(), 3))
# 30 most frequent trigrams with their counts
Counter(non_toxic_text_trigrams).most_common(30)

# WordCloud

In [None]:
# word cloud of toxic and non-toxic comment, drawn side by side
cloud_options = dict(background_color='white', width=600, height=400)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[20, 5])

# left: all toxic comments joined into one text
wordcloud1 = WordCloud(**cloud_options).generate(" ".join(toxic_comments))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Toxic Comments', fontsize=40);

# right: all non-toxic comments joined into one text
wordcloud2 = WordCloud(**cloud_options).generate(" ".join(non_toxic_comments))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Non Toxic Comments', fontsize=40);

# Data Cleaning

It's time to clean our dataset

In [None]:
# Tokens to discard during cleaning: NLTK English stop words plus every
# ASCII punctuation character.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [None]:
def strip_html(text):
    """Parse `text` with BeautifulSoup's html.parser and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) removed."""
    # Raw string so that `\[` and `\]` reach the regex engine unchanged.
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URL's
def remove_urls(text):
    """Return `text` with every run starting at `http` up to the next whitespace removed.

    This function used to be a second definition of
    `remove_between_square_brackets`, which silently replaced the bracket
    remover above; it now has its own name so both steps run.
    """
    return re.sub(r'http\S+', '', text)


# Removing the stopwords from text
def remove_stopwords(text):
    """Lower-case `text` and keep only alphabetic tokens that are not in `stop`.

    Tokens are whitespace-separated; the kept tokens are re-joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        word = token.strip().lower()
        if word not in stop and word.isalpha():
            kept.append(word)
    return " ".join(kept)


# Removing the noisy text
def denoise_text(text):
    """Run the full cleaning pipeline on one comment and return the result.

    Steps, in order: strip HTML, drop `[...]` spans, drop URLs, then drop stop
    words and non-alphabetic tokens.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text
# Clean every comment with denoise_text, overwriting the raw text column
df['comment_text'] = df['comment_text'].map(denoise_text)

Let's see our cleaned data

In [None]:
# Show one raw comment next to its cleaned form.
# The same sentence is cleaned explicitly instead of reading
# df['comment_text'][0]: that row only corresponds to non_toxic_comments[0]
# when the first row of the dataset happens to be non-toxic.
sample_comment = non_toxic_comments[0]
print('ORIGINAL SENTENCE :',sample_comment)
print('-'*100)
print('CLEANED SENTENCE :',denoise_text(sample_comment))

# Model

In [None]:
# dependent and independent variable
# X: the comment text column (input); y: the binary `toxic` label (target)
X = df['comment_text']
y = df['toxic']

In [None]:
# countvectorizer: turn each comment into a sparse vector of token counts.
# Vectorize straight from the DataFrame column rather than reassigning X from
# itself, so re-running this cell does not feed the already-vectorized sparse
# matrix back into fit_transform.
cv = CountVectorizer()
X = cv.fit_transform(df['comment_text'])

In [None]:
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(X,y)


sns.countplot(Y_smote)

In [None]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X_smote, Y_smote, test_size = 0.20, random_state = 0)

## Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings
lr = LogisticRegression()
# Fitting the model on the training split
lr.fit(X_train,y_train)

In [None]:
# Predicting the Test set results with the logistic regression model
y_pred_lr = lr.predict(X_test)

In [None]:
# Accuracy, Precision,f1 and Recall of the logistic regression predictions
score1 = accuracy_score(y_test, y_pred_lr)
score2 = precision_score(y_test, y_pred_lr)
score3 = recall_score(y_test, y_pred_lr)
score4 = f1_score(y_test, y_pred_lr)

# accuracy is shown as a percentage, the others as 0-1 fractions
print("---- Scores ----")
print(f"Accuracy score is: {round(score1*100,2)}%")
print(f"Precision score is: {round(score2,2)}")
print(f"Recall score is: {round(score3,2)}")
print(f"F1 Score score is: {round(score4,2)}")

## Naive Bayes

In [None]:
# Fitting Naive Bayes to the Training set
# (multinomial variant, default settings, on the token-count features)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results with the Naive Bayes model
y_pred_nb = classifier.predict(X_test)

In [None]:
# Accuracy, Precision,f1 and Recall of the Naive Bayes predictions
score1 = accuracy_score(y_test, y_pred_nb)
score2 = precision_score(y_test, y_pred_nb)
score3 = recall_score(y_test, y_pred_nb)
score4 = f1_score(y_test, y_pred_nb)

# accuracy is shown as a percentage, the others as 0-1 fractions
print("---- Scores ----")
print(f"Accuracy score is: {round(score1*100,2)}%")
print(f"Precision score is: {round(score2,2)}")
print(f"Recall score is: {round(score3,2)}")
print(f"F1 Score score is: {round(score4,2)}")

## XgbClassifier

In [None]:
# xgbClassifier: XGBoost classifier with default settings, fitted on the training split
clf = XGBClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predicting the Test set results with the XGBoost model.
# Use `clf` (the XGBClassifier fitted above); `classifier` is the Naive Bayes
# model, so predicting with it only repeated the Naive Bayes results.
y_pred_xg = clf.predict(X_test)

In [None]:
# Accuracy, Precision,f1 and Recall of the XGBoost predictions
score1 = accuracy_score(y_test,y_pred_xg)
score2 = precision_score(y_test,y_pred_xg)
score3 = recall_score(y_test,y_pred_xg)
# F1 must be computed from the XGBoost predictions too; it previously used
# y_pred_nb and therefore reported the Naive Bayes F1 in this section.
score4 = f1_score(y_test,y_pred_xg)
print("---- Scores ----")
print("Accuracy score is: {}%".format(round(score1*100,2)))
print("Precision score is: {}".format(round(score2,2)))
print("Recall score is: {}".format(round(score3,2)))
print("F1 Score score is: {}".format(round(score4,2)))

# Save Model

In [None]:
# Serialize the fitted XGBoost model (`clf`) to disk.
# The context manager flushes and closes the file even if pickling fails;
# the handle was previously left open.
with open('toxic_comments.pkl', 'wb') as file:
    pickle.dump(clf, file)

In [None]:
# Serialize the fitted CountVectorizer so the same vocabulary can be reused
# when the saved model is loaded; `with` makes sure the file is closed.
with open('transform.pkl', 'wb') as transform_file:
    pickle.dump(cv, transform_file)

### Flask application
I have also built a Flask application for this problem, writing the whole code in Colab. If you want to know how to run a Flask application in Colab, see this link: https://www.kaggle.com/dikshabhati2002/run-flask-in-colab?scriptVersionId=55081927