In [None]:
# Libraries
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
!pip install clean-text
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Collect the training data, drop incomplete rows and remove duplicated
# comments, ending with a two-column frame (`comment_text`, `toxic`).

# Data from the "Jigsaw Multilingual Toxic Comment Classification" competition.
df1a = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv')
df1b = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv')
df1 = pd.concat([df1a,df1b])
# dropna() discards every row that has a missing value in any column.
# NOTE(review): if the test file has no `toxic` column, all of its rows are
# dropped here — confirm that this is intended.
df1 = df1.dropna()

# Take an explicit copy: assigning to a column of a plain slice of `df1`
# raises SettingWithCopyWarning and is ambiguous about which frame is modified.
df = df1[['comment_text','toxic']].copy()
df['toxic'] = df['toxic'].astype(int)

# Remove duplicated (comment_text, toxic) pairs.
# NOTE(review): keep=False drops *every* copy of a duplicated row, not just the
# extra ones; switch to keep='first' if one copy should survive.
df = df.drop_duplicates(subset=["comment_text", 'toxic'], keep=False)
df.head()

In [None]:
# Visualise the class balance: number of rows labelled 0 (non-toxic) versus
# rows labelled 1 (toxic), shown as a pie chart.
fig = px.pie(
    values=[int((df['toxic'] == label).sum()) for label in (0, 1)],
    names=['Non-toxic','Toxic'],
    title='Distribution of Toxic and Non-toxic comments',
)
fig.show()

In [None]:
# The classes are clearly imbalanced, so balance them by down-sampling the
# non-toxic comments to the number of toxic comments.
toxic = df[df['toxic']==1]
# Size of the toxic class; derived from the very frame that is concatenated
# below so both halves are guaranteed to have the same length.
counts = len(toxic)
# Fixed seed: without it a different subset is drawn on every run, so the
# features, plots and trained model would not be reproducible.
non_toxic = df[df['toxic']==0].sample(counts, random_state=42)
df = pd.concat([toxic,non_toxic])

# Re-draw the pie chart to confirm the two classes are now the same size.
fig = px.pie(values=[len(df[df['toxic']==0]),len(df[df['toxic']==1])], names=['Non-toxic','Toxic'], title='Distribution of Toxic and Non-toxic comments')
fig.show()

In [None]:
# Derive additional numeric features from the raw comment text; they are used
# for the visualisations further down.

#1 Number of characters
df['no_of_char'] = df['comment_text'].progress_apply(len)

#2 Number of words (whitespace-separated)
df['no_of_words'] = df['comment_text'].progress_apply(lambda text: len(text.split()))

#3 Number of capital characters
df['no_of_cap_chars'] = df['comment_text'].progress_apply(
    lambda text: sum(1 for ch in text if ch.isupper())
)

#4 Number of capital words (whitespace-separated tokens for which isupper() is true)
df['no_of_cap_words'] = df['comment_text'].progress_apply(
    lambda text: sum(1 for word in text.split() if word.isupper())
)

#5 Number of punctuation
def no_of_punc(text):
    """Return how many characters of ``text`` are punctuation marks.

    A character is counted when it occurs in ``string.punctuation``.

    Parameters
    ----------
    text : str
        The text to scan, character by character.

    Returns
    -------
    int
        Number of punctuation characters found (0 for an empty string).
    """
    import string
    marks = string.punctuation
    return sum(1 for ch in text if ch in marks)

# Punctuation count per comment, computed with no_of_punc.
df['no_of_punctuations'] = df['comment_text'].progress_apply(no_of_punc)

#6 number of stopwords
def count_stopwords(text, stop_words=None):
    """Count the English stop words that occur in ``text``.

    The text is split with ``word_tokenize`` and every token is compared,
    case-insensitively, against the stop-word collection. The lookup is
    case-insensitive because this function is applied to the raw
    (not yet lower-cased) comments, where a capitalised stop word such as
    "The" would otherwise be missed.

    Parameters
    ----------
    text : str
        The text to analyse.
    stop_words : collection of str, optional
        Lower-case stop words to match against. When omitted, NLTK's English
        stop-word list is used; it is loaded on the first call and cached on
        the function so it is not re-read for every comment.

    Returns
    -------
    int
        Number of tokens that are stop words.
    """
    if stop_words is None:
        stop_words = getattr(count_stopwords, "_english_stop_words", None)
        if stop_words is None:
            # Load once; re-reading the corpus for every row is needlessly slow.
            stop_words = frozenset(stopwords.words('english'))
            count_stopwords._english_stop_words = stop_words
    return sum(1 for token in word_tokenize(text) if token.lower() in stop_words)
df['no_of_stopwords'] = df['comment_text'].progress_apply(count_stopwords)

#7 Number of unique words (distinct whitespace-separated tokens)
df['no_of_unique_words'] = df['comment_text'].progress_apply(lambda text: len(set(text.split())))

#8 Average word length: total characters divided by number of words
df['avg_word_length'] = (df['no_of_char'] / df['no_of_words']).round(3)

#9 Ratio of unique words to total words
df['unique_vs_words'] = (df['no_of_unique_words'] / df['no_of_words']).round(3)

#10 Ratio of stop words to total words
df['stop_vs_words'] = (df['no_of_stopwords'] / df['no_of_words']).round(3)

In [None]:
# Text preprocessing: turn every raw comment into a list of cleaned tokens.

# Lower-case the text
df['comment_text'] = df['comment_text'].progress_apply(lambda x:x.lower())

# Remove punctuation: delete every character that is not a letter, a space,
# a hyphen or the digit 9.
# NOTE(review): "-9" in the character class looks like a typo for "0-9"; it is
# left as is because digits and punctuation are stripped again by clean() in
# func90 further down.
df['comment_text'] = df['comment_text'].progress_apply(lambda x:re.sub("[^-9A-Za-z ]", "" , x))

# Tokenize
df['comment_text'] = df['comment_text'].progress_apply(lambda x:word_tokenize(x))

# Remove stop words. The set is built once and reused; rebuilding it from the
# corpus for every single token made this step extremely slow.
stop_words = set(stopwords.words('english'))
df['comment_text'] = df['comment_text'].progress_apply(lambda x:[i for i in x if i not in stop_words])

# Drop comments that have no tokens left. The explicit copy keeps later column
# assignments from operating on a slice (SettingWithCopyWarning).
df = df[df['comment_text'].map(len)>0].copy()

# now cleaning text with help cleantext
from cleantext import clean

def func90(s1):
    """Normalise a tokenised comment with ``cleantext.clean`` and re-tokenise it.

    The tokens are joined with single spaces, passed through ``clean`` and the
    cleaned string is split again with ``word_tokenize``.

    Parameters
    ----------
    s1 : iterable of str
        Tokens of one comment.

    Returns
    -------
    list of str
        Tokens of the cleaned text.
    """
    joined = " ".join(s1)
    # Every replace_with_* option is the empty string, i.e. whatever the
    # matching no_* switch targets is removed rather than substituted.
    cleaning_options = dict(
        fix_unicode=True,
        to_ascii=True,
        no_line_breaks=True,
        no_urls=True,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=True,
        no_punct=True,
        replace_with_punct="",
        replace_with_url="",
        replace_with_number="",
        replace_with_digit="",
        replace_with_currency_symbol="",
        lang='en',
    )
    return word_tokenize(clean(joined, **cleaning_options))
# Run the cleantext-based normalisation (func90) over every tokenised comment.
df['comment_text'] = df['comment_text'].progress_apply(func90)

# Lemmatization: create one WordNet lemmatizer instance that func10 reuses.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def func10(s):
    """Lemmatize every token of ``s`` with the module-level ``lemmatizer``.

    Parameters
    ----------
    s : iterable of str
        Tokens of one comment.

    Returns
    -------
    list of str
        The lemmatized tokens, in the same order.
    """
    return [lemmatizer.lemmatize(token) for token in s]

# Lemmatize the tokens of every comment.
df['comment_text'] = df['comment_text'].progress_apply(func10)

In [None]:
# Area chart of the `no_of_char` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_char",
    color="toxic",
    title='Number of Characters in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_words` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_words",
    color="toxic",
    title='Number of words in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_cap_chars` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_cap_chars",
    color="toxic",
    title='Number of Capital Characters in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_cap_words` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_cap_words",
    color="toxic",
    title='Number of Capital Words in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_punctuations` feature, coloured by the `toxic` label.
# The displayed title previously read "Pucntuations"; the spelling is corrected.
fig = px.area(
    df,
    y="no_of_punctuations",
    color="toxic",
    title='Number of Punctuations in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_stopwords` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_stopwords",
    color="toxic",
    title='Number of StopWords in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `no_of_unique_words` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="no_of_unique_words",
    color="toxic",
    title='Number of Unique Words in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `avg_word_length` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="avg_word_length",
    color="toxic",
    title='Avg Words Length in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `unique_vs_words` feature, coloured by the `toxic` label.
# `unique_vs_words` is the ratio of unique words to total words, so the title
# says so; it previously read "Unique vs StopWords", which mislabelled the chart.
fig = px.area(
    df,
    y="unique_vs_words",
    color="toxic",
    title='Unique Words vs Words in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# Area chart of the `stop_vs_words` feature, coloured by the `toxic` label.
fig = px.area(
    df,
    y="stop_vs_words",
    color="toxic",
    title='StopWords vs Words in Comments (toxic/non-toxic)',
)
fig.show()

In [None]:
# We will build two models: a simple ANN and a text CNN.
# The ANN will be trained on the features we created from the text data,
# and the text CNN will be trained on the text data itself.

In [None]:
# Prepare the provided validation data.
# The file is read once; it was previously loaded twice from disk.
validation_pairs = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
pair_columns = ['worker','less_toxic','more_toxic']

# Toxic side: the `more_toxic` text of every pair, labelled 1.
df2toxic = validation_pairs.drop(pair_columns, axis=1).assign(
    comment_text=validation_pairs['more_toxic'], toxic=1
)

# Non-toxic side: the `less_toxic` text of every pair, labelled 0.
df2nontoxic = validation_pairs.drop(pair_columns, axis=1).assign(
    comment_text=validation_pairs['less_toxic'], toxic=0
)

# Merge both sides into one frame with a fresh, unique index.
# NOTE(review): the source labels are relative (more/less toxic within a pair),
# so the same text may end up with both labels — confirm this is acceptable
# for a binary validation set.
df2 = pd.concat([df2toxic,df2nontoxic], ignore_index=True)


# Apply the same preprocessing as for the training data.
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:x.lower())
# Remove punctuation (same pattern as for the training data: keeps letters,
# spaces, '-' and the digit 9).
# NOTE(review): "-9" looks like a typo for "0-9"; kept identical to the
# training pipeline so both sets are processed the same way.
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:re.sub("[^-9A-Za-z ]", "" , x))
# Tokenize
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:word_tokenize(x))
# Remove stop words; the set is built once instead of once per token.
stop_words = set(stopwords.words('english'))
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:[i for i in x if i not in stop_words])

# Drop comments with no tokens left; copy so later assignments do not hit a slice.
df2 = df2[df2['comment_text'].map(len)>0].copy()
# Clean the text with cleantext (func90)
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:func90(x))
# Lemmatization (func10)
df2['comment_text'] = df2['comment_text'].progress_apply(lambda x:func10(x))


In [None]:
# Training data: token lists and labels
x1 = df['comment_text']
y1 = df['toxic']

# Validation data: token lists and labels
x2 = df2['comment_text']
y2 = df2['toxic']

from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the word index on the training comments only, then encode both sets
# with that same index.
tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(x1)
Xcnn_train = tokenizer.texts_to_sequences(x1)
Xcnn_test = tokenizer.texts_to_sequences(x2)
# With num_words set, texts_to_sequences only emits indices below num_words,
# while word_index still lists every word seen. Sizing the embedding from the
# full word_index would allocate rows that can never be looked up, so cap it.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
maxlen = 100
# Bring both encoded sets to a fixed length of `maxlen`, padding after the
# tokens (padding='post').
Xcnn_train, Xcnn_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (Xcnn_train, Xcnn_test)
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Conv1D, GlobalMaxPooling1D,Dense,Dropout,LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping

embedding_dim = 200

# Text CNN, assembled layer by layer:
# embedding -> 1D convolution -> global max pooling -> small dense head.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(Conv1D(128, 5))
model.add(LeakyReLU(alpha=0.05))
model.add(GlobalMaxPooling1D())
model.add(Dense(10))
model.add(LeakyReLU(alpha=0.05))
# Single sigmoid unit for the binary toxic / non-toxic output.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train the CNN on the padded training sequences and evaluate on the
# validation sequences after every epoch (at most 15 epochs).
model_perfomence = model.fit(
    Xcnn_train,
    y=y1,
    validation_data=(Xcnn_test, y2),
    epochs=15,
    # `callbacks` is documented as a list of callback instances.
    callbacks=[
        EarlyStopping(
            patience=3,
            # Monitor the validation loss: validation data is supplied, and
            # restore_best_weights should pick the epoch that generalises
            # best. Monitoring the training loss (as before) would almost
            # always select the most recent epoch.
            monitor='val_loss',
            restore_best_weights=True,
            mode='min',
            verbose=1,
        )
    ],
)

In [None]:
def vis_data(datah):
    """Plot training and validation accuracy per epoch as a line chart.

    Parameters
    ----------
    datah : object
        Must expose a ``history`` mapping that contains the keys
        ``'accuracy'`` and ``'val_accuracy'`` (for example the object
        returned by ``model.fit``).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``; nothing is returned.
    """
    import plotly.express as px
    # The title previously read "Trainng"; the spelling is corrected.
    fig = px.line(datah.history, y=['accuracy','val_accuracy'],title='Training & Validation accuracy')
    fig.show()

In [None]:
# Plot the accuracy curves recorded in the object returned by model.fit.
vis_data(model_perfomence)