In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input,  Activation
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import initializers, optimizers, layers
from sklearn.metrics import roc_auc_score

import warnings
warnings.simplefilter(action="ignore")



In [None]:
# Load the zipped train and test CSV files of the Jigsaw toxic-comment challenge.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
    )
)

In [None]:
# Preview the first few rows of the training data.

train.head()

In [None]:
# Shape of the training set as (rows, columns).

train.shape

In [None]:
# Preview the first few rows of the test data.
test.head()

In [None]:
# Shape of the test set as (rows, columns).
test.shape

In [None]:
# Summary statistics of the training data (pandas `describe`).
train.describe()

In [None]:
# Column dtypes and non-null counts for the training set.
train.info()

From the output above we can see that there are no missing values.
We can confirm this with the command in the next cell.



In [None]:
# Count the missing values in each column of the training set.
train.isnull().sum()

Yay! There are no missing values.

# Preprocessing the Comments of the Train Set

Data preprocessing is one of the critical steps in any machine learning project. It includes cleaning and formatting the data before feeding it into a machine learning algorithm. For NLP, the preprocessing steps typically comprise the following tasks:

* Tokenizing the string
* Lowercasing
* Removing stop words and punctuation
* Stemming


### Tokenize the String
To tokenize means to split a string into individual words, discarding blanks and tabs. In this same step, we will also convert each word in the string to lower case. The tokenize module from NLTK allows us to do this easily.


### Remove stop words and punctuations
The next step is to remove stop words and punctuation. Stop words are words that don't add significant meaning to the text. You'll see the list provided by NLTK when you run the cells below.

### Stemming
Stemming is the process of converting a word to its most general form, or stem. This helps in reducing the size of our vocabulary.

Consider the words:

* learn
* learning
* learned
* learnt
All of these words stem from the common root learn. However, in some cases the stemming process produces words that are not correct spellings of the root word, for example happi and sunni. That's because it chooses the most common stem for related words. For example, we can look at the set of words that comprises the different forms of happy:

* happy
* happiness
* happier
We can see that the prefix happi is more commonly used. We cannot choose happ because it is also the stem of unrelated words like happen.

In [None]:
# Download the NLTK stop-word corpus so that nltk.corpus.stopwords can be used later.
import nltk                                # Python library for NLP
nltk.download('stopwords')

In [None]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import regexp_tokenize   # module for tokenizing strings
from nltk.tokenize import TreebankWordTokenizer

The (currently commented-out) function below combines all of the above-mentioned steps into a single `process` function.

In [None]:
# def process(comment):
#     """Process  function.
#     Input:
#         comment: a string containing a comment
#     Output:
#         comments_clean: a list of words containing the processed comment
#     """
#     stemmer = PorterStemmer()
#     stopwords_english = stopwords.words('english')
#     # remove stock market tickers like $GE
#     comment = re.sub(r'\$\w*', '', comment)
#     # remove old style text "RT"
#     comment = re.sub(r'^RT[\s]+', '', comment)
#     # remove hyperlinks
#     comment = re.sub(r'https?:\/\/.*[\r\n]*', '', comment)
#     # remove hashtags
#     # only removing the hash # sign from the word
#     comment = re.sub(r'#', '', comment)
#     # tokenize comments
#     tokenizer = TreebankWordTokenizer()
#     comment_tokens = tokenizer.tokenize(comment)

#     comments_clean = []
#     for word in comment_tokens:
#         if (word not in stopwords_english and  # remove stopwords
#                 word not in string.punctuation):  # remove punctuation
#             # tweets_clean.append(word)
#             stem_word = stemmer.stem(word)  # stemming word
#             comments_clean.append(stem_word)

#     return comments_clean

In [None]:
# train['comment_text'] = train['comment_text'].apply(lambda x: process(x))

In [None]:
def clean_text(text):
    """Normalise a raw comment string for tokenisation.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, drop ASCII punctuation, turn newlines into spaces and
    drop every token that contains a digit.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        The cleaned comment as a ``str``. Runs of spaces left behind by the
        removals are not collapsed.
    """
    text = text.lower()
    # Raw strings throughout: '\[', '\S', '\w' and '\d' are invalid escape
    # sequences in normal string literals and trigger warnings on newer Pythons.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not ''), otherwise the last word of one
    # line is glued to the first word of the next ("hello\nworld" -> "helloworld").
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean every training comment in place with clean_text.

train['comment_text'] = train['comment_text'].apply(clean_text)

In [None]:
# Preview the training comments after cleaning.
train['comment_text'].head()

In [None]:
# Test comments before clean_text is applied.

test['comment_text'].head()

In [None]:
# Clean every test comment in place with clean_text.

test['comment_text'] = test['comment_text'].apply(clean_text)

In [None]:
# Test comments after clean_text has been applied.

test['comment_text'].head()

The next step is to remove stop words. Stop words are words that don't add significant meaning to the text.

In [None]:
# Defining a function to remove the stopwords

def remove_stopwords(text, stop_words=None):
    """Return the words of ``text`` that are not stop words.

    Args:
        text: either a string (split on whitespace into words) or an
            iterable of already-tokenised words.
        stop_words: optional iterable of words to drop. Defaults to NLTK's
            English stop-word list (``stopwords.words('english')``).

    Returns:
        A list of the remaining words, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call instead of reloading the stop-word
    # list for every single word.
    stop_words = frozenset(stop_words)

    # Iterating a str directly would filter individual characters, so split
    # strings into words first.
    tokens = text.split() if isinstance(text, str) else text
    return [word for word in tokens if word not in stop_words]

In [None]:
# # Applying the remove_stopwords on train set

# train['comment_text'] = train['comment_text'].apply(lambda x: remove_stopwords(x))
# train.head()

In [None]:
# # Applying the remove_stopwords on test set

# test['comment_text'] = test['comment_text'].apply(lambda x: remove_stopwords(x))
# test.head()

In [None]:
# Label columns of the multi-label task and their values as the target array.

cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
targets = train.loc[:, cols].values

# Comment text used as model input (these are Series, despite the `_df` suffix).
train_df, test_df = train['comment_text'], test['comment_text']

In [None]:
# Number of positive examples per label column.
val_counts = train[cols].sum()

fig, ax = plt.subplots(figsize=(8, 5))
# x and y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 onwards.
sns.barplot(x=val_counts.index, y=val_counts.values, alpha=0.8, ax=ax)

ax.set_title("Comments per Classes")
ax.set_xlabel("Various Comments Type")
ax.set_ylabel("Counts of the Comments")

# Write each bar's count just above the bar, centred horizontally.
rects = ax.patches
labels = val_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height+5, label, ha="center", va="bottom")


plt.show()

From the plot above we can see that toxic is the most frequent label, followed by obscene and insult.

##### Plotting the word Cloud

In [None]:
# Word cloud built from all training comments

from wordcloud import WordCloud

# Concatenate every training comment into one space-separated string.
words = ' '.join(train['comment_text'])

cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=500,        # maximum number of words shown in the cloud
    min_word_length=3,    # shorter words are left out
    max_font_size=150,    # font size range
    min_font_size=30,
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(words)

fig, ax = plt.subplots(figsize=(10, 16))
ax.imshow(word_cloud, interpolation="gaussian")
ax.set_title('Comments and their Nature', fontsize=40)
ax.axis("off")
plt.show()



In [None]:
# Word cloud built from all test comments

# Concatenate every test comment into one space-separated string.
words = ' '.join(test['comment_text'])

cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=500,        # maximum number of words shown in the cloud
    min_word_length=3,    # shorter words are left out
    max_font_size=150,    # font size range
    min_font_size=30,
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(words)

fig, ax = plt.subplots(figsize=(10, 16))
ax.imshow(word_cloud, interpolation="bilinear")
ax.set_title('Comments and their Nature', fontsize=40)
ax.axis("off")
plt.show()

In [None]:
# Fit the Keras tokenizer on the training comments only, then convert both
# the training and the test comments to sequences of word indices.
max_features = 22000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df.tolist())

tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_df, test_df)
)

In [None]:
# Pad/truncate every index sequence to a common length of `maxlen`.
maxlen = 200
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_train, tokenized_test)
)

In [None]:
embed_size, maxlen, max_features = 128, 200, 22000

# Functional-API graph: the input of `maxlen` token ids flows through the
# layers below in order; `x` ends up as the 6-unit sigmoid output tensor.
inp = Input(shape=(maxlen,))
layer_stack = [
    Embedding(max_features, embed_size),
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]

x = inp
for layer in layer_stack:
    x = layer(x)

In [None]:
# Wrap the graph in a Model and compile it with binary cross-entropy loss,
# the Adam optimizer and accuracy as the reported metric.
model = Model(inputs=inp, outputs=x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train for `epochs` passes over the data, holding out 10% of it for validation.
batch_size, epochs = 64, 2
model.fit(
    X_train,
    targets,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
)

In [None]:
# Run the trained model on the padded test sequences and display the result.
prediction = model.predict(X_test)
prediction

If you find this notebook useful, please upvote.

Feel free to comment with any suggestions or queries.