# Introduction
In this notebook I classify toxic comments into several toxicity categories (multi-label classification: one comment can belong to more than one category). It is also an opportunity to practice my TensorFlow skills.

# Data understanding

In [None]:
import pandas as pd
import os
import tensorflow as tf
import numpy as np

In [None]:
# Seed for reproducibility; passed as random_state to train_test_split further down.
seed = 1
# Directory that holds the competition's zipped CSV files (train, test, sample submission).
pathData = '../input/jigsaw-toxic-comment-classification-challenge'
# NOTE(review): not referenced by any visible cell — model.fit passes batch_size=32 directly.
batchSize = 128

In [None]:
# Show up to 200 characters per cell so long comments stay readable in previews.
# The fully-qualified option name is used because abbreviated names rely on pattern
# matching and can break if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 200)

## Training data

In [None]:
# Read the zipped training CSV, report its dimensions and preview the first rows.
trainPath = os.path.join(pathData, 'train.csv.zip')
dsTrain = pd.read_csv(trainPath)
print(f'Shape: {dsTrain.shape}')
dsTrain.head(20)

In [None]:
# Column dtypes and non-null counts of the training frame.
dsTrain.info()

Identifying empty comments

In [None]:
# A comment counts as "without text" when it is missing (NaN), empty, or made only of
# whitespace. The check is vectorised; casting to str first lets .str.strip() work on
# any non-string values, while the separate isna() test catches missing entries.
commentText = dsTrain['comment_text']
isBlank = commentText.isna() | commentText.astype(str).str.strip().eq('')

# Index labels of the offending rows, kept as a plain list.
blanks = dsTrain.index[isBlank].tolist()
print(f'Number of observations without text: {len(blanks)}')

Identifying comments with more than one classification

In [None]:
# Number of labels set on each comment (row-wise sum of the six label columns).
labelColumns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
dsTrain['countToxic'] = dsTrain[labelColumns].sum(axis=1)

# Keep the per-comment label count next to its id. Filtering on the count itself
# (rather than on the number of rows per id) is what reveals multi-label comments.
dsTrainCount = dsTrain[['id', 'countToxic']]
print(f"Comments with more than one label: {(dsTrainCount['countToxic'] > 1).sum()}")
dsTrainCount[dsTrainCount['countToxic'] > 1].head(20)

Observations:
* There are no null values.
* There are no empty values.
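* There are no observations with a repeated `id`. Note that a single comment can still carry more than one label, because the six label columns are not mutually exclusive.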

### Cleaning
In this step I remove URLs, numbers and special characters (keeping only letters and apostrophes) and lowercase the text, because those tokens do not help to identify toxic comments.

In [None]:
import re
import spacy

In [None]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other visible cell — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
def customCleaning(text):
  '''Return `text` reduced to lowercase words made of letters and apostrophes.

  URLs (anything starting with "http" up to the next whitespace) are replaced by a
  space, every run of characters other than A-Z, a-z and the apostrophe is collapsed
  into a single space, and the result is lowercased. Leading/trailing spaces are kept.

  Args:
    text: The raw comment. Missing values (None or NaN) yield an empty string;
      any other non-string value is converted with str() before cleaning.

  Returns:
    The cleaned, lowercased string.
  '''

  # pandas represents missing cells as float NaN; NaN is the only float unequal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)

  # Remove http texts
  text = re.sub(r'http\S+', ' ', text)

  # Remove numbers and special characters
  text = re.sub(r'[^A-Za-z\']+', ' ', text)

  return text.lower()

In [None]:
# Overwrite the raw training comments with their cleaned form (see customCleaning).
dsTrain['comment_text'] = dsTrain['comment_text'].map(customCleaning)

In [None]:
# Preview the training comments after cleaning.
dsTrain.head(20)

### Splitting data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the comment strings; targets are the six label columns, one per category.
X = dsTrain['comment_text'].to_numpy()
y = dsTrain.loc[:, ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].to_numpy()

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=seed,
)

# Report the size of every split.
print(f'X_train: {X_train.shape}')
print(f'X_val: {X_val.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_val: {y_val.shape}')

### Tokenizer
In this step, I turn words into numbers: each word is mapped to its own integer id.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary size: passed as num_words to the Tokenizer and as the Embedding input size.
vocabSize = 10000
# Width of each word-embedding vector (second argument of the Embedding layer).
embeddingDim = 128
# Every sequence is padded or truncated to this many tokens (maxlen of pad_sequences).
maxLength = 120
# Passed as `truncating` to pad_sequences: over-long sequences are cut at the end.
truncType='post'
# Token the Tokenizer substitutes for words outside its vocabulary.
oovTok = "<OOV>"

In [None]:
# Build the word index from the training split only; the same fitted tokenizer is
# reused later to encode the validation and test comments.
tokenizer = Tokenizer(num_words = vocabSize, oov_token=oovTok)
tokenizer.fit_on_texts(X_train)

#### Training

In [None]:
# Encode each training comment as a list of word ids, then force a fixed length.
trainSequences = tokenizer.texts_to_sequences(X_train)
trainPadded = pad_sequences(
  trainSequences,
  maxlen=maxLength,
  truncating=truncType,
)

In [None]:
# Sanity check: the second dimension should equal maxLength.
trainPadded.shape

#### Validation

In [None]:
# Encode the validation comments with the tokenizer fitted on the training split,
# then pad/truncate them to the same fixed length.
valSequences = tokenizer.texts_to_sequences(X_val)
valPadded = pad_sequences(
  valSequences,
  maxlen=maxLength,
  truncating=truncType,
)

In [None]:
# Sanity check: the second dimension should equal maxLength.
valPadded.shape

## Testing data

In [None]:
# Read the zipped test CSV, report its dimensions and preview the first rows.
testPath = os.path.join(pathData, 'test.csv.zip')
dsTest = pd.read_csv(testPath)
print(f'Shape: {dsTest.shape}')
dsTest.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
dsTest.info()

In [None]:
# A comment counts as "without text" when it is missing (NaN), empty, or made only of
# whitespace. The check is vectorised; casting to str first lets .str.strip() work on
# any non-string values, while the separate isna() test catches missing entries.
commentText = dsTest['comment_text']
isBlank = commentText.isna() | commentText.astype(str).str.strip().eq('')

# Index labels of the offending rows, kept as a plain list.
blanks = dsTest.index[isBlank].tolist()
print(f'Number of observations without text: {len(blanks)}')

### Cleaning

In [None]:
# Apply the same cleaning used for the training comments to the test comments.
dsTest['comment_text'] = dsTest['comment_text'].map(customCleaning)

In [None]:
# Preview the test comments after cleaning.
dsTest.head(20)

### Transforming

In [None]:
# Encode the test comments with the tokenizer fitted on the training split,
# then pad/truncate them to the same fixed length.
testComments = dsTest['comment_text'].values
testSequences = tokenizer.texts_to_sequences(testComments)
testPadded = pad_sequences(
  testSequences,
  maxlen=maxLength,
  truncating=truncType,
)

In [None]:
# Sanity check: the second dimension should equal maxLength.
testPadded.shape

# Modeling
In this case, I will use a basic model based on a TensorFlow tutorial.

In [None]:
# Seed TensorFlow's global random generator before any layer is created, so that
# weight initialisation and the shuffling done during training are repeatable.
tf.random.set_seed(seed)

# Embedding -> bidirectional LSTM -> dense hidden layer -> six sigmoid outputs.
# Sigmoid (not softmax) is used so each output is scored independently.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocabSize, embeddingDim),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(6, activation='sigmoid')
])

In [None]:
# Binary cross-entropy on probabilities (the last layer already applies a sigmoid).
#
# Metrics: the 'accuracy' string shortcut is resolved by Keras from the target/output
# shapes, and with a six-unit output it can resolve to categorical (arg-max) accuracy,
# which is not meaningful for independent labels. BinaryAccuracy thresholds each label
# separately (it keeps the 'accuracy' name in the training history), and the
# multi-label AUC tracks how well each label's scores are ranked.
model.compile(
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
  optimizer=tf.keras.optimizers.Adam(1e-4),
  metrics=[
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.AUC(multi_label=True, num_labels=6, name='auc'),
  ],
)

In [None]:
# Train for at most 10 epochs and stop as soon as the validation loss stops improving.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; without it the model would keep the weights of the last (worse) epoch.
history = model.fit(
  trainPadded,
  y_train,
  epochs=10,
  batch_size=32,
  validation_data=(valPadded, y_val),
  callbacks=[
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True),
  ],
)

## Predict

Showing submission file example.

In [None]:
# Load the sample submission to see the expected column layout.
samplePath = os.path.join(pathData, 'sample_submission.csv.zip')
dsSampleSubmission = pd.read_csv(samplePath)
dsSampleSubmission.head()

Predicting

In [None]:
# Score every padded test comment with the trained model and report the output shape.
predicted = model.predict(testPadded)
print(f'Shape: {predicted.shape}')

In [None]:
# Round to six decimals only to keep the CSV compact. Coarser rounding (for example to
# one decimal) would collapse the scores into a handful of tied values and degrade any
# ranking-based evaluation such as ROC AUC.
predicted = np.round(predicted, 6)

In [None]:
# Name the prediction columns after the labels and put the test ids in front of them.
dsPredicted = pd.DataFrame(
  data=predicted,
  columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
)
dsSubmission = pd.concat(objs=[dsTest['id'], dsPredicted], axis='columns')
dsSubmission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
dsSubmission.to_csv('submission.csv', index=False)

# References
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview

https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35

https://www.jeansnyman.com/posts/multi-class-text-classification-with-tensorflow/