In [19]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Dropout, LSTM, SimpleRNN, GRU
from tensorflow.keras.optimizers import RMSprop
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
import os

# Location of the training CSV. Set the TOXIC_TRAIN_CSV environment variable to point
# at your own copy of the file; when it is unset, the original local path is used so
# existing behaviour is unchanged.
train_csv = os.environ.get('TOXIC_TRAIN_CSV', 'C:/Users/Kowshik Rayani/Downloads/train.csv/train.csv')
train_df = pd.read_csv(train_csv)


#### Training Data Preparation
* read the labels and convert them into binary (two-class) labels
* we will focus on a 2-class problem: toxic and non-toxic comments
* we will merge all the different types of toxic comments into a single toxic label:
    * 0 for toxic comments
    * 1 for non-toxic comments
* later we can explore how to turn this into a multiclass classifier

In [20]:
# Each toxic class column is labelled 1 when the comment belongs to that class.
# Exclude a previously derived 'clean' column before summing: this cell appends
# 'clean' to train_df, so on a re-run iloc[:, 2:] would otherwise include it and
# every comment would end up labelled as not clean.
toxicity_columns = train_df.drop(columns=['clean'], errors='ignore').iloc[:, 2:]
toxic_row_sums = toxicity_columns.sum(axis=1)
# If the sum over the toxic classes is 0 the comment is clean (True), otherwise toxic (False).
train_df['clean'] = (toxic_row_sums == 0)
# Input data: the raw comment text.
train_texts = train_df['comment_text']
# Output label: True for clean comments, False for toxic ones.
train_labels = train_df['clean']

### Pre-processing : Tokenization
Now we have the training data in two separate dataframe columns (arrays/lists): an ordered array of comments (the input to the network) and another array of class labels in the same order (the output of the network).

We have to transform this data into network input format and output format. This step is called pre-processing.  
Steps of pre-processing:

1. Tokenize the text into words
2. Assign each word a dimension


To accomplish steps 1 and 2 we will use the built-in __Tokenizer__ class

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Upper bound on the vocabulary used when encoding texts (passed as Tokenizer's num_words).
max_vocab_size = 10000

# Learn the word -> integer index mapping from all of the comments.
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(train_texts)

# Encode every comment as a list of integer word indices and show the first one.
sequences = tokenizer.texts_to_sequences(train_texts)
first_sequence = sequences[0]
print(first_sequence)

# word_index covers every token seen while fitting, so it can be far larger than max_vocab_size.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

[688, 75, 1, 126, 130, 177, 29, 672, 4511, 1116, 86, 331, 51, 2278, 50, 6864, 15, 60, 2756, 148, 7, 2937, 34, 117, 1221, 2825, 4, 45, 59, 244, 1, 365, 31, 1, 38, 27, 143, 73, 3462, 89, 3085, 4583, 2273, 985]
Found 210337 unique tokens.


### Batching and Padding for Embedding
Now that we have the tokens and each token (word) has a dimension assigned to it, we will take the following steps to create word embeddings:  

3. use this dimension assignments to define embedding for individual word
4. use word embedding to create word vector for a comment


We will use a specific type of network layer for this, called the __Embedding Layer__. The token sequences generated above (sequences of integers) are the input to the Embedding layer, which outputs word embeddings to the next layer.  

The input and output of a neural network are processed in batches. A batch is a group of input samples that are fed to the network together. Because the network can process the individual samples in parallel, training is faster.

In the case of the Embedding layer, the input and output of a batch look as follows:  

   **Input**: 2D tensor of integers, of shape (# sequence samples in the batch, sequence_length), where each entry is a sequence of integers (the output of the code above).  
   **Output**: 3D floating-point tensor of shape (# sequence samples in the batch, sequence_length, embedding_dimensionality).  

The sequence length can vary from batch to batch, but within a single batch all sequences have the same length.  

So from the data we have to create batches of sequences of the same length; to do that we pad or truncate each sequence to a common sequence length. Each batch can then be used as a training input for the embedding layer.  

For this sample case we take the first 10k of the 160k sequences for training and the next 1k for testing, with a maximum sequence length of 20 words.


In [22]:
from tensorflow.keras import preprocessing

# Every sequence is padded or truncated to this many tokens.
seq_max_len = 20

# Training split: the first 10,000 encoded comments and their labels.
training_sequences = sequences[:10000]
training_labels = train_labels[:10000]
train_seq_pad = preprocessing.sequence.pad_sequences(training_sequences, maxlen=seq_max_len)

# Testing split: the following 1,000 encoded comments and their labels.
testing_sequences = sequences[10000:11000]
testing_labels = train_labels[10000:11000]
test_seq_pad = preprocessing.sequence.pad_sequences(testing_sequences, maxlen=seq_max_len)



### Model 3

In [23]:

import keras
from keras import layers

# Size of the vector learned for each word index.
embedding_dim = 16

# Embedding -> LSTM -> Dropout -> single sigmoid unit: a binary classifier over the
# padded token sequences (output near 1 = clean, near 0 = toxic, matching the labels).
model_3 = Sequential()
# The embedding's input size has to agree with the tokenizer's vocabulary cap, so reuse
# max_vocab_size rather than repeating the literal 10000 and risking the two drifting apart.
model_3.add(Embedding(max_vocab_size, embedding_dim, input_length=seq_max_len))
model_3.add(LSTM(32))
# Drop half of the LSTM outputs during training to reduce over-fitting.
model_3.add(Dropout(0.5))
model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [24]:
# Print the layer-by-layer output shapes and parameter counts of model_3.
model_3.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 16)            160000    
                                                                 
 lstm_1 (LSTM)               (None, 32)                6272      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 166305 (649.63 KB)
Trainable params: 166305 (649.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Train model 3

In [25]:
import numpy as np

# Convert the pandas label Series to a plain array before handing it to Keras.
train_targets = np.asarray(training_labels)

# Train for 10 epochs in mini-batches of 16, holding out 20% of the training data for validation.
history_3 = model_3.fit(
    train_seq_pad,
    train_targets,
    epochs=10,
    batch_size=16,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Testing model 3

In [32]:
# Show the metric names so the values in evaluation_results can be matched to them by position.
print(model_3.metrics_names)

# Score the model on the padded testing split.
test_targets = np.asarray(testing_labels)
evaluation_results = model_3.evaluate(x=test_seq_pad, y=test_targets)

print(f"Test Loss: {evaluation_results[0]}")
print(f"Test Accuracy: {evaluation_results[1]*100} %")

['loss', 'accuracy']
Test Loss: 0.4467005431652069
Test Accuracy: 93.19999814033508 %


In [None]:
# Persist the trained model to 'toxic.h5' (relative path, so it lands in the working directory).
# NOTE(review): the .h5 extension presumably selects the legacy HDF5 format, which the
# save call appears to warn about — consider the native Keras format; confirm before changing,
# since the load cell reads the same file name.
model_3.save('toxic.h5')

  saving_api.save_model(


In [None]:
# Load the model back from 'toxic.h5'; model1 is the object used for the predictions below.
from tensorflow.keras.models import load_model
model1 = load_model('toxic.h5')


In [None]:
# A hand-written comment to classify. Another example to try:
#   "D'aww! He matches this background colour I'm seemingly stuck with. Thanks."
test_texts = ["kohli is bad boy "]

# Encode and pad it with the same tokenizer and sequence length as the training data.
# NOTE: this rebinds test_seq_pad, which the padding cell set to the 1,000-row testing
# split; re-run the padding cell before re-running the evaluation cell.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_seq_pad = preprocessing.sequence.pad_sequences(test_sequences, maxlen=seq_max_len)

print(test_texts)

['kohli is bad boy ']


In [None]:
# Score the padded sequences one sample per batch; the model ends in a sigmoid unit,
# so each prediction is a value between 0 and 1 for the 'clean' label.
predictions = model1.predict(test_seq_pad, batch_size=1)




In [None]:
# Scores strictly above the cut-off become 1 (non-toxic); everything else becomes 0 (toxic).
threshold = 0.5
above_threshold = predictions > threshold
binary_predictions = above_threshold.astype(int)

In [None]:
# Show the thresholded 0/1 predictions.
print(binary_predictions)

[[1]]


In [None]:
# Tally each class: label 1 means non-toxic, label 0 means toxic.
total_comments = len(binary_predictions)
num_non_toxic = (binary_predictions == 1).sum()
num_toxic = (binary_predictions == 0).sum()

# Express the counts as percentages of all scored comments.
non_toxic_percentage = (num_non_toxic / total_comments) * 100
toxic_percentage = (num_toxic / total_comments) * 100

print(f"Non-Toxic Comments: {non_toxic_percentage}%")
print(f"Toxic Comments: {toxic_percentage}%")


Non-Toxic Comments: 100.0%
Toxic Comments: 0.0%
