0. Install Dependencies and Bring Data

In [2]:
!pip install tensorflow pandas matplotlib sklearn



In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [4]:
df = pd.read_csv(
        os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv')
        )

In [21]:
df.iloc[2]['comment_text']

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [22]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [26]:
df[df.columns[2:]].iloc[10]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 10, dtype: int64

In [31]:
df[df['toxic'] == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


1. Preprocess

In [33]:
from tensorflow.keras.layers import TextVectorization as tv

In [39]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [35]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [37]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [38]:
df[df.columns[2:]].values

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [40]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [41]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [44]:
MAX_WORDS = 200000 # number of words in the vocab
MAX_COMMENT_LENGTH = 1800 # number of chars in comment

In [45]:
vectorizer = tv(max_tokens = MAX_WORDS,
                output_sequence_length = MAX_COMMENT_LENGTH,
                output_mode = 'int'
               )

In [48]:
type(X)

pandas.core.series.Series

In [51]:
type(X.values) # we want numpy array

numpy.ndarray

In [49]:
vectorizer.adapt(X.values)

In [53]:
# vectorizer.get_vocabulary()

In [54]:
vectorizer('Hello world, life is beautiful')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288, 263, 306, ...,   0,   0,   0], dtype=int64)>

In [55]:
vectorizer('Hello world, life is beautiful')[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 288,  263,  306,    9, 2977], dtype=int64)>

In [56]:
vectorized_text = vectorizer(X.values)

In [57]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [58]:
len(X)

159571

In [59]:
# MCSHBAP - map, cache, shuffle, batch, prefetch from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [61]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [62]:
batch_X.shape

(16, 1800)

In [63]:
batch_y.shape

(16, 6)

In [64]:
train = dataset.take(int(len(dataset)*.7)) # no. of batches
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [68]:
train_generator = train.as_numpy_iterator()

In [71]:
train_generator.next()

(array([[   6,  361, 6002, ...,    0,    0,    0],
        [  79,   23,    2, ...,    0,    0,    0],
        [1721, 5158,  660, ...,    0,    0,    0],
        ...,
        [  23,    7,   55, ...,    0,    0,    0],
        [  39,  140, 6067, ...,    0,    0,    0],
        [   6,  526,   32, ...,    0,    0,    0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

2. Create Sequential Model

In [72]:
from tensorflow.keras.models import Sequential as sq
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense

In [73]:
model = sq()
# Create the embedding layer
model.add(Embedding(MAX_WORDS+1, 32))
# 
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(6, activation = 'sigmoid'))

NameError: name 'Embedding' is not defined