<a href="https://colab.research.google.com/github/tohyongyao/AI-Project/blob/master/TYY_TF2_Toxic_Words_Text_Classification_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

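Toxic words text classifier: a multi-label Keras model that scores each comment against six toxicity labels (toxic, severe_toxic, obscene, threat, insult, identity_hate).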

In [None]:
# Colab line magic requesting the TensorFlow 2.x runtime.
# NOTE(review): this magic is specific to Google Colab — confirm the target runtime still accepts it.
%tensorflow_version 2.x

In [None]:
!wget -qq https://www.dropbox.com/s/fz2d61pwgigtra7/toxic_words.zip
!unzip toxic_words.zip

!ls

Archive:  toxic_words.zip
  inflating: test_labels.csv         
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
sample_data	       test.csv		toxic_words.zip
sample_submission.csv  test_labels.csv	train.csv


In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Use the "ggplot" style sheet for every figure drawn after this point.
plt.style.use("ggplot")

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Input, Dropout

In [None]:
# Read the train and test splits from the working directory. Every missing
# cell (NaN) in either frame is replaced with the literal string "blank".
train_df, test_df = (
    pd.read_csv(csv_path).fillna("blank")
    for csv_path in ("./train.csv", "./test.csv")
)

In [None]:
# Preprocessing / model hyperparameters used by the cells below.

# Text preprocessing
MAX_NUM_WORDS = 20000        # Tokenizer(num_words=...) and Embedding input size
MAX_SEQUENCE_LENGTH = 100    # maxlen given to pad_sequences and the model Input

# Model / training
EMBEDDING_DIM = 30           # output size of the Embedding layer
BATCH_SIZE = 64              # batch_size passed to model.fit

In [None]:
# Raw comment text for each split, as NumPy arrays.
x_train = train_df["comment_text"].to_numpy()
x_test = test_df["comment_text"].to_numpy()

# The six label columns, stacked into a 2-D array (one row per training comment).
y_train = train_df[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].to_numpy()

Tokenize the comments into sequences of integer word indices

In [None]:
# Take the raw text straight from the DataFrames instead of from x_train / x_test.
# This cell rebinds x_train / x_test to integer sequences, so using them as input
# would make a second run of the cell re-fit the tokenizer on already-tokenized
# data. Reading from the frames keeps the cell safe to re-run.
train_texts = train_df["comment_text"].values
test_texts = test_df["comment_text"].values

# Vocabulary size is capped through num_words=MAX_NUM_WORDS.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

# NOTE(review): the vocabulary is fitted on train AND test comments (unchanged
# from the original cell). Fit on train_texts only if the test text must stay unseen.
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

# Each comment becomes a list of integer word indices.
x_train = tokenizer.texts_to_sequences(train_texts)
x_test = tokenizer.texts_to_sequences(test_texts)


Pad the sequences to a fixed length

In [None]:
# Bring every tokenized comment to exactly MAX_SEQUENCE_LENGTH entries, padding
# at the end ("post"). Both splits go through the identical call.
# NOTE(review): `truncating` is not set, so the library default decides which end
# of an over-long comment is cut — confirm that is the intended side.
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    for token_ids in (x_train, x_test)
)

In [None]:
# Sanity check: padded training matrix shape, then the number of test rows.
print(x_train.shape, len(x_test), sep="\n")

(159571, 100)
153164


Embedding, pooling and output layers

In [None]:
# Input: one row of MAX_SEQUENCE_LENGTH integer word indices per comment.
Inp = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Map each word index to a trainable EMBEDDING_DIM-sized vector.
embedding = Embedding(
    input_dim=MAX_NUM_WORDS,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    embeddings_initializer="uniform",
)(Inp)

# Collapse the sequence axis by taking the maximum of each embedding
# dimension, then apply dropout with rate 0.2.
x = GlobalMaxPooling1D()(embedding)
x = Dropout(rate=0.2)(x)

# Multi-label head: six sigmoid units, one per label column, so each label
# gets its own independent score (no softmax across labels).
out = Dense(units=6, activation="sigmoid")(x)

Build and compile the model

In [None]:
# Wrap the layer graph (Inp -> out) in a trainable Keras model.
model = Model(inputs=Inp, outputs=out)

# The head is six sigmoid units trained with binary cross-entropy, i.e. six
# independent yes/no decisions per comment. Keras resolves the bare string
# "accuracy" from the tensor shapes; for a 6-wide output that gives categorical
# (argmax-based) accuracy, which is not meaningful for multi-label targets.
# BinaryAccuracy is requested explicitly instead (per-label, default threshold).
# It is named "accuracy" so the training-history keys stay "accuracy" /
# "val_accuracy".
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(0.01),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
)

# Print the layer table and parameter counts.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 30)           600000    
_________________________________________________________________
global_max_pooling1d (Global (None, 30)                0         
_________________________________________________________________
dropout (Dropout)            (None, 30)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 186       
Total params: 600,186
Trainable params: 600,186
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Confirm that features and labels have the same number of rows before training.
print(x_train.shape, y_train.shape, sep="\n")

(159571, 100)
(159571, 6)


In [None]:
# Train for 5 epochs in batches of BATCH_SIZE, holding out 10% of the training
# data for validation. The returned History object is kept in `hist`.
hist = model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
