# Jigsaw Multilingual Toxic Comment Classification
#### Members: 邱斯、陳則明、楊淳安、黃亦晨

In [None]:
%matplotlib inline

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import GRU,SimpleRNN,LSTM
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

import sys
import os
import numpy as np
import pandas as pd
import IPython
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Load the `*-processed-seqlen128` variants of the validation, training and
# test sets, plus the sample submission file.
valid = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv"
)
train = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv"
)
test = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv"
)
submit = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv"
)

# Restrict the training frame to the listed columns and to its first 20,000 rows.
train = train.loc[
    :, ['id', 'comment_text', 'input_word_ids', 'input_mask', 'all_segment_id', 'toxic']
].head(20000)

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Class counts per split, ordered by LABEL: position 0 is the non-toxic count
# and position 1 the toxic count.  `value_counts()` orders its result by
# frequency, so indexing the raw result positionally is only right while
# non-toxic happens to be the majority class.  Reindexing on [0, 1] pins the
# order and yields an explicit 0 (instead of an IndexError) for a missing class.
train_distribution = train["toxic"].value_counts().reindex([0, 1], fill_value=0).values
valid_distribution = valid["toxic"].value_counts().reindex([0, 1], fill_value=0).values

# Percentage share of each class, listed as [train, valid].
non_toxic = [
    train_distribution[0] / sum(train_distribution) * 100,
    valid_distribution[0] / sum(valid_distribution) * 100,
]
toxic = [
    train_distribution[1] / sum(train_distribution) * 100,
    valid_distribution[1] / sum(valid_distribution) * 100,
]

plt.figure(figsize=(9, 6))
# Two bars per split drawn side by side: non-toxic at x, toxic at x + 0.4.
plt.bar([0, 1], non_toxic, alpha=.4, color="r", width=0.35, label="non-toxic")
plt.bar([0.4, 1.4], toxic, alpha=.4, width=0.35, label="toxic")
plt.xlabel("Dataset")
plt.ylabel("Percentage")
# Centre each tick between the two bars of its split.
plt.xticks([0.2, 1.2], ["train", "valid"])
plt.legend(loc="upper right")

plt.show()

In [None]:
# Report the non-toxic / toxic percentage of each split, using the class
# counts computed for the bar chart (position 0 and position 1 respectively).
for split_name, split_counts in (("train", train_distribution), ("valid", valid_distribution)):
    split_total = sum(split_counts)
    non_toxic_pct = split_counts[0] / split_total * 100
    toxic_pct = split_counts[1] / split_total * 100
    print(f"{split_name}: \nnon-toxic rate: {non_toxic_pct: .2f} %\ntoxic rate: {toxic_pct: .2f} %")

In [None]:
# Number of validation comments per language (ordered by frequency).
lang = valid["lang"].value_counts()

# One bar per language, spaced 0.4 apart starting at x = 0.2.
bar_positions = [0.2 + 0.4 * k for k in range(len(lang))]

plt.figure(figsize=(9, 6))
plt.xlabel("Lang")
plt.ylabel("Num")
# Take the tick labels from `lang.index` so each label always matches the bar
# drawn above it; a hard-coded label list silently mislabels bars whenever the
# frequency order (or the set of languages) differs from what was assumed.
plt.xticks(bar_positions, list(lang.index))
plt.bar(bar_positions, lang, color="purple", width=0.28, alpha=.4)
plt.show()

# TPU

In [None]:
# Detect hardware, return appropriate distribution strategy
# After this cell `strategy` is always defined; `tpu` is the cluster resolver
# when one was found and None otherwise.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow 2.x releases expose this as
    # tf.distribute.TPUStrategy and deprecate the experimental alias — confirm
    # against the installed TensorFlow version before changing it.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas; later cells scale the training batch size by it.
print("REPLICAS: ", strategy.num_replicas_in_sync)

# 讀入資料
1. 從競賽中取出未經預處理的訓練資料 `jigsaw-toxic-comment-train.csv`
2. 取出驗證資料 `validation.csv` 和測試資料 `test.csv`
3. 丟棄不需要的欄位，保留 `[id, comment_text, toxic]`，comment_text 代表 Wikipedia 討論頁 (talk page) 的留言，toxic 是 1 表示惡意，0 表示安全

In [None]:
# Load the raw training, validation and test CSVs of the competition.
# Note: `train` and `test` are rebound here, replacing the frames loaded earlier.
train = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
)
validation = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv'
)
test = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv'
)

# Only the `toxic` column is used as the label; remove the other label columns.
train = train.drop(columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

1. 從原始 223549 筆訓練資料中讀取出 25000 筆做為我們的訓練資料

從中我們找到最大長度的句子是 - 1403

In [None]:
# `.loc` slices by label and includes the end label, so with the default
# integer index this keeps rows 0..24999, i.e. the first 25,000 comments.
train = train.loc[:24999]

# Longest comment measured in whitespace-separated tokens.
longest_comment = train['comment_text'].map(lambda comment: len(str(comment).split())).max()
print(f"訓練資料總數：{len(train)}\n句子最大長度：{longest_comment}")

定義本次競賽中作為評估的函式 **roc_auc**，以下為所需的兩個輸入：
- model 預測出來的 predictions
- 資料中的標準答案

In [None]:
def roc_auc(predictions, target):
    """Return the area under the ROC curve of ``predictions`` against ``target``.

    Args:
        predictions: Predicted scores, one per sample.
        target: Ground-truth labels aligned with ``predictions``.

    Returns:
        The area computed by ``metrics.auc`` over the curve returned by
        ``metrics.roc_curve``.
    """
    # The thresholds returned by roc_curve are not needed for the area.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

將 25000 筆資料分割為 `train` 跟 `valid` (0.2)

$ 25000\times0.2=5000 $ (Validation data)

$ 25000-5000=20000 $ (Training data)

之後取出測試資料的留言給 `xtest`

In [None]:
# Split the comments and their `toxic` labels into training and validation
# parts: 20% is held out, the split is stratified on the label so both parts
# keep the same class balance, and the seed is fixed for reproducibility.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.comment_text.values,
    train.toxic.values,
    test_size=0.2,
    shuffle=True,
    stratify=train.toxic.values,
    random_state=42,
)

# Tokenizer
1. 將每一筆資料的評論部分做斷詞，得到 token sequences 
2. 將每筆資料調整成同一大小 (128字) --> zero padding
3. 最後 3 個資料集 (train, valid, test) 的長度分別是 (20000, 5000, 63812)，每個句子因為 padding，所以都是 128
4. word_index 紀錄著每一個字對應的 id

In [None]:
# Every sequence is padded or truncated to this many token ids.
max_len = 128

# num_words=None places no cap on the vocabulary size.  The vocabulary is
# fitted on the training and validation comments together.
token = text.Tokenizer(num_words=None)
token.fit_on_texts([*xtrain, *xvalid])

# Comment text -> list of integer word ids.
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# Lists of ids -> fixed-width arrays with max_len columns.
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# Mapping from word to its integer id, used to build the embedding matrix.
word_index = token.word_index
print(xtrain_pad.shape, xvalid_pad.shape)


# Word Embedding
1. 載入 GloVe 的詞向量，將之整理成一個 dict `embeddings_index`
2. 每一行的第一個字是一個單字，剩下是他對應的向量
3. 我們最後可以看到整個 GloVe 有 2196017 個單字存在

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.  Each line holds
# a word followed by its vector components, separated by single spaces.
embeddings_index = {}
# The context manager closes the file even if a malformed line makes the
# float conversion below raise part-way through.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on a literal space only (not on arbitrary whitespace), exactly
        # as the file is delimited; float() ignores the trailing newline.
        values = line.split(' ')
        word = values[0]
        # Store as float32: half the memory of the default float64.
        # NOTE(review): assumes the Embedding layer uses Keras' default float32
        # weights, so nothing is lost downstream — confirm if floatx was changed.
        coefs = np.asarray([float(val) for val in values[1:]], dtype=np.float32)
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Row i of the matrix holds the pretrained vector of the word whose id is i.
# The matrix has len(word_index) + 1 rows so that the largest id is a valid
# row index (presumably id 0 is never assigned to a word — verify against the
# tokenizer).
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

# LSTM
1. 將第一層設定為 300 維的 Embedding layer
2. 三層 LSTM 層，各 300 個神經元
3. 設定 drop out，避免 overfitting
4. 將輸出使用 sigmoid 拉到 0~1 之間
5. Compiling，因為是二分類問題所以使用 binary_crossentropy 當 loss function，Adam 當我們的 optimizer

In [None]:
%%time
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = Sequential([
        # Embedding initialised from `embedding_matrix` and frozen
        # (trainable=False).
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        # Three stacked LSTM layers; the first two return the full sequence so
        # the next recurrent layer receives one vector per time step.
        LSTM(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
        LSTM(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
        LSTM(300, dropout=0.3, recurrent_dropout=0.3),
        # Single sigmoid unit: a score between 0 and 1.
        Dense(1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Train

In [None]:
%%time

# Train the LSTM model for 5 epochs on the padded training sequences; the
# batch size is 48 multiplied by the number of replicas in the strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=48*strategy.num_replicas_in_sync)

# LSTM模型在 valid 的表現 

In [None]:
# Score the validation split with the trained LSTM model.
scores = model.predict(xvalid_pad)
# The area under the ROC curve is a fraction between 0 and 1; scale it by 100
# so the printed number agrees with the trailing "%" sign (previously a value
# such as 0.95 was printed as "0.95%").
print("Auc: %.2f%%" % (roc_auc(scores, yvalid) * 100))

# GRU
1. 將第一層設定為 300 維的 Embedding layer
2. 三層 GRU 層，各 300 個神經元
3. 設定 drop out，避免 overfitting
4. 將輸出使用 sigmoid 拉到 0~1 之間
5. Compiling，因為是二分類問題所以使用 binary_crossentropy 當 loss function，Adam 當我們的 optimizer

In [None]:
%%time
# Build and compile the GRU model inside the distribution strategy's scope.
# This rebinds `model`, replacing the previously built network.
with strategy.scope():
    # Three stacked GRU layers on frozen pretrained embeddings, followed by a
    # single sigmoid output unit.
    model = Sequential([
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        SpatialDropout1D(0.3),
        # Each recurrent layer is followed by dropout; only the last GRU
        # collapses the sequence to a single vector.
        GRU(300, return_sequences=True),
        Dropout(0.3),
        GRU(300, return_sequences=True),
        Dropout(0.3),
        GRU(300, return_sequences=False),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Train

In [None]:
# Train the GRU model for 5 epochs; the batch size is 64 multiplied by the
# number of replicas in the strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

# GRU模型在 valid 的表現

In [None]:
# AUC of the trained GRU model on the validation split.
scores = model.predict(xvalid_pad)
# The area under the ROC curve is a fraction between 0 and 1; scale it by 100
# so the printed number agrees with the trailing "%" sign (previously a value
# such as 0.95 was printed as "0.95%").
print("Auc: %.2f%%" % (roc_auc(scores, yvalid) * 100))