In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import nltk
import time

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jub-train-processed/processed_train.csv
/kaggle/input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl
/kaggle/input/jub-text-processed/processed_test.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_public_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test_private_expanded.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/toxicity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/identity_individual_annotations.csv
/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv


In [2]:
# Fixed sequence length (in tokens): used as `maxlen` when padding/truncating
# the encoded texts and as the Embedding layer's `input_length`.
MAX_LEN = 100

In [3]:
# Load the "processed" train and test CSV files from the attached Kaggle
# datasets, decoding them as UTF-8.
train = pd.read_csv('../input/jub-train-processed/processed_train.csv', encoding='utf8')
test = pd.read_csv('../input/jub-text-processed/processed_test.csv', encoding='utf8')

In [4]:
# Model inputs (text) and targets (label) taken from the training frame.
# Missing text is replaced with '' before the string cast: `astype(str)` alone
# would turn NaN into the literal word "nan", which the tokenizer would then
# treat as a real token.
train_X = train.text.fillna('').astype(str)
train_y = train.label

In [5]:
# Test inputs as strings. Missing text is replaced with '' first, because
# `astype(str)` alone would turn NaN into the literal word "nan".
X_test = test.text.fillna('').astype(str)

In [6]:
# Fit a single vocabulary over the test texts followed by the training texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_test.tolist() + train_X.tolist())

In [7]:
# One row per word index plus one extra: Keras' Tokenizer numbers words from 1
# and leaves index 0 unassigned.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

590148

In [8]:
# Replace each text with its list of word indices, using the shared tokenizer
# for both splits. Note that `train_X` and `X_test` are rebound here.
train_X, X_test = [
    tokenizer.texts_to_sequences(texts) for texts in (train_X, X_test)
]

In [9]:
# Bring every sequence to exactly MAX_LEN entries: longer ones are cut at the
# end, shorter ones are padded at the end.
_pad_opts = dict(maxlen=MAX_LEN, padding='post', truncating='post')
train_X = pad_sequences(train_X, **_pad_opts)
X_test = pad_sequences(X_test, **_pad_opts)

In [10]:
# Load the pickled GloVe (840B, 300-d) lookup; it is queried below with
# `.get(word)` to fetch the vector for a word.
# NOTE(review): unpickling can execute arbitrary code, so this should only be
# done with pickle files from a trusted source.
embedding_dict = pd.read_pickle('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl')

In [11]:
# Pretrained embedding table: row i holds the GloVe vector of the word whose
# tokenizer index is i; words absent from `embedding_dict` keep an all-zero row.
# Allocated as float32 rather than NumPy's default float64: Keras' default
# float type is float32, so the wider type only doubles the memory of this
# (vocab_size x 300) table.
word_vector_matrix = np.zeros((vocab_size, 300), dtype=np.float32)

for word, index in tokenizer.word_index.items():
    vector = embedding_dict.get(word)
    if vector is not None:
        word_vector_matrix[index] = vector

In [12]:
import gc

# Free the large objects that are no longer needed before training: the raw
# training frame (its text/label columns were extracted above) and the GloVe
# lookup (its vectors were already copied into `word_vector_matrix`).
# Re-running the earlier cells that read them requires reloading both first.
del train, embedding_dict
gc.collect()

97

In [13]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=41,
)

In [14]:
# Training hyperparameters used by the model cell below.
# Batch size passed to model.fit.
batch_size = 512
# Number of training epochs passed to model.fit.
epochs = 5
# Embedding output dimension; must match the 300 columns of word_vector_matrix.
vec_size = 300

In [15]:
%%time

with tf.device('/device:GPU:0'):

    model = Sequential()
    model.add(Embedding(
        vocab_size, 
        vec_size, 
        input_length=MAX_LEN, 
        weights=[word_vector_matrix], 
        trainable=False
    ))

    model.add(Bidirectional(LSTM(units=128, return_sequences = True)))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(units = 1, activation = 'sigmoid'))
    model.compile(optimizer=Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 12min 44s, sys: 32 s, total: 13min 16s
Wall time: 15min 20s


In [16]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          177044400 
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          439296    
_________________________________________________________________
global_average_pooling1d (Gl (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 177,483,953
Trainable params: 439,553
Non-trainable params: 177,044,400
_________________________________________________________________


In [17]:
# preds = model.predict(X_test, batch_size=2048)

In [18]:
# preds.dtype

In [19]:
# Use the competition's sample submission as a template, indexed by its `id` column.
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv', index_col='id')

In [20]:
# The model ends in a single-unit Dense layer, so `predict` returns an
# (n_samples, 1) array; flatten it so a plain 1-D vector is stored in the
# column instead of relying on pandas accepting a 2-D value.
submission['prediction'] = model.predict(X_test, batch_size=2048).ravel()

In [21]:
# Preview the first rows to sanity-check the predictions.
submission.head()

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
7097320,0.005821
7097321,0.019087
7097322,0.040998
7097323,0.002884
7097324,0.001476


In [22]:
# Write the submission file to the current working directory; the `id` index
# is saved as the first column.
submission.to_csv('submission.csv')