In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q tensorflow-text

#!pip install -q tf-models-official

In [None]:
import os
import shutil
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
#import tensorflow_text as text
#from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

import nltk
from nltk.tokenize import  word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.util import ngrams
import re

from sklearn.model_selection import train_test_split

In [None]:
# Notebook-wide settings.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Seed TensorFlow's global random generator so repeated runs start from the
# same random state.
tf.random.set_seed(seed)

# Directory holding the zipped competition CSV files.
PATH='../input/jigsaw-toxic-comment-classification-challenge'

# Shuffle all training rows once. Passing random_state ties the row order to
# `seed`, so the positional train/validation split made later is reproducible.
raw_train_ds = pd.read_csv(PATH+'/train.csv.zip').sample(frac=1, random_state=seed)
raw_test=pd.read_csv(PATH+'/test.csv.zip')
raw_train_ds.head()

In [None]:
# Print the first ten comments of the shuffled training frame, one per line.
first_comments = raw_train_ds['comment_text'][0:10]
for comment in first_comments:
    print(comment)

In [None]:
# English stop words from NLTK; kept as a list under its original name because
# a later cell reads `stopWords` too.
stopWords = stopwords.words('english')
# Set membership is O(1); the list was scanned once per token before.
_train_stop_word_set = frozenset(stopWords)
lemmatizer = WordNetLemmatizer()

# 1) Split every raw comment into tokens.
raw_train_ds['comment_token'] = raw_train_ds['comment_text'].map(word_tokenize)

# 2) Drop tokens that exactly equal a stop word (the comparison is case-sensitive).
# NOTE(review): tokens are only lower-cased later, inside clean_text, so
# capitalised stop words presumably survive this filter - confirm that is intended.
raw_train_ds['comment_token_stop'] = raw_train_ds['comment_token'].apply(
    lambda tokens: [token for token in tokens if token not in _train_stop_word_set])

# 3) Lemmatise the remaining tokens one by one.
raw_train_ds['comment_token_lemm'] = raw_train_ds['comment_token_stop'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [None]:
# Same three preprocessing steps as for the training frame, applied to the test set.
# Set membership is O(1); scanning the `stopWords` list per token was not.
_test_stop_word_set = frozenset(stopWords)

# 1) Split every raw comment into tokens.
raw_test['comment_token'] = raw_test['comment_text'].map(word_tokenize)

# 2) Drop tokens that exactly equal a stop word (case-sensitive comparison).
raw_test['comment_token_stop'] = raw_test['comment_token'].apply(
    lambda tokens: [token for token in tokens if token not in _test_stop_word_set])

# 3) Lemmatise the remaining tokens with the lemmatizer created for the training frame.
raw_test['comment_token_lemm'] = raw_test['comment_token_stop'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [None]:
import string
def clean_text(text):
    """Normalise a piece of text (the notebook calls this on single tokens).

    Steps, in order: lower-case; drop bracketed ``[...]`` spans; drop
    ``http(s)://`` and ``www.`` URLs; drop ``<...>`` tags; strip ASCII
    punctuation; delete newline characters; delete every word that contains
    a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, which may be empty.
    """
    text = text.lower()
    # Raw strings keep regex escapes such as \[ \S \w \d away from Python's own
    # string-escape processing, which warns about unknown escape sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted outright, not replaced by a space.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run clean_text over every lemmatised token of every training comment.

raw_train_ds['comment_text_reg'] = raw_train_ds['comment_token_lemm'].apply(
    lambda tokens: list(map(clean_text, tokens)))

In [None]:
# Run clean_text over every lemmatised token of every test comment.
raw_test['comment_text_reg'] = raw_test['comment_token_lemm'].apply(
    lambda tokens: list(map(clean_text, tokens)))

In [None]:
# Positional split of the shuffled training frame: the first 77 % of the rows
# are used for training, the remainder as a held-out set.
split_at = round(len(raw_train_ds['comment_text_reg'])*.77)
label_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

cleaned_tokens = raw_train_ds['comment_text_reg']
train_x = cleaned_tokens[:split_at].reset_index()
test_x = cleaned_tokens[split_at:].reset_index()

label_frame = raw_train_ds[label_columns]
train_y = label_frame.iloc[:split_at, :].reset_index()
test_y = label_frame.iloc[split_at:, :].reset_index()

In [None]:
# Columns holding the six binary labels.
target_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Each comment is stored as a list of tokens; glue it back into a single
# space-separated string before handing it to tf.data.
train_sentences = train_x['comment_text_reg'].map(" ".join)
test_sentences = test_x['comment_text_reg'].map(" ".join)

train_input = tf.data.Dataset.from_tensor_slices(train_sentences)
train_target = tf.data.Dataset.from_tensor_slices(train_y[target_columns])

test_input = tf.data.Dataset.from_tensor_slices(test_sentences)
test_target = tf.data.Dataset.from_tensor_slices(test_y[target_columns])

In [None]:
# Competition test comments, re-joined from their cleaned tokens into plain strings.
real_test = raw_test['comment_text_reg'].map(" ".join)

In [None]:
# Inspect the space-joined test comments.
real_test

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Pair every comment string with its row of labels.
train_pairs = tf.data.Dataset.zip((train_input, train_target))
test_pairs = tf.data.Dataset.zip((test_input, test_target))

# Only the training stream is shuffled; both are batched and prefetched.
train = train_pairs.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test = test_pairs.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Pull one batch and show its first three texts and label rows.
# `example` and `label` stay bound after the loop; later cells reuse `example`.
for example, label in train.take(1):
    first_texts = example.numpy()[:3]
    first_labels = label.numpy()[:3]
    print('texts: ', first_texts)
    print()
    print('labels: ', first_labels)

In [None]:
VOCAB_SIZE=1000

# Text-to-integer layer with a vocabulary capped at VOCAB_SIZE entries and ngrams=2.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
    ngrams=2,
)

# Fit the vocabulary on the training texts only (labels are dropped first).
training_texts = train.map(lambda comment, _labels: comment)
encoder.adapt(training_texts)

In [None]:
import numpy as np

# Vocabulary learned by the encoder, as an array so it can be indexed by token id.
learned_terms = encoder.get_vocabulary()
vocab = np.array(learned_terms)
vocab

In [None]:
# Encode the batch pulled earlier and keep the first three rows as a NumPy array.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

In [None]:
# Compare each original text with what the vocabulary lookup reconstructs from its ids.
for n in range(3):
    original_text = example[n].numpy()
    round_trip_text = " ".join(vocab[encoded_example[n]])
    print("Original: ", original_text)
    print("Round-trip: ", round_trip_text)
    print()

In [None]:
# Layers are created in the order they are stacked.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
# No activation here, so the six outputs are raw logits.
logit_layer = tf.keras.layers.Dense(6)

# Raw strings go in; the encoder turns them into token ids for the embedding.
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    logit_layer,
])

In [None]:
# Report, layer by layer, whether the layer declares masking support.
masking_flags = [each_layer.supports_masking for each_layer in model.layers]
print(masking_flags)

In [None]:
# Predict on a single sample text without padding.
# The model has not been compiled or fitted at this point in the notebook.

sample_text = 'You fucking arrogent'
single_text_batch = np.array([sample_text])
predictions = model.predict(single_text_batch)
print(predictions)

In [None]:
# Predict on the same sample text, batched with a much longer text so that
# the short one gets padded.

padding = "the " * 2000
padded_batch = np.array([sample_text, padding])
predictions = model.predict(padded_batch)
print(predictions[0])

In [None]:
# The final Dense(6) layer has no activation, so the model emits one logit per
# label and the loss is told so via from_logits=True.
# The metric must follow the same convention: BinaryAccuracy compares each of
# the six outputs to its own label, and threshold=0.0 on a logit equals a 0.5
# cut-off on the sigmoid probability. Naming it 'accuracy' keeps the history
# keys ('accuracy' / 'val_accuracy') unchanged.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train for ten epochs; each epoch is validated on 30 batches of the held-out split.
history = model.fit(
    train,
    validation_data=test,
    validation_steps=30,
    epochs=10,
)

In [None]:
# Show the id column of the competition test set.
raw_test['id']

In [None]:
# `test_labels` was never assigned anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Load it before use.
# NOTE(review): assumes test_labels.csv.zip sits next to test.csv.zip in the
# competition input directory - confirm.
test_labels = pd.read_csv(PATH+'/test_labels.csv.zip')
test_labels['id']

In [None]:
# Load the sample submission file; its columns are filled with predictions below.
sample_submission_path = PATH+'/sample_submission.csv.zip'
sampl = pd.read_csv(sample_submission_path)
sampl

In [None]:
# Every column name after the first one; these columns receive the predictions.
sampl.columns[1:]

In [None]:
# First ten joined test comments that will be fed to the model.
real_test[0:10]

In [None]:
# The network emits logits; the sigmoid maps them to values between 0 and 1.
test_logits = model.predict(real_test)
pred = tf.nn.sigmoid(test_logits)

In [None]:
# Inspect the sigmoid-activated predictions.
pred

In [None]:
# Copy the predictions into every column after the first one.
# np.asarray hands pandas a plain NumPy array. The previous tf.squeeze call did
# nothing for an (n, 6) result and would have dropped the row axis when n == 1.
prediction_columns = sampl.columns[1:]
prediction_values = np.asarray(pred)
assert prediction_values.shape == (len(sampl), len(prediction_columns)), (
    "prediction matrix does not match the submission frame")
sampl[prediction_columns] = prediction_values

In [None]:
# Count missing values per column of the submission frame.
sampl.isna().sum()

In [None]:
# Write the submission frame to the working directory, leaving out the index column.
sampl.to_csv('./sample_submission.csv', index=False)