In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Read the competition's train and test CSV files into DataFrames
train, test = (
    pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"),
    pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"),
)

In [None]:
# Preview the first five training rows
train.head(n = 5)

In [None]:
# Preview the first five test rows
test.head(n = 5)

In [None]:
# Column names, dtypes, non-null counts and memory usage of the training frame
train.info()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the test frame
test.info()

## Target Variable Exploration

In [None]:
import matplotlib.pyplot as plt
# Histogram of the raw `target` column of the training data (default binning)
target_values = train['target']
plt.hist(target_values)

## Text Cleaning and Preprocessing

In [None]:
!pip install BeautifulSoup4

## Sampling from the data

In [None]:
# Draw a reproducible (seed 42) random sample of 10% of the training rows,
# then summarise the sampled frame
train_sample = train.sample(axis = 'index', random_state = 42, frac = 0.1)
train_sample.info()

## Text Cleaning

In [None]:
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
import unicodedata
import html

# Tokens dropped by clean_text: NLTK's English stopwords plus every
# character in string.punctuation
stop = set(stopwords.words('english')) | set(string.punctuation)

def clean_text(text):
    """Turn one raw comment into a lowercase, lemmatised, space-separated token string.

    Pipeline, in order:
      1. lowercase, repair entity/escape residue, then decode HTML entities;
      2. drop non-ASCII characters (after NFKD decomposition);
      3. delete ``[...]`` spans and ``http...`` URLs;
      4. replace each run of digits with the token ``num``;
      5. collapse every whitespace run (newlines included) to one space and
         delete every remaining character that is not an ASCII letter or space;
      6. drop tokens found in the module-level ``stop`` set;
      7. lemmatise each token with the lemmatizer's default part of speech,
         then again as a verb.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (e.g. a NaN cell) yields ``""``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; may be empty.
    """
    # Non-string cells have no text to clean; returning "" keeps .apply() from crashing.
    if not isinstance(text, str):
        return ""

    # Ordered (broken, repaired) pairs; the order matters because later
    # patterns (e.g. the lone backslash) overlap with earlier ones.
    fixups = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    text = text.lower()
    for broken, repaired in fixups:
        text = text.replace(broken, repaired)
    text = re.sub(r'  +', ' ', html.unescape(text))

    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove anything between square brackets, then URLs
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'http\S+', '', text)

    # Replace numbers with "num". This has to happen BEFORE the non-letter
    # strip below; afterwards there are no digits left to replace.
    text = re.sub(r'\d+', ' num ', text)

    # Turn newlines/tabs into spaces first so that stripping non-letters does
    # not glue the last word of one line onto the first word of the next.
    text = re.sub(r'\s+', ' ', text)

    # remove all non-alphabetic characters (this also removes '@' and '#')
    text = re.sub(r'[^a-zA-Z ]', '', text)

    # remove stopwords; lowercase again because entity decoding can emit capitals
    kept = [word for word in text.lower().split() if word not in stop]

    # lemmatize each word: default part of speech first, then as a verb
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos = 'v') for word in kept]

    return " ".join(lemmas)

In [None]:
# Add a cleaned copy of `comment_text` to both the sampled training frame and the test frame
for frame in (train_sample, test):
    frame['clean_comment_text'] = frame['comment_text'].apply(clean_text)

In [None]:
# Spot-check the first five cleaned comments
train_sample['clean_comment_text'].head(n = 5)

## Sequence Length Analysis

In [None]:
# Histogram of the number of whitespace-separated tokens per cleaned comment
token_counts = train_sample['clean_comment_text'].str.split().map(len)
plt.hist(list(token_counts))

In [None]:
# Median number of whitespace-separated tokens per cleaned comment
tokens_per_comment = train_sample['clean_comment_text'].str.split().map(len)
np.median(tokens_per_comment.to_numpy())

# Text Representation

## Loading the pretrained GloVe embeddings

In [None]:
# Load the pickled GloVe 840B/300d embeddings; later cells look words up with .get(word).
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted dataset.
embedding_dict = pd.read_pickle('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl')

## Text Tokenization as Sequences

In [None]:
# Sequences creation, truncation and padding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Setting up the tokenizer: keep the `vocab_size` most frequent words, map the rest to 'UNK'
vocab_size = 10000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')

# Fit on the CLEANED text of both frames. The test comments must go through
# the same clean_text pipeline as the training comments; fitting/encoding the
# raw test text would give the model inputs it never saw during training.
tokenizer.fit_on_texts(list(train_sample['clean_comment_text']) + list(test['clean_comment_text']))

# Sequences are cut / zero-padded at the end to exactly max_len token ids
max_len = 18
X_train_seq = tokenizer.texts_to_sequences(train_sample['clean_comment_text'])
X_test_seq = tokenizer.texts_to_sequences(test['clean_comment_text'])

X_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')
X_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')

# Binarise the label by thresholding at 0.5. A bare astype(int) truncates, so
# every target below 1.0 would become 0.
# NOTE(review): assumes `target` is a toxicity score in [0, 1] -- confirm against the data description.
y_train = (train_sample['target'].to_numpy() >= 0.5).astype(int)

print(f"X_train shape: {X_train_seq.shape}")
print(f"X_test shape: {X_test_seq.shape}")
print(f"y_train shape: {y_train.shape}")

## Train Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded training sequences for validation (fixed seed 42).
# NOTE: this rebinds X_train_seq / y_train, so re-running the cell splits the
# already-reduced training set again.
split_parts = train_test_split(
    X_train_seq,
    y_train,
    random_state = 42,
    test_size = 0.2,
)
X_train_seq, X_val_seq, y_train, y_val = split_parts

print(f"X_train shape: {X_train_seq.shape}")
print(f"X_val shape: {X_val_seq.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

In [None]:
# Keras word indices start at 1 (0 is reserved for padding), so a lookup table
# indexed by token id needs len(word_index) + 1 rows to cover the largest id.
num_words = len(tokenizer.word_index) + 1
print(f"Number of unique words: {len(tokenizer.word_index)}")

In [None]:
# Applying GloVE representations on our corpus

# Row r holds the 300-d GloVe vector of the word whose tokenizer index is r;
# words without a pretrained vector (and any unused row) stay all-zero.
embedding_matrix = np.zeros(shape = (num_words, 300))

for token, row in tokenizer.word_index.items():
    if row >= num_words:
        continue
    vector = embedding_dict.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

embedding_matrix.shape

# Modeling

## LSTM

In [None]:
# Setting up the model

from keras import layers
from keras.models import Sequential

def setup_lstm_model(max_len, n_latent_factors):
    """Assemble the (uncompiled) LSTM classifier.

    Layer stack: a frozen Embedding initialised from the module-level
    ``embedding_matrix`` (``num_words`` rows), an LSTM with ``max_len`` units
    that returns the full sequence, global average pooling over the sequence,
    and a single sigmoid output unit.

    Parameters
    ----------
    max_len : int
        Input sequence length; also used as the number of LSTM units.
    n_latent_factors : int
        Embedding output dimension.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    stack = [
        layers.Embedding(
            input_dim = num_words,
            output_dim = n_latent_factors,
            weights = [embedding_matrix],
            input_length = max_len,
            trainable = False,
        ),
        layers.LSTM(units = max_len, return_sequences = True),
        layers.GlobalAveragePooling1D(),
        layers.Dense(units = 1, activation = 'sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    return model

In [None]:
# Build the LSTM model with 300-dimensional embeddings and print its layer summary
lstm_model = setup_lstm_model(max_len, 300)
lstm_model.summary()

In [None]:
# Final hyperparameter configurations
batch_size = 512
epochs = 5

# Adam optimiser with binary cross-entropy loss; validation loss is reported after each epoch
lstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam')

lstm_model.fit(
    x = X_train_seq,
    y = y_train,
    validation_data = (X_val_seq, y_val),
    batch_size = batch_size,
    epochs = epochs,
)

## GRU

In [None]:
def setup_gru_model(max_len, n_latent_factors):
    """Assemble the (uncompiled) GRU classifier.

    Layer stack: a frozen Embedding initialised from the module-level
    ``embedding_matrix`` (``num_words`` rows), a GRU with ``max_len`` units
    that returns the full sequence, global average pooling over the sequence,
    and a single sigmoid output unit.

    Parameters
    ----------
    max_len : int
        Input sequence length; also used as the number of GRU units.
    n_latent_factors : int
        Embedding output dimension.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    stack = [
        layers.Embedding(
            input_dim = num_words,
            output_dim = n_latent_factors,
            weights = [embedding_matrix],
            input_length = max_len,
            trainable = False,
        ),
        layers.GRU(units = max_len, return_sequences = True),
        layers.GlobalAveragePooling1D(),
        layers.Dense(units = 1, activation = 'sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    return model

In [None]:
# Build the GRU model with 300-dimensional embeddings and print its layer summary
gru_model = setup_gru_model(max_len, 300)
gru_model.summary()

In [None]:
# Final hyperparameter configurations and fitting
batch_size = 512
epochs = 5

# Adam optimiser with binary cross-entropy loss; validation loss is reported after each epoch
gru_model.compile(loss = 'binary_crossentropy', optimizer = 'adam')

gru_model.fit(
    x = X_train_seq,
    y = y_train,
    validation_data = (X_val_seq, y_val),
    batch_size = batch_size,
    epochs = epochs,
)