In [None]:
import numpy as np
import pandas as pd
import string
import os
import seaborn as sns
import matplotlib.pyplot as plt
import json

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from wordcloud import STOPWORDS

from sklearn.model_selection import train_test_split,KFold

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Training comments carrying the per-category toxicity flags that are collapsed into `y` below.
train = pd.read_csv('../input/toxic-comments-train/train.csv')

# The three files shipped in the jigsaw-toxic-severity-rating input directory,
# read in one pass and bound to the same names as before.
test, sample, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-severity-rating/validation_data.csv',
        '../input/jigsaw-toxic-severity-rating/sample_submission.csv',
        '../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
    )
)

In [None]:
# The six per-category flags are collapsed into one boolean target:
# a comment is positive (`y` is True) when the flags sum to more than zero.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['y'] = train[label_columns].sum(axis=1).gt(0)

# The individual flag columns are no longer needed once `y` exists.
train.drop(columns=label_columns, inplace=True)

In [None]:
# Number of rows whose target is set; used below as the sample size for the negative class.
count_of_toxic_comments = int((train['y'] != 0).sum())
count_of_toxic_comments

In [None]:
# Keep every positive (toxic) row.
train_toxic = train[train.y != 0]

# Down-sample the negative rows to the same count so both classes are equally represented.
# A fixed random_state makes the drawn subset identical on every Restart & Run All;
# without it the training set (and therefore every fold and model) changes between runs.
# 22 matches the seed already used for the KFold split in this notebook.
train_non_toxic = train[train.y == 0].sample(count_of_toxic_comments, random_state=22)

In [None]:
# Stack the positive rows on top of the sampled negative rows to form the working frame.
balanced_parts = (train_toxic, train_non_toxic)
df = pd.concat(balanced_parts, axis=0)
df

In [None]:
# Horizontal bar chart of how many rows fall into each target class.
df['y'].value_counts().plot.barh()

#### Class imbalance resolved: non-toxic comments were downsampled to match the number of toxic comments

In [None]:
# Simple per-comment text statistics, all derived from the raw (uncleaned) comment text.
# Each comment is stringified once and whitespace-tokenised once, then reused below.
comment_strings = df['comment_text'].map(str)
comment_tokens = comment_strings.map(str.split)

# word_count: number of whitespace-separated tokens
df['word_count'] = comment_tokens.map(len)

# unique_word_count: number of distinct tokens (case-sensitive)
df['unique_word_count'] = comment_tokens.map(lambda words: len(set(words)))

# stop_word_count: lower-cased tokens that appear in the wordcloud STOPWORDS set
df['stop_word_count'] = comment_strings.map(
    lambda text: len([word for word in text.lower().split() if word in STOPWORDS])
)

# mean_word_length: average token length (NaN when a comment has no tokens)
df['mean_word_length'] = comment_tokens.map(
    lambda words: np.mean([len(word) for word in words])
)

# char_count: total number of characters, whitespace included
df['char_count'] = comment_strings.map(len)

# punctuation_count: characters that belong to string.punctuation
df['punctuation_count'] = comment_strings.map(
    lambda text: len([char for char in text if char in string.punctuation])
)

In [None]:
# Preview the first five rows, including the statistics columns added above.
df.head(n=5)

In [None]:
# Summary statistics for the numeric columns, with the default quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

### 1. Remove stopwords and punctuation

In [None]:
# Lower-case every comment and drop stop words.
# Tokens are split on whitespace only, so a stop word with punctuation still attached
# (e.g. "the,") is not matched at this stage.
df['comment_text'] = df['comment_text'].apply(
    lambda x: ' '.join([w for w in str(x).lower().split() if w not in STOPWORDS])
)

# Remove punctuation: delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid "\w" / "\s" escape sequences of a plain literal.
df['comment_text'] = df['comment_text'].str.replace(r'[^\w\s]', '', regex=True)
df.tail()

In [None]:
# Give the frame a clean 0..n-1 index so the positional indices returned by KFold
# line up with the row labels.
df = df.reset_index(drop=True)

# Five shuffled folds with a fixed seed; only the validation indices of each split are needed.
kflod = KFold(n_splits=5, shuffle=True, random_state=22)
fold_ids = np.empty(len(df), dtype=int)
for fold, (_, val_) in enumerate(kflod.split(X=df)):
    fold_ids[val_] = fold

# Every row is a validation row in exactly one split, so each slot is filled exactly once.
df["kfold"] = fold_ids

In [None]:
# Hyper-parameters shared by every fold. They stay module-level names because later
# cells reuse MAX_LEN when padding the comments to score.
OOV_TOKEN = '<OOV>'
VOCAB_SIZE = 10000
MAX_LEN = 100
EMBEDDING_DIM = 100

# Train one LSTM classifier per fold: the rows whose `kfold` equals the current fold are
# held out for validation, all remaining rows are used for training. Each fold's tokenizer
# and model are written to ./lstm_<fold>.
for fold in range(5):
    model_save_path = './lstm_{}'.format(fold)
    os.makedirs(model_save_path, exist_ok=True)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_test = df[df.kfold == fold].reset_index(drop=True)

    # Only the cleaned text is fed to the network; the statistics columns are not used.
    X_train = df_train.comment_text.values
    X_test = df_test.comment_text.values
    y_train = df_train['y']
    y_test = df_test['y']

    # The vocabulary is fitted on the training rows of this fold only, so held-out words
    # map to the OOV token instead of leaking into the vocabulary.
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
    tokenizer.fit_on_texts(X_train)

    # NOTE: the value returned by to_json() is passed through json.dumps again, exactly as
    # before, so the on-disk format is unchanged for any existing loader that reads it back
    # with json.load before rebuilding the tokenizer.
    tokenizer_json = tokenizer.to_json()
    with open(model_save_path + '/tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))

    # Convert text to fixed-length integer sequences (pad / truncate at the end).
    train_seq = tokenizer.texts_to_sequences(X_train)
    train_padded = pad_sequences(
        train_seq, maxlen=MAX_LEN, dtype='int32', padding='post',
        truncating='post')

    test_seq = tokenizer.texts_to_sequences(X_test)
    test_padded = pad_sequences(
        test_seq, maxlen=MAX_LEN, dtype='int32', padding='post',
        truncating='post')

    model = tf.keras.Sequential([
        Embedding(VOCAB_SIZE, EMBEDDING_DIM, name="embedding"),
        LSTM(64),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    # The output layer already applies a sigmoid, so the loss must consume probabilities.
    # from_logits=True would apply a second sigmoid inside the loss and distort training.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])

    # Stop on the held-out loss rather than the training loss: the training loss keeps
    # improving while the model overfits, so it would neither stop early nor restore the
    # weights that generalise best.
    es = EarlyStopping(patience=3,
                       monitor='val_loss',
                       restore_best_weights=True,
                       mode='min',
                       verbose=1)

    hist = model.fit(
        train_padded,
        y=y_train,
        validation_data=(test_padded, y_test),
        epochs=15,
        callbacks=[es])  # callbacks is documented as a list of callback instances
    model.save(model_save_path)
    

### Prepare test data

In [None]:
# Preview the first five rows of the comments that have to be scored.
target.head(n=5)

In [None]:
# `df_target` is a second name for the same DataFrame object as `target` (no copy is made),
# so any in-place column assignment on `df_target` is also visible through `target`.
df_target = target

In [None]:
# Lower-case every comment and drop stop words, mirroring the cleaning applied to the
# training text. Tokens are split on whitespace only.
df_target['text'] = df_target['text'].apply(
    lambda x: ' '.join([w for w in str(x).lower().split() if w not in STOPWORDS])
)

# Remove punctuation: delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid "\w" / "\s" escape sequences of a plain literal.
df_target['text'] = df_target['text'].str.replace(r'[^\w\s]', '', regex=True)
df_target.head()

In [None]:
# Encode the cleaned comments with the `tokenizer` and `MAX_LEN` left in scope by the
# training cell; the tokenizer is therefore the one fitted during the last fold.
target_texts = df_target['text'].values
target_seq = tokenizer.texts_to_sequences(target_texts)

# Same padding / truncation settings as the training sequences.
target_padded = pad_sequences(
    target_seq,
    maxlen=MAX_LEN,
    dtype='int32',
    padding='post',
    truncating='post',
)