In [None]:
import pandas as pd
import numpy as np
import re
import csv
import tensorflow as tf
import nltk
import gc
from gensim.models import Word2Vec
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
#PREPROCESSING
#This dataset is from the Kaggle competition "Toxic Comment Classification Challenge".
#The train dataset contains 159571 rows and 8 columns: id, comment_text,
#toxic, severe_toxic, obscene, threat, insult and identity_hate.
#The test dataset has over 150000 records.

In [None]:
# Read both competition CSV files from the working directory and keep the raw
# comment text of each split as its own Series.
df_train, df_test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
train_input, test_input = df_train['comment_text'], df_test['comment_text']

In [None]:
# Helper used when reading the FastText pre-trained word embeddings into a dictionary.
def get_coefs(word, *arr):
    """Pair a word with its embedding coefficients.

    Parameters
    ----------
    word : the token found at the start of an embedding line.
    *arr : the remaining fields of the line (numbers or numeric strings).

    Returns
    -------
    tuple of (word, 1-D float32 NumPy array built from ``arr``).
    """
    vector = np.asarray(arr, dtype='float32')
    return (word, vector)

# Load the FastText vectors into a dict mapping word -> float32 vector.
# The `with` block guarantees the file handle is closed, and the explicit UTF-8
# encoding keeps parsing independent of the platform's default locale.
with open('./wiki.en.vec', encoding='utf-8') as vec_file:
    # The first row of a .vec file is a "<vocab_size> <dimension>" header, which
    # is useless here. Recognise it by its shape (exactly two fields) instead of
    # deleting a hard-coded key afterwards: that raised KeyError for any file
    # whose vocabulary size was not exactly 2519370.
    header = vec_file.readline().split()
    if len(header) != 2:
        vec_file.seek(0)  # no header row: the first line is a real entry
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in vec_file
    )

In [None]:
# Number of entries loaded from the FastText file (shown as the cell output).
len(embeddings_index) 
# Author's note: the FastText word embedding file contains about 2,500,000 words, including punctuation.
# It does not contain the digits 0-9 or contractions such as I'm, can't, etc.

In [None]:
# NOTE(review): presumably an upper bound on the vocabulary size; it is not
# referenced by the cells visible in this notebook section - confirm.
max_features = 100_000

# Fixed length of every comment sequence: longer comments are cut off at 150
# tokens and shorter ones are padded up to 150.
# The value can be chosen in different ways; here it is a number near the 80th
# percentile of all comment lengths in the training dataset.
maxlen = 150

In [None]:
# Define data cleaning function
# Contraction suffixes and their expansions, applied in this order
# ("can't" must be handled before the generic "n't").
_CONTRACTIONS = (
    (r"'m", ' am '),
    (r"'s", ' is '),
    (r"can't", 'cannot '),
    (r"n't", ' not '),
    (r"'ve", ' have '),
    (r"'re", ' are '),
    (r"'d", ' would '),
    (r"'ll", ' will '),
)


def clean(string: str) -> str:
    """Normalise one raw comment into lower-case, space-separated tokens.

    Steps:
      1. Replace every character other than ASCII letters and ``( ) , . ? ' !``
         with a space (this also removes newlines, tabs and digits).
      2. Lower-case the text.
      3. Expand common English contractions ("can't" -> "cannot", "'m" -> "am", ...).
      4. Surround the remaining punctuation marks with spaces so each one
         becomes its own token.
      5. Collapse runs of whitespace into a single space.

    Parameters
    ----------
    string : the raw comment text.

    Returns
    -------
    The cleaned text. It may start or end with a single space and may be
    empty or blank when the comment contained no allowed characters.
    """
    string = re.sub(r"[^A-Za-z(),.?'!]", ' ', string)
    # Lower-case BEFORE expanding contractions. Previously the patterns were
    # matched case-sensitively and lower-casing happened last, so "CAN'T"
    # became "can ' t" and "Can't" became "ca not".
    string = string.lower()
    for pattern, expansion in _CONTRACTIONS:
        string = re.sub(pattern, expansion, string)
    # Isolate each remaining punctuation mark as a separate token.
    string = re.sub(r"([,'.!()?])", r' \1 ', string)
    return re.sub(r'\s{2,}', ' ', string)

In [None]:
# Clean the comment text of both splits and select the six label columns
# of the training frame as the targets.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x_train, x_test = train_input.apply(clean), test_input.apply(clean)
y_train = df_train[label_columns]

In [None]:
#After cleaning, some records may have nothing left in comment_text; fill them with a word.
# clean() replaces disallowed characters with spaces, so a comment made only of
# such characters ends up as an empty or blank string rather than NaN.
# Series.fillna() only replaces NaN, so on its own it left those records
# untouched; blank strings are therefore replaced with the placeholder as well.
x_train = x_train.fillna('fillna')
x_train = x_train.mask(x_train.str.strip() == '', 'fillna')
x_test = x_test.fillna('fillna')
x_test = x_test.mask(x_test.str.strip() == '', 'fillna')

In [None]:
#Build the vocabulary counter: every word of the cleaned training comments that
#also appears in the FastText word embeddings, mapped to its number of occurrences.
lst = [token for line in x_train for token in line.split()]

count = Counter(lst)
# Collect the unknown words first so the counter is not modified while iterating.
for unknown in [word for word in count if word not in embeddings_index]:
    del count[unknown]

In [None]:
# Number of distinct training words that have a FastText vector.
len(count)

In [None]:
# Order the vocabulary from the most to the least frequent word; words with the
# same frequency keep their existing relative order (the sort is stable).
count = dict(sorted(count.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Keep only the words that occur at least twice in the training comments.
count = {word: freq for word, freq in count.items() if freq >= 2}

In [None]:
# Vocabulary size after dropping the words that occur fewer than twice.
len(count)

In [None]:
# Replace each word's frequency with an integer id 1..N, following the dict's
# current order. Id 0 is left unassigned (row 0 of the embedding matrix is the
# all-zero vector for padding and unknown words).
# enumerate() sizes itself to the vocabulary. The previous hard-coded
# range(1, 79101 + 1) made zip() silently drop every word beyond the 79101st
# whenever the vocabulary was larger than that constant.
count = {word: idx for idx, word in enumerate(count, start=1)}

In [None]:
# Map every vocabulary word to its FastText vector, in vocabulary order.
embedding_matrix = {word: embeddings_index[word] for word in count}

In [None]:
#Create the word embedding matrix. Its first row is all zeros and stands for
#words that are not in the vocabulary and for the padding elements; the
#remaining rows are the vocabulary vectors in the order of embedding_matrix.
zero_row = np.zeros((1, 300), dtype=np.float32)
word_rows = np.asarray(list(embedding_matrix.values()), dtype=np.float32)
W = np.concatenate((zero_row, word_rows), axis=0)

In [None]:
# Shape of the embedding matrix: (vocabulary size + 1 zero row, 300).
W.shape

In [None]:
#Same step for the test dataset: collect its words, keep only those that are
#in the training vocabulary, and map each one to its training word id.
lst = [token for line in x_test for token in line.split()]

count_test = Counter(lst)
for word in list(count_test):
    if word in embedding_matrix:
        count_test[word] = count[word]
    else:
        del count_test[word]

In [None]:
#Release memory: the flat token list is only needed while building the counters.
del lst
# Run a garbage-collection pass right away; as the last expression of the
# cell, the value returned by gc.collect() is displayed.
gc.collect()

In [None]:
#Make the train and test datasets sequences of word ids.
def _to_ids(text):
    """Return the vocabulary ids of the words in ``text``, in order.

    Words that are not keys of ``count`` are skipped, so the result may be an
    empty list.
    """
    return [count[word] for word in text.split() if word in count]


# One pass per comment replaces the previous remove()-inside-a-loop approach,
# which was quadratic in the comment length. It also avoids the element-wise
# x_train[i] assignment, which relied on the Series having labels 0..n-1.
x_train = x_train.apply(_to_ids)
# The test comments must be converted too: they are handed to pad_sequences
# further down, which needs integer sequences rather than raw strings.
x_test = x_test.apply(_to_ids)

In [None]:
#Create the evaluation dataset: 80% of the rows stay in the training split and
#the rest are held out; the fixed random_state makes the split repeatable.
Xtrain, Xval, ytrain, yval = train_test_split(
    x_train,
    y_train,
    train_size=0.80,
    random_state=123,
)

In [None]:
#Pad or truncate every sequence to maxlen (150) entries.
# NOTE(review): pad_sequences needs sequences of integers - confirm that x_test
# holds word-id lists (not raw strings) by the time this cell runs.
train_x, val_x, test_x = (
    sequence.pad_sequences(list(split), maxlen=maxlen)
    for split in (Xtrain, Xval, x_test)
)

In [None]:
#Save the embedding matrix for easy loading later. The DataFrame's row index is
#written as the first column because no index argument is given.
embedding_frame = pd.DataFrame(W)
embedding_frame.to_csv('./W.csv')

In [None]:
# Drop the full FastText dictionary; the later cells visible here do not read it.
del embeddings_index
# Run a garbage-collection pass right away; as the last expression of the
# cell, the value returned by gc.collect() is displayed.
gc.collect()

In [None]:
#Save files: write every prepared array/frame as a comma-separated file
#without the index column, in the order listed.
outputs = (
    ('./train_x.csv', train_x),
    ('./val_x.csv', val_x),
    ('./test_x.csv', test_x),
    ('./ytrain.csv', ytrain),
    ('./yval.csv', yval),
    ('./x_train.csv', x_train),
    ('./y_train.csv', y_train),
)
for path, data in outputs:
    pd.DataFrame(data).to_csv(path, sep=',', index=False)