In [None]:
import re

import gensim  # text-cleaning helpers used by transformText
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

# Download the NLTK resources referenced below, one after the other.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop-word list as a set, built once at notebook start.
stops = set(stopwords.words('english'))

In [None]:
# Read the training CSV, report its row count and preview the last rows.
train = pd.read_csv('train.csv')
print(train.shape[0])
train.tail()

In [None]:
# Read the test CSV, report its row count and preview the last rows.
test = pd.read_csv('test.csv')
print(test.shape[0])
test.tail()

In [None]:
# Map each author name to an integer class id and store it next to the text.
label_enconder = preprocessing.LabelEncoder()
train['label_encoded'] = label_enconder.fit_transform(train['author'])
train.head()

## Preprocessing function

In [1]:
def transformText(text, stop_words=None):
    """Normalise one raw sentence into a string of stemmed content words.

    Steps, in order:
      1. lower-case the text;
      2. replace every non-ASCII character with a space;
      3. strip punctuation and digits (gensim helpers);
      4. drop stop words and tokens shorter than three characters;
      5. stem the remaining tokens with gensim's ``stem_text``.

    Punctuation and digits are removed *before* the token filters so that
    tokens such as "the," or "it's" are recognised as stop words, and so
    that fragments produced by the stripping are also length-filtered.

    Parameters
    ----------
    text : str
        Raw sentence.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``stops`` set built in
        the setup cell, so the NLTK list is not rebuilt on every call.

    Returns
    -------
    str
        Space-separated stemmed tokens (may be empty).
    """
    if stop_words is None:
        stop_words = stops

    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Remove the punctuation and strip all the numerics first, so the
    # filters below see clean tokens.
    text = gensim.parsing.preprocessing.strip_punctuation(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # str.split() also collapses runs of whitespace, so no separate
    # whitespace-normalisation pass is needed.
    filtered_words = [
        word for word in text.split()
        if len(word) >= 3 and word not in stop_words
    ]

    # Stemming
    return gensim.parsing.preprocessing.stem_text(" ".join(filtered_words))

## Creating the preprocessed text column on the train and test sets

In [None]:
# Clean every training sentence with transformText and keep the result in a new column.
train['text_processed'] = train['text'].apply(transformText)
train.tail()

In [None]:
# Clean every test sentence the same way as the training sentences.
test['text_processed'] = test['text'].apply(transformText)
print(test.shape[0])
test.tail()

## Train/validation split

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text_processed'],
    train['label_encoded'],
    test_size=0.2,
    random_state=4,
)
# Plain array copy of the validation labels.
true_label = np.array(y_valid)

banner = "#" * 20
print(banner + " Some stats " + banner)
print("Dataset training: {} uterances".format(len(x_train)))
print("Dataset testing: {} uterances".format(len(x_valid)))
print("Different classes: {}".format(y_train.unique().size))

In [None]:
## Loading Glove vectors
# Each line is parsed as "<token> <v1> <v2> ... <vN>"; the result maps the
# token to its float32 coefficient vector.
embeddings_index = {}
# `with` closes the file even if parsing fails part-way through. The encoding
# is explicit because the platform default is not guaranteed to be UTF-8.
with open('../../vectors/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Sanity check: look up one word, print the vector's shape, then show the vector.
word = 'awesome'
print(np.shape(embeddings_index[word]))
embeddings_index[word]

In [None]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s, embeddings=None):
    """Return the L2-normalised sum of the word vectors of a sentence.

    The sentence is lower-cased and split on whitespace; each resulting
    *word* (not each character) is looked up in ``embeddings``. Words without
    a vector are ignored.

    Parameters
    ----------
    s : object
        Sentence; converted with ``str(s)`` before tokenising.
    embeddings : mapping of str -> np.ndarray, optional
        Word-vector table. Defaults to the module-level ``embeddings_index``.

    Returns
    -------
    np.ndarray
        Unit-length sentence vector. A zero vector is returned when no word
        is found or when the summed vector has zero norm; its length matches
        the vectors in ``embeddings`` (300 if the table is empty).
    """
    if embeddings is None:
        embeddings = embeddings_index

    tokens = str(s).lower().split()
    vectors = [embeddings[tok] for tok in tokens if tok in embeddings]

    if not vectors:
        # Infer the dimensionality from the table so the fallback stacks
        # cleanly with the other sentence vectors.
        dim = len(next(iter(embeddings.values()))) if embeddings else 300
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN, which would poison the downstream scaler.
        return np.zeros_like(v)
    return v / norm

In [None]:
## Create sentence vectors for the dataset
# One vector per sentence, in the same order as the split; tqdm shows progress.
xtrain_glove = list(map(sent2vec, tqdm(x_train)))
xvalid_glove = list(map(sent2vec, tqdm(x_valid)))

In [None]:
# Turn the two lists of sentence vectors into NumPy arrays.
xtrain_glove, xvalid_glove = (
    np.array(vectors) for vectors in (xtrain_glove, xvalid_glove)
)

In [None]:
# Display the shape of the training sentence-vector array.
xtrain_glove.shape

In [None]:
# Display the shape of the validation sentence-vector array.
xvalid_glove.shape

In [None]:
# scale the data before any neural net:
# the scaler is fitted on the training vectors only and then applied,
# unchanged, to both the training and the validation vectors.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_glove)
xtrain_glove_scl = scl.transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [None]:
# we need to binarize the labels for the neural net
# The class count is passed explicitly: otherwise to_categorical sizes each
# matrix from the largest label present in that array, so the training and
# validation matrices could end up with different widths if a class were
# missing from one split.
n_classes = len(label_enconder.classes_)
ytrain_enc = np_utils.to_categorical(y_train, num_classes=n_classes)
yvalid_enc = np_utils.to_categorical(y_valid, num_classes=n_classes)

## Building the model: a simple feed-forward network with 3 dense hidden layers (no LSTM layers are used)

In [None]:
# Feed-forward classifier over the 300-d sentence vectors: three identical
# hidden blocks (Dense -> Dropout -> BatchNormalization) followed by a
# 3-unit softmax output.
# NOTE(review): Dropout(0.8) drops 80% of the units in each block - confirm
# this rate is intentional.
model = Sequential([
    Dense(300, input_dim=300, activation='relu'),
    Dropout(0.8),
    BatchNormalization(),

    Dense(300, activation='relu'),
    Dropout(0.8),
    BatchNormalization(),

    Dense(300, activation='relu'),
    Dropout(0.8),
    BatchNormalization(),

    Dense(3),
    Activation('softmax'),
])

# Configure training: categorical cross-entropy loss with the Adam optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Train for 20 epochs in mini-batches of 64, evaluating on the validation
# split after each epoch.
model.fit(
    x=xtrain_glove_scl,
    y=ytrain_enc,
    validation_data=(xvalid_glove_scl, yvalid_enc),
    epochs=20,
    batch_size=64,
    verbose=1,
)