In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from itertools import count
import keras
from nltk.stem import WordNetLemmatizer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import label_binarize
from sklearn.utils.class_weight import compute_sample_weight

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) prints the names
# of the entries found in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition files. pandas opens each path itself, so no manual file
# handles are needed, and it decodes the files as UTF-8 by default instead of
# relying on the platform's locale encoding the way a bare open() does.
train_data = pd.read_csv('../input/train.tsv', sep='\t')  # tab-separated
test_data = pd.read_csv('../input/test.tsv', sep='\t')  # tab-separated
submission_data = pd.read_csv('../input/sampleSubmission.csv')  # comma-separated

In [None]:
def create_vocabulary(df, lemmatizer=None):
    """Build a lemma -> integer index mapping from the phrases in ``df``.

    Every phrase in ``df['Phrase']`` is lower-cased and split on single spaces.
    Each token is lemmatized, and every lemma not seen before receives the next
    free index. Indices start at 2: index 0 is reserved for padding and index 1
    for the unknown-word (UNK) token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Phrase' column of strings.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token) -> str``. It is assumed to be
        deterministic (same token, same lemma). Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    dict
        Lemma -> index (>= 2), numbered in order of first appearance. Empty if
        ``df`` has no phrases.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    counter = count(2)  # index 0 reserved for padding, index 1 for UNK token
    vocabulary = dict()
    # Raw tokens already handled; lets us skip re-lemmatizing repeated words
    # without changing the order in which indices are handed out.
    seen_tokens = set()
    for phrase in df['Phrase']:
        for token in phrase.lower().split(" "):
            if token in seen_tokens:
                continue
            seen_tokens.add(token)
            lemma = lemmatizer.lemmatize(token)
            if lemma not in vocabulary:
                vocabulary[lemma] = next(counter)
    # The printed number is the highest index in use (1, the UNK index, when no
    # token was seen), not len(vocabulary). An embedding layer fed with these
    # indices needs an input dimension of at least this value + 1.
    # The original author's note recorded 16532 here - presumably for train.tsv.
    print("Vocabulary length: {}".format(max(vocabulary.values(), default=1)))
    return vocabulary

In [None]:
def check_word_length(df):
    """Return the largest number of space-separated tokens in any phrase.

    Phrases are read from ``df['Phrase']`` and split on single spaces, the same
    tokenization used when building the vocabulary. Case is irrelevant to the
    count, so the phrases are not lower-cased here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Phrase' column of strings.

    Returns
    -------
    int
        Token count of the longest phrase, or 0 when ``df`` has no rows.
    """
    # default=0 keeps an empty frame from raising ValueError inside max().
    longest = max((len(phrase.split(" ")) for phrase in df['Phrase']), default=0)
    print("Max sentence length: {}".format(longest))
    return longest

In [None]:
def preprocess_df(df, vocabulary, max_sentence_length, lemmatizer=None):
    """Encode phrases as fixed-width rows of vocabulary indices.

    Each phrase in ``df['Phrase']`` is lower-cased, split on single spaces and
    lemmatized. Tokens are mapped through ``vocabulary``; lemmas missing from
    it become 1 (UNK). Rows are truncated to ``max_sentence_length`` tokens and
    right-padded with 0 (padding) up to that width.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Phrase' column of strings and, for training data, a
        'Sentiment' column holding class labels 0-4.
    vocabulary : dict
        Lemma -> index mapping, e.g. the result of ``create_vocabulary``.
    max_sentence_length : int
        Number of token positions per output row.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token) -> str``. Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray or None)
        ``X`` with one row of indices per phrase, and ``Y`` with the labels
        binarized over classes 0-4, or ``None`` when ``df`` has no 'Sentiment'
        column (so the same function serves test sets).
    """
    if 'Sentiment' in df:
        # .values yields a plain NumPy array, so the optional xarray package is
        # not needed just to hand the labels to scikit-learn.
        Y = label_binarize(df['Sentiment'].values, classes=[0, 1, 2, 3, 4])
    else:
        Y = None

    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # A negative width behaves like 0, matching range() semantics.
    width = max(max_sentence_length, 0)
    X = []
    for phrase in df['Phrase']:
        tokens = phrase.lower().split(" ")[:width]
        row = [vocabulary.get(lemmatizer.lemmatize(token), 1)  # 1 - UNK token
               for token in tokens]
        row.extend([0] * (width - len(row)))  # 0 - padding token
        X.append(row)
    return np.asarray(X), Y

In [None]:
# Optimizer: Adam with the AMSGrad variant enabled and learning-rate decay.
adam = keras.optimizers.Adam(lr=0.01, amsgrad=True, decay=0.99)

# Network: embedding -> batch norm -> five stacked ReLU LSTMs (the first four
# return full sequences and are each followed by light dropout) -> 5-way softmax.
# NOTE(review): input_dim is hard-coded. An Embedding needs input_dim greater
# than the largest index it is fed, so confirm 15189 against the value printed
# by create_vocabulary for the data actually used.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=15189, output_dim=10, mask_zero=True),
    keras.layers.BatchNormalization(),
    keras.layers.LSTM(1000, return_sequences=True, activation='relu'),
    keras.layers.Dropout(rate=0.05),
    keras.layers.LSTM(250, return_sequences=True, activation='relu'),
    keras.layers.Dropout(rate=0.05),
    keras.layers.LSTM(100, return_sequences=True, activation='relu'),
    keras.layers.Dropout(rate=0.05),
    keras.layers.LSTM(50, return_sequences=True, activation='relu'),
    keras.layers.Dropout(rate=0.05),
    keras.layers.LSTM(10, activation='relu'),  # last LSTM emits one vector per sample
    keras.layers.Dense(5, activation='softmax'),  # one output per sentiment class
])
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Build the vocabulary from the training phrases, then encode those phrases
# (and their one-hot labels) as rows of 52 token positions.
vocabulary = create_vocabulary(df=train_data)
train_X, train_Y = preprocess_df(
    df=train_data,
    vocabulary=vocabulary,
    max_sentence_length=52,
)
# Disabled: 'balanced' per-sample weights computed from the training labels.
# sample_weight = compute_sample_weight('balanced', train_data['Sentiment'])

In [None]:
# Train on the encoded phrases: 10 epochs in mini-batches of 250 samples.
model.fit(
    x=train_X,
    y=train_Y,
    batch_size=250,
    epochs=10,
    # sample_weight=sample_weight,  # disabled together with its computation
)

In [None]:
# Encode the test phrases with the training vocabulary and the same row width.
# The label half of the result is None when the frame has no 'Sentiment' column.
ver_X, _ver_Y = preprocess_df(
    df=test_data,
    vocabulary=vocabulary,
    max_sentence_length=52,
)

In [None]:
# ver_X is already a NumPy array (preprocess_df returns np.asarray(...)), so it
# is passed to the model as-is.
predictions = model.predict(x=ver_X)

In [None]:
# Attach the predicted class to every test row. np.argmax along axis 1 picks,
# per sample, the position of the largest of the model's five softmax outputs
# in a single vectorized call. The new column is built on test_data's own index
# so the column-wise concat lines up row for row.
prediction_results = pd.concat(
    [test_data,
     pd.DataFrame({'Sentiment': np.argmax(predictions, axis=1)},
                  index=test_data.index)],
    axis=1)

In [None]:
# Keep only the id and predicted-label columns and write them to
# submission.csv without the DataFrame index.
submission = prediction_results.loc[:, ['PhraseId', 'Sentiment']]
submission.to_csv('submission.csv', index=False)