In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Record which Keras version this run used.
print(keras.__version__)

## Importing Dataset

In [None]:
# Both files are tab-separated; quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a phrase are read literally instead of being treated as field delimiters.
tsv_options = {"sep": "\t", "quoting": 3}
dataset_train = pd.read_csv(
    "/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip", **tsv_options
)
dataset_test = pd.read_csv(
    "/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip", **tsv_options
)

In [None]:
# Display the training frame (pandas truncates the rendering of long frames).
dataset_train

In [None]:
# Display the test frame (pandas truncates the rendering of long frames).
dataset_test

In [None]:
# Show the first raw phrase. ``.iloc[0]`` selects by position, so it does not
# depend on the frame having a row labelled 0.
dataset_train['Phrase'].iloc[0]

## Cleaning text


In [None]:
import nltk, re
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
def get_corpus(dataset):
    """Clean and stem every phrase in ``dataset['Phrase']``.

    Each phrase has every non-letter character replaced by a space, is
    lower-cased, split on whitespace, stripped of English stopwords (the
    negations "no", "not" and "nor" are kept because they carry sentiment),
    and Porter-stemmed.

    Args:
        dataset: DataFrame with a ``'Phrase'`` column of strings.

    Returns:
        list[str]: one cleaned, space-joined phrase per row, in row order.
        A phrase whose words are all removed becomes the empty string.
    """
    # Loop-invariant setup: build the stemmer and the stopword set once instead
    # of once per row (and, for the set, once per word).
    ps = PorterStemmer()
    all_stopwords = set(stopwords.words("english")) - {"no", "not", "nor"}

    corpus = []
    # Iterate over the column values directly so the result does not depend on
    # the frame having a default 0..n-1 index.
    for review in dataset['Phrase']:
        words = re.sub('[^A-Za-z]', ' ', review).lower().split()
        stemmed = [ps.stem(word) for word in words if word not in all_stopwords]
        corpus.append(' '.join(stemmed))

    return corpus

In [None]:
# Clean and stem the phrases of the training frame, then of the test frame.
corpus_train, corpus_test = (
    get_corpus(frame) for frame in (dataset_train, dataset_test)
)

## Create a word-to-index dictionary from all words in the training and test sets

In [None]:
def get_wordlist(corpus1, corpus2):
    """Collect the distinct whitespace-separated words of two corpora.

    Args:
        corpus1: iterable of phrase strings.
        corpus2: iterable of phrase strings.

    Returns:
        set[str]: every distinct word that occurs in either corpus. A set has
        no defined order, so callers that need a stable order must sort it.
    """
    vocabulary = set()
    for corpus in (corpus1, corpus2):
        for phrase in corpus:
            # update() de-duplicates on the fly; no intermediate list or sort is
            # needed because the result is an unordered set anyway.
            vocabulary.update(phrase.split())
    return vocabulary

In [None]:
# Vocabulary: every distinct stemmed word seen in the training or test corpus.
word_set = get_wordlist(corpus_train, corpus_test)

In [None]:
def get_dicts(word_set):
    """Build the word <-> integer id lookup tables for a vocabulary.

    Id 0 is reserved for ``'<PAD>'`` and id 1 for ``'<UNK>'``; the vocabulary
    words get the ids 2, 3, ... in sorted order.

    Args:
        word_set: iterable of distinct, mutually comparable words
            (typically a set of strings).

    Returns:
        tuple[dict, dict]: ``(word_to_index, index_to_word)``, where the second
        dictionary is the inverse of the first.
    """
    word_to_index = {'<PAD>': 0, '<UNK>': 1}
    # Sort before numbering: iterating a set of strings directly follows hash
    # order, which changes between interpreter runs, so the same vocabulary
    # would otherwise get different ids on every run.
    for index, word in enumerate(sorted(word_set), start=2):
        word_to_index[word] = index

    index_to_word = {index: word for word, index in word_to_index.items()}

    return word_to_index, index_to_word

In [None]:
# Lookup tables in both directions: word -> integer id and integer id -> word.
word_to_index, index_to_word = get_dicts(word_set)

In [None]:
# The dictionary holds one entry per distinct word of both corpora, so show its
# size and a short preview instead of dumping every entry into the output.
print("vocabulary size (incl. <PAD>/<UNK>):", len(word_to_index))
dict(list(word_to_index.items())[:10])

## Tokenize and pad the corpora

In [None]:
def tokenize(corpus, word_to_index):
    """Convert each phrase of ``corpus`` into a list of integer word ids.

    Args:
        corpus: iterable of phrase strings; words are separated by whitespace.
        word_to_index: mapping from word to integer id. If it contains the key
            ``'<UNK>'``, words missing from the mapping are encoded with that id.

    Returns:
        numpy.ndarray: object-dtype array built from the per-phrase id lists.
        NOTE: when every phrase has the same number of words NumPy produces a
        2-D array instead of a 1-D array of lists.

    Raises:
        KeyError: a word is missing from ``word_to_index`` and the mapping has
            no ``'<UNK>'`` entry to fall back on.
    """
    # Resolve the fallback id once; None means "no fallback available".
    unk_index = word_to_index.get('<UNK>')
    tokenized_list = []

    for phrase in corpus:
        tokenized_format = []
        for word in phrase.split():
            index = word_to_index.get(word, unk_index)
            if index is None:
                raise KeyError(word)
            tokenized_format.append(index)

        tokenized_list.append(tokenized_format)

    return np.array(tokenized_list, dtype='object')

In [None]:
# Convert the cleaned phrases into sequences of integer word ids.
X_train = tokenize(corpus_train, word_to_index)
X_test = tokenize(corpus_test, word_to_index)

# Bring every sequence to exactly `maxlen` ids: shorter ones are padded at the
# end with 0 (the '<PAD>' id); longer ones are truncated (pad_sequences drops
# leading ids by default).
maxlen = 30
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test_padded = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

# The labels are taken from the last column of the training frame.
y_train = dataset_train.iloc[:,-1].values

## One-hot encode y_train (this notebook has no y_test)
from sklearn.preprocessing import OneHotEncoder
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 only knows the older keyword.
    ohe = OneHotEncoder(sparse=False)
y_train_ohe = ohe.fit_transform(y_train.reshape(-1, 1))

print("X_train_padded shape: ", X_train_padded.shape)
print("y_train_ohe shape: ", y_train_ohe.shape)
print("X_test_padded shape: ", X_test_padded.shape)

## Building the RNN Model with Embedding layer

In [None]:
## Make an embedding layer 
def get_embedding_layer(word_to_index, emb_dim=30):
    """Create a trainable, already-built Keras ``Embedding`` layer for the vocabulary.

    Args:
        word_to_index: word -> integer id mapping; only its size is used.
        emb_dim: length of each embedding vector. Defaults to 30, the value
            that used to be hard-coded.

    Returns:
        keras.layers.Embedding: layer with ``len(word_to_index) + 1`` rows and
        ``emb_dim`` columns, with ``build`` already called.
    """
    # One row more than there are dictionary entries. If the ids run from 0 to
    # len - 1 (as get_dicts assigns them) the extra row is never looked up; it
    # is kept so the layer size stays what it was.
    vocab_size = len(word_to_index) + 1

    emb_layer = keras.layers.Embedding(input_dim=vocab_size, output_dim=emb_dim, trainable=True)
    # Create the weights right away rather than on first call.
    emb_layer.build((None, ))
    return emb_layer

## Build RNN: embedding -> two stacked LSTMs (each followed by dropout) -> dense head
rnn = keras.models.Sequential([
    get_embedding_layer(word_to_index),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    keras.layers.LSTM(units=128, return_sequences=True),
    keras.layers.Dropout(rate=0.5),
    keras.layers.LSTM(units=128),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=120, activation='relu'),
    # Five softmax outputs: one probability per class.
    keras.layers.Dense(units=5, activation='softmax'),
])

In [None]:
# Print the layer-by-layer overview of the model.
rnn.summary()

In [None]:
# categorical_crossentropy pairs with the softmax output layer and the one-hot
# encoded targets (y_train_ohe) that are passed to fit.
rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Training, predicting, and saving the results

In [None]:
# Train on the whole training set. No validation data is passed, so the
# accuracy reported per epoch is measured on the training data itself.
rnn.fit(X_train_padded, y_train_ohe, batch_size=256, epochs=10)

In [None]:
# Class probabilities from the softmax output: one row per test phrase, one
# column per class.
y_pred_ohe = rnn.predict(X_test_padded)

# Take the most probable class of each row. Thresholding at 0.5 instead would
# leave a row with no class selected whenever no probability reaches 0.5, and
# such an all-False row cannot be decoded correctly by ohe.inverse_transform.
# ohe.categories_[0] holds the original labels in the encoder's column order.
y_pred = ohe.categories_[0][np.argmax(y_pred_ohe, axis=1)]

# The ids come from the first column of the test frame.
submission_data = pd.DataFrame({'PhraseId': dataset_test.iloc[:,0].values, 'Sentiment': y_pred})
# Reassign instead of mutating in place; the index is written as the first CSV column.
submission_data = submission_data.set_index('PhraseId')
submission_data.to_csv('submission.csv')