In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training phrases. The file is tab-separated; quoting=3 is
# csv.QUOTE_NONE, so quote characters inside a phrase are kept as literal text.
train_df = pd.read_csv("/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip", sep="\t", quoting = 3)

In [None]:
# Load the test phrases with the same parsing options as the training file
# (tab-separated, quoting=3 i.e. csv.QUOTE_NONE).
test_df = pd.read_csv("/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip", sep="\t", quoting = 3)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Show the text stored under index label 0 — one lookup instead of chained indexing.
train_df.loc[0, 'Phrase']

In [None]:
# Class balance: number of training rows per Sentiment value.
train_df['Sentiment'].value_counts()

In [None]:
# (rows, columns) of the training table before any rows are dropped.
train_df.shape

In [None]:
# Discard every training row that has a missing value in any column, then
# print the remaining (rows, columns) so the effect of the drop is visible.
train_df=train_df.dropna()
print(train_df.shape)

In [None]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Single Porter stemmer instance; the preprocessing function reads it as a global.
ps=PorterStemmer()

In [None]:
def func(X):
    """Turn the ``Phrase`` column of ``X`` into a list of cleaned strings.

    Each phrase is reduced to ASCII letters, lower-cased, split on whitespace,
    stripped of English stop words and Porter-stemmed. The surviving stems are
    joined back together with single spaces.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``Phrase`` column. It is not modified.

    Returns
    -------
    list of str
        One cleaned string per row of ``X``, in row order. A phrase that is
        missing, or that contains no word surviving the filters, yields ``''``.
    """
    # Build the stop-word lookup once. The previous version evaluated
    # stopwords.words('english') for every single token and then scanned the
    # returned list linearly, which dominated the running time.
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    # Missing phrases are filled with 0; its text form "0" is then erased by
    # the letters-only regex, so such rows end up as empty strings.
    for phrase in X['Phrase'].fillna(0):
        words = non_letters.sub(' ', str(phrase)).lower().split()
        corpus.append(' '.join(ps.stem(w) for w in words if w not in stop_words))

    return corpus

In [None]:
# Run the same text cleaning over both splits so their tokens are comparable.
corpus_train = func(train_df)
corpus_test = func(test_df)

In [None]:
def get_wordlist(corpus1, corpus2):
    """Collect the distinct whitespace-separated words of two corpora.

    Parameters
    ----------
    corpus1, corpus2 : iterable of str
        Phrases whose words are separated by whitespace.

    Returns
    -------
    set of str
        Every word that occurs in at least one phrase of either corpus.
        A set carries no order; sort it if a stable order is needed.
    """
    # Feed the words straight into a set. The previous version stored every
    # occurrence in a list and sorted it, but that ordering was thrown away
    # by the final set() conversion.
    words = set()
    for corpus in (corpus1, corpus2):
        for phrase in corpus:
            words.update(phrase.split())
    return words

In [None]:
# Vocabulary is taken from the train AND test corpora together.
word_set = get_wordlist(corpus_train, corpus_test)

In [None]:
def get_dicts(word_set):
    """Build word<->index lookup tables with reserved padding/unknown slots.

    Index 0 is ``'<PAD>'`` and index 1 is ``'<UNK>'``; the words of
    ``word_set`` are numbered from 2 upwards in sorted order.

    Parameters
    ----------
    word_set : iterable of str
        Distinct vocabulary words.

    Returns
    -------
    tuple of (dict, dict)
        ``word_to_index`` mapping word -> id and ``index_to_word`` mapping
        id -> word (the exact inverse of the first).
    """
    word_to_index = {'<PAD>': 0, '<UNK>': 1}

    # Sort before numbering: the iteration order of a set of strings can differ
    # from one interpreter run to the next (hash randomisation), which made the
    # ids change on every kernel restart.
    for index, word in enumerate(sorted(word_set), start=2):
        word_to_index[word] = index

    index_to_word = {index: word for word, index in word_to_index.items()}

    return word_to_index, index_to_word

In [None]:
# Lookup tables in both directions: word -> id and id -> word.
word_to_index, index_to_word = get_dicts(word_set)

In [None]:
def token(corpus, word_to_index):
    """Replace every word of every phrase by its integer id.

    Parameters
    ----------
    corpus : iterable of str
        Phrases whose words are separated by whitespace.
    word_to_index : dict
        Mapping word -> id. When it has an ``'<UNK>'`` entry, that id is used
        for any word missing from the mapping.

    Returns
    -------
    numpy.ndarray
        Object-dtype array holding one list of ids per phrase, in input order.
        (NumPy stacks the lists into a 2-D object array when all phrases
        happen to have the same number of words.)

    Raises
    ------
    KeyError
        If a word is missing from ``word_to_index`` and the mapping has no
        ``'<UNK>'`` entry to fall back on.
    """
    # The vocabulary reserves an <UNK> slot; use it rather than crashing on
    # the first out-of-vocabulary word.
    unk_index = word_to_index.get('<UNK>')
    tokenized_list = []

    for phrase in corpus:
        ids = []
        for word in phrase.split():
            index = word_to_index.get(word, unk_index)
            if index is None:
                # No fallback available: keep the original failure mode.
                raise KeyError(word)
            ids.append(index)

        tokenized_list.append(ids)

    return np.array(tokenized_list, dtype='object')

In [None]:
# Integer-encode both corpora with the shared vocabulary.
X_train, X_test = (
    token(corpus, word_to_index) for corpus in (corpus_train, corpus_test)
)

# Force every sequence to exactly `maxlen` ids: shorter ones get zeros appended
# (padding='post'); longer ones are cut using pad_sequences' default
# truncating='pre', i.e. their leading tokens are dropped.
maxlen = 30
pad_kwargs = dict(maxlen=maxlen, padding='post')
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train, **pad_kwargs)
X_test_padded = keras.preprocessing.sequence.pad_sequences(X_test, **pad_kwargs)

# Targets are read positionally from the last column of the training table.
y_train = train_df.iloc[:, -1].values

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer labels into a dense array for the softmax output.
# The keyword that switches off sparse output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old spelling was removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
y_train_ohe = ohe.fit_transform(y_train.reshape(-1, 1))

In [None]:
# Sanity check: print the shape of each array that is fed to the model.
for arr in (X_train_padded, y_train_ohe, X_test_padded):
    print(arr.shape)

In [None]:
# Import from the public `keras.layers` namespace: the private module path
# `keras.layers.embeddings` no longer exists in newer Keras releases.
from keras.layers import Embedding
def get_embedding_layer(word_to_index, emb_dim=30):
    """Create a trainable, already-built Keras ``Embedding`` layer.

    Parameters
    ----------
    word_to_index : dict
        Vocabulary mapping; only its size is used.
    emb_dim : int, optional
        Length of each embedding vector (default 30, the value that was
        previously hard-coded).

    Returns
    -------
    keras.layers.Embedding
        Layer with ``len(word_to_index) + 1`` rows of ``emb_dim`` weights.
    """
    # One row more than there are vocabulary entries.
    # NOTE(review): ids produced by get_dicts already run 0..len-1, so the
    # extra row looks unused; it is kept so the model shape stays unchanged.
    vocab_size = len(word_to_index) + 1

    emb_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim, trainable=True)
    # Create the weights now rather than on first call.
    emb_layer.build((None, ))
    return emb_layer

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Dropout

In [None]:
# Classifier: embedding -> LSTM (full sequence) -> dropout -> LSTM (last state)
# -> 5-way softmax, declared as one layer list instead of repeated .add() calls.
model = Sequential([
    get_embedding_layer(word_to_index),
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(128),
    Dense(5, activation='softmax'),
])

model.summary()

In [None]:
# Adam optimiser with categorical cross-entropy (the targets are one-hot
# vectors); accuracy is reported alongside the loss.
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [None]:
# Train on the whole training set for 20 epochs in mini-batches of 256.
# NOTE(review): no validation data is passed here, so over-fitting cannot be
# seen from the training log — consider holding out a split.
model.fit(X_train_padded, y_train_ohe, batch_size=256, epochs=20 )

In [None]:
pred = model.predict(X_test_padded)
# Pick the most probable class for every row. Thresholding the softmax output
# at 0.5 (the previous approach) leaves rows with no class selected whenever
# the model is less than 50% sure, and OneHotEncoder.inverse_transform then
# silently decodes those all-zero rows as the first category.
result_pred = ohe.categories_[0][pred.argmax(axis=1)].reshape(-1, 1)
submit = pd.DataFrame({'PhraseId': test_df.iloc[:,0].values,
                       'Sentiment': result_pred.ravel()})

In [None]:
# Use PhraseId as the index so it becomes the first column of the CSV.
# The guard makes the cell safe to re-run: an unconditional set_index raises
# KeyError the second time because the column has already been moved.
if submit.index.name != 'PhraseId':
    submit = submit.set_index('PhraseId')

In [None]:
# Write the submission file; the PhraseId index is emitted as the first column.
submit.to_csv('Submission.csv')