In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training phrases from the tab-separated archive.
# `sep` is passed by keyword: read_csv only accepts the path positionally
# in pandas >= 2.0 (positional use was deprecated in 1.3).
train = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
    sep='\t',
)

In [None]:
# Load the unlabelled test phrases from the tab-separated archive.
# `sep` is passed by keyword: read_csv only accepts the path positionally
# in pandas >= 2.0 (positional use was deprecated in 1.3).
test = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    sep='\t',
)

In [None]:
# Count missing entries per column of the training frame (author's note: no missing values).
train.isna().sum()

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Count missing entries per column of the test frame (author's note: no missing values).
test.isna().sum()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# List the distinct sentiment labels present in the training data.
train['Sentiment'].unique()

In [None]:
import seaborn as sns

# Apply seaborn's default theme, then plot how many phrases fall in each sentiment class.
sns.set()
sns.countplot(data=train, x='Sentiment')

In [None]:
# Drop the identifier columns, which are not used as model inputs.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the columns are gone.
train = train.drop(columns=['PhraseId', 'SentenceId'], errors='ignore')

In [None]:
from keras.preprocessing.text import text_to_word_sequence

# Convert every phrase into a list of word tokens (Keras defaults:
# lower-casing and punctuation filtering).
# A single column-wise assignment replaces the per-row chained indexing
# `train['Phrase'][i] = ...`, which triggers SettingWithCopyWarning, does
# nothing under pandas Copy-on-Write, and assumes a 0..n-1 integer index.
train['Phrase'] = train['Phrase'].apply(text_to_word_sequence)
    

In [None]:
from keras.preprocessing.text import Tokenizer

# Learn the word-to-integer vocabulary from the tokenised training phrases;
# num_words=5000 restricts encoding to the most frequent words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train['Phrase'])

# Replace each token list with its sequence of integer word ids,
# the numeric form required for model building.
encoded_phrases = tokenizer.texts_to_sequences(train['Phrase'])
train['Phrase'] = encoded_phrases

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded phrase to the same length (100 ids) by padding with
# 0s at the beginning, so the model receives fixed-size inputs.
max_length = 100
train_copy = pad_sequences(train['Phrase'], maxlen=max_length)

# +1 because the tokenizer's word ids start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Start the submission table from the test-set phrase identifiers.
submission = pd.DataFrame({'PhraseId': test['PhraseId']})

In [None]:
# Drop the identifier columns from the test frame (PhraseId has already
# been copied into `submission`).
# Fixes three defects in the original call: the missing comma concatenated
# the two labels into 'PhraseIdSentenceId', no axis/columns was given so
# pandas looked for a row label, and the result was never assigned.
# errors='ignore' keeps the cell safe to re-run.
test = test.drop(columns=['PhraseId', 'SentenceId'], errors='ignore')

In [None]:
# Tokenise the test phrases exactly as the training phrases were tokenised.
# Column-wise assignment avoids the chained indexing `test['Phrase'][i] = ...`.
test['Phrase'] = test['Phrase'].apply(text_to_word_sequence)

# Encode with the tokenizer that was fitted on the TRAINING phrases.
# Fitting a fresh Tokenizer on the test set would assign different integer
# ids to the same words, so the model's embedding layer would receive ids
# that do not match what it was trained on (and it would overwrite the
# global `tokenizer`).
test['Phrase'] = tokenizer.texts_to_sequences(test['Phrase'])

# Pad to the same `max_length` that was used for the training sequences.
test_copy = pad_sequences(test['Phrase'], maxlen=max_length)

In [None]:
# Model inputs: the padded integer sequences.
X = train_copy
# Targets: one-hot encoded sentiment labels, one column per class.
# The dtype is pinned because the default changed between pandas versions
# (uint8, then bool from 2.0), and a numeric target is what the
# categorical cross-entropy loss expects.
y = pd.get_dummies(train['Sentiment'], dtype='float32')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the training phrases for validation. A fixed seed makes
# the split, and therefore the reported validation accuracy, reproducible
# across runs.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

In [None]:
from keras.models import Sequential
# Embedding is imported from the public `keras.layers` namespace; the
# private `keras.layers.embeddings` module path does not exist in newer Keras.
from keras.layers import Dense, Embedding, LSTM

# Size of the learned vector for each word id.
embedding_vector_length = 32

# Architecture: word embeddings, then one 100-unit LSTM, then a 5-way
# softmax (one output per sentiment class).
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_vector_length,
                    input_length=max_length))
model.add(LSTM(100))
model.add(Dense(5, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 64, logging one line per epoch and
# evaluating on the held-out validation split after each epoch.
fit_options = dict(batch_size=64, epochs=10, verbose=2)
train_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    **fit_options,
)

In [None]:
import matplotlib.pyplot as plt

# Plot per-epoch training vs. validation accuracy recorded by model.fit.
# Uses the explicit figure/axes interface and labels the axes so the figure
# can be read on its own.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(train_history.history['accuracy'], 'r', label='Training accuracy')
ax.plot(train_history.history['val_accuracy'], 'b', label='Validation accuracy')
ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Run the trained model on the padded test sequences; the final layer is a
# 5-unit softmax, so each row holds one score per sentiment class.
prediction = model.predict(test_copy)

In [None]:
# Pick the highest-scoring class index for every test phrase.
# One vectorised argmax over the class axis replaces a Python-level loop that
# called np.argmax once per row; the result is still a list of integer labels.
final_prediction = list(np.argmax(prediction, axis=1))

In [None]:
# Attach the predicted class to each PhraseId and preview the result.
submission = submission.assign(Sentiment=final_prediction)
submission.head()

In [None]:
# Write the submission table to CSV without the index column.
# NOTE(review): the relative path presumably resolves to /kaggle/working/
# when the notebook runs from that directory — confirm the working directory.
submission.to_csv('../working/submission.csv', index=False)