In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import matplotlib.pyplot as plt

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, MaxPool1D, Dropout
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the tab-separated training set straight from the zipped file.
# The separator is passed by keyword: passing it positionally is deprecated in
# pandas 1.3+ and rejected by pandas 2.x.
train = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
                    sep='\t')
train.head()

In [None]:
# Show the raw text of the first phrase (row label 0 of the default integer index).
train.loc[0, 'Phrase']

In [None]:
# Load the tab-separated test set straight from the zipped file.
# The separator is passed by keyword: passing it positionally is deprecated in
# pandas 1.3+ and rejected by pandas 2.x.
test = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
                   sep='\t')
test.head()

In [None]:
# (rows, columns) of the train and test frames, side by side.
print(f"{train.shape} {test.shape}")

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum(axis=0)

In [None]:
# Number of missing values in each column of the test frame.
test.isnull().sum(axis=0)

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Distinct label values present in the training data.
train.loc[:, 'Sentiment'].unique()

In [None]:
import seaborn as sns

# Apply seaborn's default theme, then plot how many training phrases fall into
# each Sentiment value.
sns.set()
sns.countplot(data=train, x='Sentiment')

In [None]:
# Work on a copy of the training frame without the two identifier columns;
# the original `train` frame is left untouched.
train_df = train.drop(columns=['PhraseId', 'SentenceId'])
train_df.info()

In [None]:
# Split every training phrase into a list of word tokens.
# The whole column is rebuilt in one assignment instead of writing cell by cell
# through chained indexing (`train_df['Phrase'][i] = ...`), which is slow, emits
# SettingWithCopy warnings and silently does nothing under pandas copy-on-write.
train_df['Phrase'] = [text_to_word_sequence(phrase) for phrase in train_df['Phrase']]

In [None]:
# Split every test phrase into a list of word tokens.
# The whole column is rebuilt in one assignment instead of writing cell by cell
# through chained indexing (`test['Phrase'][i] = ...`), which is slow, emits
# SettingWithCopy warnings and silently does nothing under pandas copy-on-write.
test['Phrase'] = [text_to_word_sequence(phrase) for phrase in test['Phrase']]

In [None]:
# Build the word index from the training phrases only. num_words=5000 caps the
# ids the tokenizer emits, matching the Embedding input size of 5000 used by the model.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['Phrase'])
train_df['Phrase'] = tokenizer.texts_to_sequences(train_df['Phrase'])

# Encode the test phrases with the SAME fitted tokenizer. Fitting a second
# tokenizer on the test set would give each word a different integer id than
# the one the embedding layer is trained on, making the test predictions
# meaningless.
test['Phrase'] = tokenizer.texts_to_sequences(test['Phrase'])

In [None]:
# Fixed sequence length fed to the network; the Embedding layer is built with the
# same value as its input_length.
max_length = 100

# Pad or truncate every encoded phrase to max_length so both sets become
# rectangular arrays. (The earlier placeholder assignments of the raw columns
# to train_copy / test_copy were overwritten immediately and have been dropped.)
train_copy = sequence.pad_sequences(train_df['Phrase'], maxlen=max_length)
test_copy = sequence.pad_sequences(test['Phrase'], maxlen=max_length)

In [None]:
# Shapes of the padded train and test arrays, side by side.
print(f"{train_copy.shape} {test_copy.shape}")

In [None]:
# Model inputs are the padded id sequences; targets are the Sentiment labels
# expanded into one indicator column per distinct value.
X = train_copy
y = pd.get_dummies(data=train_df['Sentiment'])

In [None]:
# Hold out 30% of the training data for validation.
# random_state is fixed so the split (and therefore every reported validation
# metric) is reproducible across Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=42)

In [None]:
# Feature / target shapes for the training split, then for the validation split.
for features, targets in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, targets.shape)

In [None]:
# Size of the dense vector learned for each word id.
embedding_vector_length = 32

# Embedding -> Conv1D -> MaxPool1D -> LSTM -> softmax classifier.
model = Sequential([
    # 5000 matches the num_words cap used when the phrases were tokenized.
    Embedding(5000,
              embedding_vector_length,
              input_length=max_length),
    # Local n-gram style features over windows of 3 tokens.
    Conv1D(filters=32,
           kernel_size=3,
           padding='same',
           activation='relu'),
    # Downsample the sequence by pool_size=2 before the recurrent layer.
    MaxPool1D(pool_size=2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per one-hot target column.
    Dense(5, activation="softmax"),
])

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


In [None]:
# Stop training once validation accuracy has not improved by at least
# min_delta for `patience` consecutive epochs.
# The model is compiled with metrics=["accuracy"], which Keras (2.3+ / tf.keras 2.x)
# logs as "val_accuracy". The old "val_acc" key is not present in the logs there,
# so the callback only warned and early stopping never triggered.
early = EarlyStopping(monitor="val_accuracy",
                      min_delta=0.001,
                      patience=2,
                      mode='auto')
callback = [early]

In [None]:
# Train for up to 15 epochs, validating on the held-out split after each epoch.
# The batch size is stated explicitly and steps_per_epoch is no longer set:
# with in-memory arrays, steps_per_epoch=125 combined with the default batch
# size of 32 ended every "epoch" after 125 * 32 = 4,000 samples, so the model
# never completed a pass over the training split and early stopping was
# judging tiny partial epochs. Each epoch is now one full pass.
history = model.fit(x=X_train,
                    y=y_train,
                    batch_size=32,
                    epochs=15,
                    validation_data=(X_val, y_val),
                    callbacks=callback)

In [None]:
# Start the submission frame with the PhraseId column copied from the test set.
submission = pd.DataFrame({'PhraseId': test['PhraseId']})

In [None]:
# Per-class scores for every padded test phrase (one row per phrase).
prediction = model.predict(test_copy)
# Predicted label = index of the highest score in each row, computed in one
# vectorised call instead of a Python loop over rows.
final_prediction = np.argmax(prediction, axis=1)

In [None]:
# Attach the predicted labels as the Sentiment column and preview the result.
submission.loc[:, 'Sentiment'] = final_prediction
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
# NOTE(review): the relative path assumes the notebook runs from a sibling of
# "working" (e.g. /kaggle/working itself) — confirm if run elsewhere.
submission.to_csv(path_or_buf='../working/submission.csv', index=False)