In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['gloveembeddings', 'movie-review-sentiment-analysis-kernels-only']


In [24]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM,Dense,Embedding,Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [25]:
# Load the competition files: the train/test phrase tables are tab-separated,
# the sample submission is comma-separated.
train = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv',
    sep="\t",
)
test = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv',
    sep="\t",
)
sub = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/sampleSubmission.csv',
    sep=",",
)

In [26]:
# Keep only the text column and its label; the other training columns are not used below.
df = train.loc[:, ['Phrase', 'Sentiment']]

In [27]:
# Preview the first rows of the two-column (Phrase, Sentiment) training frame.
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [28]:
# Parse the pre-trained GloVe text file into a {word: vector} lookup.
# Each line is read as "<word> <v1> <v2> ... <vN>", separated by single spaces.
embedding_values = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit UTF-8 encoding keeps decoding independent of the platform's locale
# default.
with open('../input/gloveembeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Drop the trailing newline so the last coefficient is a clean number string.
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:],dtype = 'float32')
        embedding_values[word]= coef

400000it [00:30, 13107.03it/s]


In [29]:
# Keras text tokenizer with default settings; it is fitted on the training
# phrases in a later cell and reused for the test phrases.
token  = Tokenizer()

In [30]:
# Features are the raw phrase strings; labels are the sentiment classes
# converted to a one-hot matrix by `to_categorical`.
x = df['Phrase']
y = to_categorical(df['Sentiment'])

In [31]:
# Build the tokenizer's vocabulary (token.word_index) from the training phrases only.
token.fit_on_texts(x)

In [32]:
# Convert every training phrase into a list of integer word indices.
seq = token.texts_to_sequences(x)

In [33]:
# Pad/truncate every sequence to a fixed length of 300 tokens so the data forms
# a rectangular array (Keras defaults decide the padding/truncation side).
# NOTE: this 300 is the sequence length, not the 300-d embedding size.
pad_seq = pad_sequences(seq,maxlen=300)

In [34]:
# Number of rows needed in the embedding matrix: one per known word plus one
# extra row, because word_index values are used directly as row indices below
# (presumably indices start at 1, leaving row 0 for padding -- confirm).
vocab_size = 1 + len(token.word_index)
print(vocab_size)

15289


In [35]:
# Assemble the (vocab_size, 300) weight matrix for the Embedding layer:
# row i holds the GloVe vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((vocab_size,300))
for word,i in tqdm(token.word_index.items()):
    values = embedding_values.get(word)
    if values is None:
        # Word has no pre-trained vector: its row stays all zeros.
        continue
    embedding_matrix[i] = values

100%|██████████| 15288/15288 [00:00<00:00, 201575.95it/s]


In [None]:
# Hold out 30% of the padded training sequences for validation; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    pad_seq,
    y,
    test_size=0.3,
    random_state=42,
)

In [82]:
# Start an empty layer stack; the following cells append layers to it in order.
model = Sequential()

In [83]:
# Frozen embedding layer initialised from the pre-trained matrix.
embedding_layer = Embedding(
    vocab_size,                  # one row per tokenizer index
    300,                         # embedding vector size (matches embedding_matrix width)
    input_length=300,            # padded sequence length (matches pad_sequences maxlen)
    weights=[embedding_matrix],
    trainable=False,             # keep the pre-trained vectors fixed during training
)
model.add(embedding_layer)

In [84]:
# Two stacked 75-unit CuDNN LSTM layers. The first returns the full sequence so
# the second can consume it; the second returns only its final output.
for return_seq in (True, False):
    model.add(CuDNNLSTM(75, return_sequences=return_seq))

In [85]:
# Fully connected hidden layer (128 units, ReLU) on top of the last LSTM output.
model.add(Dense(128,activation = 'relu'))

In [86]:
# Output layer: softmax over 5 units (presumably one per sentiment class in the
# one-hot labels -- confirm the label matrix has 5 columns).
model.add(Dense(5,activation='softmax'))

In [87]:
# Multi-class setup: categorical cross-entropy against the one-hot labels,
# optimised with Adam, reporting accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [88]:
# Train for 5 epochs in mini-batches of 16, evaluating on the held-out split
# after every epoch; the per-epoch metrics are kept in `history`.
# NOTE(review): the saved output below reports "Epoch x/4", so it was
# presumably produced by an earlier version of this cell -- re-run to refresh.
history = model.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=5,
    validation_data=(x_test, y_test),
)

Train on 109242 samples, validate on 46818 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [89]:
# Preview the test frame.
# NOTE(review): the saved output below already shows a filled Sentiment column,
# although that column is only created and populated in later cells, and the
# execution counts around this cell are not sequential. The output looks stale
# from out-of-order execution -- verify with Restart & Run All.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,3
1,156062,8545,An intermittently pleasing but mostly routine ...,3
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,3
4,156065,8545,intermittently pleasing but mostly routine,3


In [55]:
# Add a Sentiment column filled with empty strings as a placeholder; it is
# overwritten with the model's predictions in a later cell.
test['Sentiment'] = ''

In [56]:
# Check that the placeholder Sentiment column was added.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,
1,156062,8545,An intermittently pleasing but mostly routine ...,
2,156063,8545,An,
3,156064,8545,intermittently pleasing but mostly routine effort,
4,156065,8545,intermittently pleasing but mostly routine,


In [57]:
# Raw phrase strings of the test set (the inputs to be classified).
testing_phrase = test['Phrase']

In [58]:
# Encode the test phrases with the tokenizer that was fitted on the training
# phrases, so both sets share the same word-to-index mapping.
test_seq = token.texts_to_sequences(testing_phrase)

In [59]:
# Pad/truncate to the same fixed length (300) that was used for the training
# sequences and declared as the Embedding layer's input_length.
pad_test_seq = pad_sequences(test_seq,maxlen=300)

In [63]:
# Predict one class index (0-4) for every padded test phrase.
# `Sequential.predict_classes` is deprecated (and removed in TensorFlow 2.6+).
# Taking the argmax over the softmax probabilities is the documented
# replacement and gives the same class indices for this multi-class model.
predict = np.argmax(model.predict(pad_test_seq), axis=-1)

In [64]:
# Spot-check the predicted class index of the first test phrase.
predict[0]

3

In [65]:
# Store the predicted class indices in the test frame's Sentiment column
# (one prediction per test row, in row order).
test['Sentiment']  = predict

In [66]:
# Check that the Sentiment column now holds the predicted classes.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,3
1,156062,8545,An intermittently pleasing but mostly routine ...,3
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,3
4,156065,8545,intermittently pleasing but mostly routine,3


In [73]:
# The submission needs only the phrase identifier and its predicted class.
submission = test.loc[:, ['PhraseId', 'Sentiment']]

In [74]:
# Preview the two-column submission frame before writing it out.
submission.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [None]:
# Write the submission to the current working directory; index=False keeps the
# DataFrame's row index out of the file so it contains only the two columns.
submission.to_csv('Submission.csv',index = False)