In [None]:
import pandas as pd
import re

In [None]:
def load_training_data(path='../input/movie-review-sentiment-analysis-kernels-only/train.tsv'):
    """Load the training phrases and their sentiment labels from a TSV file.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated training file. Defaults to the path
        this notebook originally hard-coded, so calls without arguments
        behave exactly as before.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds the values of the ``Phrase`` column and
        ``y`` the values of the ``Sentiment`` column, both as NumPy arrays.
    """
    data_df = pd.read_csv(path, sep='\t')
    x = data_df['Phrase'].values
    y = data_df['Sentiment'].values
    # Report the number of rows read as a quick sanity check.
    print('training data\'s len:', x.shape[0])
    return x, y

In [None]:
def load_testing_data(path='../input/movie-review-sentiment-analysis-kernels-only/test.tsv'):
    """Load the unlabeled test phrases from a TSV file.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated test file. Defaults to the path this
        notebook originally hard-coded, so calls without arguments behave
        exactly as before.

    Returns
    -------
    numpy.ndarray
        The values of the ``Phrase`` column.
    """
    data_df = pd.read_csv(path, sep='\t')
    x = data_df['Phrase'].values
    # Report the number of rows read as a quick sanity check.
    print('testing data\'s len:', x.shape[0])
    return x

In [None]:
# Read the training phrases (x_train) and their sentiment labels (y_train).
x_train, y_train = load_training_data()

In [None]:
# Read the unlabeled test phrases.
x_test = load_testing_data()

In [None]:
# Preview the first five training phrases.
print(x_train[:5])

In [None]:
# Preview the first five training labels.
print(y_train[:5])

In [None]:
# Preview the first five test phrases.
print(x_test[:5])

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Keras tokenizer with default settings; used below to map words to integer indices.
tokenizer = Tokenizer()

In [None]:
# Build the vocabulary from the training and test phrases together, so words
# that occur only in the test set still receive an index.
tokenizer.fit_on_texts([*x_train, *x_test])

In [None]:
# Convert every training phrase into a list of word indices.
x_train_seqs = tokenizer.texts_to_sequences(list(x_train))

In [None]:
# Preview the first five index sequences.
print(x_train_seqs[:5])

In [None]:
# Word -> integer index mapping learned by the tokenizer; used later to
# place pretrained vectors into the embedding matrix.
word2idx = tokenizer.word_index

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad the variable-length index sequences into one array, using
# pad_sequences' default settings (no explicit maxlen is given).
x_train_paded = pad_sequences(x_train_seqs)

In [None]:
# Show the shape of the padded training array.
print(x_train_paded.shape)

In [None]:
# Preview the first five padded sequences.
print(x_train_paded[:5])

In [None]:
from keras.utils import to_categorical

In [None]:
# Convert the integer sentiment labels to one-hot rows, the target format
# used with the categorical_crossentropy loss the model is compiled with.
y_train_onehot = to_categorical(y_train)

In [None]:
# Show the shape of the one-hot label array.
print(y_train_onehot.shape)

In [None]:
# Preview the first five one-hot label rows.
print(y_train_onehot[:5])

In [None]:
import numpy as np

In [None]:
def shuffle(x, y, seed=None):
    """Shuffle two arrays with the same random permutation of their first axis.

    Parameters
    ----------
    x, y : numpy.ndarray
        Arrays whose first dimensions must have the same length; row ``i`` of
        ``x`` stays paired with row ``i`` of ``y`` after shuffling.
    seed : int, optional
        When given, the permutation is drawn from a private
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (the default) the global NumPy random state is used,
        exactly as before.

    Returns
    -------
    tuple
        ``(x_shuffled, y_shuffled)`` as new arrays; the inputs are not modified.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length along the first axis.
    """
    n_samples = x.shape[0]
    # Without this check a longer y would be silently truncated and the
    # extra labels dropped.
    if y.shape[0] != n_samples:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_samples, y.shape[0]))
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    return x[indices], y[indices]

In [None]:
# Shuffle inputs and one-hot labels with the same permutation so each
# sequence keeps its label.
x_train_shuffled, y_train_shuffled = shuffle(x_train_paded, 
                                             y_train_onehot)

In [None]:
# Preview the first five shuffled input rows.
print(x_train_shuffled[:5])

In [None]:
# Preview the first five shuffled label rows.
print(y_train_shuffled[:5])

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load pretrained word vectors stored in word2vec text format.
# NOTE(review): this is a path relative to the working directory, unlike the
# data files above which are read from ../input — confirm the file location.
wv = KeyedVectors.load_word2vec_format('word2vec.6B.100d.txt')

In [None]:
# Embedding matrix with one row per tokenizer index, initialised to zeros.
# The "+ 1" leaves room for index 0, which word2idx is not expected to use
# (presumably reserved for padding — verify). The vector width is taken from
# the loaded word vectors instead of being hard-coded.
embeddings = np.zeros((len(word2idx) + 1, wv.vector_size))

In [None]:
# Sanity check that a very common word is present in the pretrained vectors.
# Membership is tested on the KeyedVectors object itself because the `.vocab`
# attribute was removed in gensim 4.
'the' in wv

In [None]:
# Copy the pretrained vector of every tokenizer word that the word-vector
# model knows; words missing from it keep their all-zero row.
# `word in wv` replaces `word in wv.vocab`, which gensim 4 no longer supports.
for word, idx in word2idx.items():
    if word in wv:
        embeddings[idx] = wv.get_vector(word)

In [None]:
# Preview the first five rows of the embedding matrix.
print(embeddings[:5])

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Activation

In [None]:
# Empty sequential model; layers are added in the following cells.
gru_model = Sequential()

In [None]:
# Embedding layer initialised from the pretrained matrix and kept frozen
# (trainable=False). Both dimensions are read from the matrix itself so the
# layer always matches the weights it is given.
gru_model.add(Embedding(embeddings.shape[0],
                        embeddings.shape[1],
                        weights=[embeddings],
                        trainable=False))

In [None]:
# Recurrent layer: a GRU with 100 units, with dropout and recurrent_dropout
# both set to 0.2.
gru_model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
# Output layer: softmax over 5 units (assumes y_train_onehot has 5 columns,
# i.e. five sentiment classes — verify against the data).
gru_model.add(Dense(5, activation='softmax'))

In [None]:
# Configure training: categorical cross-entropy on the one-hot targets,
# the Adam optimizer, and accuracy reported as a metric.
gru_model.compile(loss='categorical_crossentropy', optimizer='adam', 
                  metrics=['accuracy'])

In [None]:
# Train on the full shuffled training set for 10 epochs in batches of 256.
# NOTE(review): no validation data or validation_split is passed, so this run
# gives no held-out estimate of accuracy.
gru_model.fit(x_train_shuffled, y_train_shuffled, batch_size=256, 
              epochs=10, verbose=1)

In [None]:
# Convert the test phrases to index sequences with the same fitted tokenizer.
x_test_seqs = tokenizer.texts_to_sequences(x_test)

In [None]:
# Pad the test sequences with pad_sequences' default settings.
# NOTE(review): no maxlen is given, so the padded length is chosen
# independently of x_train_paded and may differ from the training length —
# confirm this is intended.
x_test_paded = pad_sequences(x_test_seqs)

In [None]:
# Predicted class index per test phrase: the argmax over the softmax outputs.
# This is what Sequential.predict_classes computed for multi-class outputs;
# that method has been removed from newer Keras/TensorFlow releases.
test_pred = np.argmax(gru_model.predict(x_test_paded), axis=-1)

In [None]:
# Show the predicted class indices.
print(test_pred)

In [None]:
# Re-read the test file to obtain the PhraseId column for the submission.
# The same path as in load_testing_data is used; the bare 'test.tsv' only
# resolved when a copy happened to sit in the working directory.
test_df = pd.read_csv('../input/movie-review-sentiment-analysis-kernels-only/test.tsv', sep='\t')

In [None]:
# Attach the predictions as the Sentiment column. A flat 1-D array is
# assigned, which is the natural shape for a single DataFrame column.
test_df['Sentiment'] = test_pred.reshape(-1)

In [None]:
# Write the submission file: only the PhraseId and Sentiment columns, with a
# header row and without the DataFrame index.
test_df.to_csv('gru-word2vec.csv', columns=['PhraseId', 'Sentiment'], 
               index=False, header=True)