In [1]:
import pandas as pd
import re

In [2]:
def load_training_data(path='train.tsv'):
    """Load the labelled training phrases from a tab-separated file.

    Args:
        path: Location of the TSV file. It must contain a ``Phrase`` column
            and a ``Sentiment`` column. Defaults to ``'train.tsv'`` in the
            current working directory, which is what the notebook uses.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays taken from the ``Phrase`` and
        ``Sentiment`` columns respectively, in file order.
    """
    data_df = pd.read_csv(path, sep='\t')
    x = data_df['Phrase'].values
    y = data_df['Sentiment'].values
    # Report the row count so the cell output doubles as a quick sanity check.
    print('training data\'s len:', x.shape[0])
    return x, y

In [3]:
def load_testing_data(path='test.tsv'):
    """Load the unlabelled test phrases from a tab-separated file.

    Args:
        path: Location of the TSV file. It must contain a ``Phrase`` column.
            Defaults to ``'test.tsv'`` in the current working directory,
            which is what the notebook uses.

    Returns:
        A NumPy array with the values of the ``Phrase`` column, in file order.
    """
    data_df = pd.read_csv(path, sep='\t')
    x = data_df['Phrase'].values
    # Report the row count so the cell output doubles as a quick sanity check.
    print('testing data\'s len:', x.shape[0])
    return x

In [4]:
# Read the training file into two parallel arrays: phrases and their sentiment labels.
x_train, y_train = load_training_data()

training data's len: 156060


In [5]:
# Read the test file; only the phrases are returned (no labels).
x_test = load_testing_data()

testing data's len: 66292


In [6]:
# Peek at the first five training phrases.
print(x_train[:5])

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'
 'A series of escapades demonstrating the adage that what is good for the goose'
 'A series' 'A' 'series']


In [7]:
# Sentiment labels of the first five training rows.
print(y_train[:5])

[1 2 2 2 2]


In [8]:
# Peek at the first five test phrases.
print(x_test[:5])

['An intermittently pleasing but mostly routine effort .'
 'An intermittently pleasing but mostly routine effort' 'An'
 'intermittently pleasing but mostly routine effort'
 'intermittently pleasing but mostly routine']


In [9]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [10]:
# Keras text tokenizer created with its default settings (no arguments overridden).
tokenizer = Tokenizer()

In [11]:
# Build the vocabulary from the training and the test phrases together, so
# every word seen at prediction time has an index.
tokenizer.fit_on_texts([*x_train, *x_test])

In [12]:
# Map every training phrase to its list of word indices.
x_train_seqs = tokenizer.texts_to_sequences(x_train)

In [13]:
# First five phrases as integer index sequences (lengths vary per phrase).
print(x_train_seqs[:5])

[[2, 315, 3, 16573, 7660, 1, 8313, 9, 53, 8, 47, 13, 1, 3940, 8, 187, 47, 13, 1, 13024, 61, 3, 89, 592, 12156, 19, 617, 3, 89, 2810, 5, 52, 3, 2, 42], [2, 315, 3, 16573, 7660, 1, 8313, 9, 53, 8, 47, 13, 1, 3940], [2, 315], [2], [315]]


In [14]:
# Word -> integer index mapping produced by the fitted tokenizer.
word2idx = tokenizer.word_index

In [15]:
from keras.preprocessing.sequence import pad_sequences

In [17]:
# Turn the variable-length sequences into one 2-D array; as the printed sample
# shows, shorter sequences are filled with zeros on the left.
x_train_paded = pad_sequences(x_train_seqs)

In [18]:
# (number of phrases, padded sequence length)
print(x_train_paded.shape)

(156060, 49)


In [22]:
# First five padded rows.
print(x_train_paded[:5])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     2   315     3 16573  7660     1  8313     9    53     8
     47    13     1  3940     8   187    47    13     1 13024    61     3
     89   592 12156    19   617     3    89  2810     5    52     3     2
     42]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     2
    315     3 16573  7660     1  8313     9    53     8    47    13     1
   3940]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     2
    315]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0    

In [19]:
from keras.utils import to_categorical

In [20]:
# One-hot encode the integer sentiment labels for use with categorical cross-entropy.
y_train_onehot = to_categorical(y_train)

In [21]:
# (number of phrases, number of one-hot columns)
print(y_train_onehot.shape)

(156060, 5)


In [23]:
# First five one-hot encoded labels.
print(y_train_onehot[:5])

[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [24]:
import numpy as np

In [25]:
def shuffle(x, y, seed=None):
    """Return ``x`` and ``y`` reordered by one shared random permutation.

    Both arrays are indexed with the same permutation of their first axis, so
    row ``i`` of the returned ``x`` still belongs to row ``i`` of the returned
    ``y``. The inputs themselves are not modified.

    Args:
        x: Array whose first axis enumerates the samples.
        y: Array with the same number of rows as ``x``.
        seed: Optional seed for a reproducible permutation. When ``None``
            (the default) NumPy's global random state is used, exactly as
            before this parameter existed.

    Returns:
        A tuple ``(x_shuffled, y_shuffled)``.

    Raises:
        ValueError: If ``x`` and ``y`` have a different number of rows.
    """
    n_samples = x.shape[0]
    # Without this check a longer ``y`` would be silently truncated and the
    # feature/label pairing could no longer be trusted.
    if y.shape[0] != n_samples:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (n_samples, y.shape[0]))
    if seed is None:
        indices = np.random.permutation(n_samples)
    else:
        # A private generator keeps the global random state untouched.
        indices = np.random.RandomState(seed).permutation(n_samples)
    return x[indices], y[indices]

In [26]:
# Randomise the sample order while keeping inputs and labels paired.
x_train_shuffled, y_train_shuffled = shuffle(x_train_paded, y_train_onehot)

In [27]:
# First five padded rows after shuffling.
print(x_train_shuffled[:5])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 1293  364    4 5419]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0 3429 9076   58  391    4 5730  952    5  125  154 1384
     7    1  498    4  198   45 1827]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0  829   90 9404    5 1702]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 

In [28]:
# One-hot labels belonging to the first five shuffled rows.
print(y_train_shuffled[:5])

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]]


In [29]:
from gensim.models import KeyedVectors

In [54]:
# Load pretrained word vectors stored in word2vec text format.
# NOTE(review): the "6B.100d" in the file name suggests GloVe vectors that were
# converted to word2vec format — confirm the provenance of this file.
wv = KeyedVectors.load_word2vec_format('word2vec.6B.100d.txt')

In [35]:
# One 100-dimensional row per vocabulary word, plus one extra row so that
# index 0 (the value used for padding in the padded arrays) has a row too.
# Rows start as zeros.
embeddings = np.zeros((len(word2idx) + 1, 100))

In [64]:
# Sanity check that a common word is present in the pretrained vectors.
# Membership is tested on the KeyedVectors object itself because the `.vocab`
# attribute is not available in gensim 4 and later.
'the' in wv

True

In [65]:
# Copy the pretrained vector of every vocabulary word into its row of the
# embedding matrix. Words without a pretrained vector keep their all-zero row.
# `word in wv` is used instead of `wv.vocab` so the lookup also works with
# gensim 4 and later, where `.vocab` was removed.
for word, idx in word2idx.items():
    if word in wv:
        embeddings[idx] = wv.get_vector(word)

In [66]:
# First five rows of the embedding matrix (row 0 is the all-zero padding row).
print(embeddings[:5])

[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.     

In [67]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Activation

In [68]:
# Start with an empty Sequential model; layers are appended in the following cells.
gru_model = Sequential()

In [69]:
# Embedding layer initialised from the pretrained matrix and kept frozen
# during training.
gru_model.add(
    Embedding(
        input_dim=embeddings.shape[0],
        output_dim=100,
        weights=[embeddings],
        trainable=False,
    )
)

In [70]:
# Recurrent encoder with dropout on both the inputs and the recurrent state.
gru_model.add(
    GRU(units=100, dropout=0.2, recurrent_dropout=0.2)
)
# Softmax output with one unit per one-hot label column.
gru_model.add(
    Dense(units=5, activation='softmax')
)

In [71]:
# Configure training: Adam optimiser, cross-entropy over the one-hot labels,
# accuracy reported as the metric.
gru_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [72]:
# Train on the full shuffled training set; no validation data is held out.
gru_model.fit(
    x=x_train_shuffled,
    y=y_train_shuffled,
    batch_size=256,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3f7a04a470>

In [73]:
# Map every test phrase to its list of word indices, using the same fitted tokenizer.
x_test_seqs = tokenizer.texts_to_sequences(x_test)

In [74]:
# Pad the test sequences to the same width as the training matrix. Left to its
# default, pad_sequences would use the longest *test* sequence instead, and
# because the Embedding layer is not configured to mask padding, the model
# would then see a different number of leading zero timesteps at prediction
# time than it saw during training.
# Note: test sequences longer than the training width are truncated.
x_test_paded = pad_sequences(x_test_seqs, maxlen=x_train_paded.shape[1])

In [75]:
# Predicted class = index of the highest softmax probability for each row.
# This matches what Sequential.predict_classes returns for a multi-class
# softmax output, but relies only on predict(), which is available in every
# Keras version (predict_classes was removed from newer releases).
test_pred = np.argmax(gru_model.predict(x_test_paded), axis=-1)



In [78]:
# Predicted class index for each test phrase.
print(test_pred)

[3 3 2 ... 1 1 1]


In [77]:
# Re-read the test file as a DataFrame so its PhraseId column is available
# when writing the submission file.
test_df = pd.read_csv('test.tsv', sep='\t')

In [79]:
# Attach the predictions as a new column; reshape(-1, 1) presents them as a
# single-column 2-D array for the assignment.
test_df['Sentiment'] = test_pred.reshape(-1, 1)

In [80]:
# Write the submission file: only the id and predicted label columns, with a
# header row and without the DataFrame index.
test_df.to_csv(
    'gru-word2vec.csv',
    columns=['PhraseId', 'Sentiment'],
    header=True,
    index=False,
)