In [1]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPool1D, Dense, Flatten

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def load_training_data(path='train.tsv'):
    """Load the labelled training set from a tab-separated file.

    Args:
        path: Location of the TSV file. It must contain a ``Phrase`` column
            and a ``Sentiment`` column. Defaults to ``'train.tsv'`` so that
            existing zero-argument calls keep working.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays: the ``Phrase`` values and the
        ``Sentiment`` values, in file order.
    """
    data_df = pd.read_csv(path, sep='\t')
    x = data_df['Phrase'].values
    y = data_df['Sentiment'].values
    # Print the number of rows read as a quick sanity check.
    print('training data\'s len:', x.shape[0])
    return x, y

In [3]:
def load_testing_data(path='test.tsv'):
    """Load the unlabelled test set from a tab-separated file.

    Args:
        path: Location of the TSV file. Defaults to ``'test.tsv'`` so that
            existing zero-argument calls keep working.

    Returns:
        The parsed file as a ``pandas.DataFrame`` with all of its columns.
    """
    data_df = pd.read_csv(path, sep='\t')
    # Print the number of rows read as a quick sanity check.
    print('testing data\'s len:', len(data_df))
    return data_df

In [4]:
# Load the labelled training phrases (x_train) and their sentiment labels
# (y_train) as NumPy arrays.
x_train, y_train = load_training_data()

training data's len: 156060


In [5]:
# Read the unlabelled test set. The whole frame is kept because its PhraseId
# column is needed again when the prediction file is written.
test_df = load_testing_data()
x_test = test_df['Phrase'].values

testing data's len: 66292


In [6]:
# Keras tokenizer created with its default settings; it is fitted in the
# following cell.
tokenizer = Tokenizer()

In [7]:
# Build the vocabulary from the training and test phrases together, so that
# tokens which only occur in the test set also receive an index.
# NOTE(review): this lets test-set vocabulary influence preprocessing; confirm
# that is acceptable for the intended evaluation.
tokenizer.fit_on_texts([*x_train, *x_test])

In [8]:
# Convert every training phrase into a list of integer word indices using the
# fitted tokenizer.
x_train_seqs = tokenizer.texts_to_sequences(list(x_train))

In [9]:
# word -> integer index mapping learned by the tokenizer; used further down to
# size and fill the embedding matrix.
word2idx = tokenizer.word_index

In [10]:
# Bring every sequence to a fixed length of 52 tokens. The same length is
# hard-coded as input_length on the Embedding layer and as maxlen for the test
# sequences, so all three must be changed together.
# NOTE(review): 52 is presumably the longest phrase length in tokens - confirm.
x_train_paded = pad_sequences(x_train_seqs, maxlen=52)

In [11]:
# One-hot encode the integer sentiment labels, as required by the
# categorical cross-entropy loss used when the model is compiled.
# NOTE(review): the number of classes is inferred from the labels, while the
# output layer is Dense(5); this assumes the labels span 0..4 - confirm.
y_train_onehot = to_categorical(y_train)

In [12]:
def shuffle(x, y, seed=None):
    """Shuffle two arrays along their first axis with one shared permutation.

    Args:
        x: NumPy array of samples.
        y: NumPy array of labels with the same first-axis length as ``x``.
        seed: Optional integer. When given, the permutation is drawn from a
            private ``numpy.random.RandomState(seed)`` and is therefore
            reproducible. When ``None`` (the default) NumPy's global random
            state is used, exactly as before.

    Returns:
        A tuple ``(x_shuffled, y_shuffled)`` of re-ordered copies in which
        row ``i`` of each output still belongs together.

    Raises:
        ValueError: If ``x`` and ``y`` differ in first-axis length. Without
            this check a longer ``y`` would be silently truncated.
    """
    if x.shape[0] != y.shape[0]:
        raise ValueError(
            'x and y must have the same number of rows, got %d and %d'
            % (x.shape[0], y.shape[0]))
    indices = np.arange(x.shape[0])
    # Both the np.random module and a RandomState instance expose shuffle(),
    # so one code path serves the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(indices)
    return x[indices], y[indices]

In [13]:
# Shuffle the padded inputs and the one-hot labels with the same permutation
# so that each sample stays paired with its label.
x_train_shuffled, y_train_shuffled = shuffle(x_train_paded, 
                                             y_train_onehot)

In [14]:
# Load pre-trained word vectors from a file in word2vec format.
# NOTE(review): the file name suggests 100-d GloVe 6B vectors converted to
# word2vec format - confirm the provenance.
wv = KeyedVectors.load_word2vec_format('word2vec.6B.100d.txt')

In [15]:
# Embedding matrix with one row per tokenizer index plus one extra row.
# Row 0 stays all-zero: Keras' Tokenizer assigns indices starting at 1, which
# leaves 0 free for padding.
# The width is taken from the loaded vectors instead of a hard-coded 100, so
# the matrix always matches the vector file.
embeddings = np.zeros((len(word2idx) + 1, wv.vector_size))

In [16]:
# Copy the pre-trained vector of every known word into its row of the
# embedding matrix; words without a pre-trained vector keep their row as is.
# Membership is tested with `word in wv` rather than `wv.vocab`: the `vocab`
# attribute was removed in gensim 4, while `in` works on gensim 3 and 4.
for word, idx in word2idx.items():
    if word in wv:
        embeddings[idx] = wv.get_vector(word)

In [21]:
# 1-D convolutional sentiment classifier on top of frozen pre-trained
# embeddings. Step counts in the shape comments assume Keras' default 'valid'
# padding and a convolution stride of 1.
cnn_model = Sequential([
    # (batch_size, 52) -> (batch_size, 52, embedding_dim)
    # Initialised from the pre-trained matrix and not updated during training.
    Embedding(input_dim=embeddings.shape[0],
              output_dim=embeddings.shape[1],
              weights=[embeddings],
              trainable=False,
              input_length=52),

    # (batch_size, 52, embedding_dim) -> (batch_size, 48, 64)
    # 64 filters over windows of 5 tokens: 52 - 5 + 1 = 48 steps.
    Conv1D(filters=64, kernel_size=5, activation='relu'),

    # (batch_size, 48, 64) -> (batch_size, 24, 64)
    MaxPool1D(pool_size=2, strides=2),

    # (batch_size, 24, 64) -> (batch_size, 24 * 64)
    Flatten(),

    # An optional hidden layer, Dense(25, activation='relu'), mapping
    # (batch_size, 24 * 64) -> (batch_size, 25), is left disabled.

    # (batch_size, 24 * 64) -> (batch_size, 5): one probability per class.
    Dense(units=5, activation='softmax'),
])

In [22]:
# Multi-class training setup: categorical cross-entropy against the one-hot
# labels, the 'adam' optimizer selected by name, and accuracy as the reported
# metric.
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', 
                  metrics=['accuracy'])

In [23]:
# Train on the full shuffled training set for 15 epochs in batches of 256.
# NOTE(review): no validation data is passed, so over-fitting cannot be seen
# in the per-epoch log; consider validation_split or a held-out set.
cnn_model.fit(x_train_shuffled, y_train_shuffled, batch_size=256, 
              epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4ba21fbb00>

In [24]:
# Apply the training-time preprocessing to the test phrases: same fitted
# tokenizer, same fixed sequence length of 52.
x_test_seqs = tokenizer.texts_to_sequences(list(x_test))
x_test_paded = pad_sequences(x_test_seqs, maxlen=52)

# Sequential.predict_classes was removed from recent Keras / TensorFlow
# releases. Taking the argmax over the softmax outputs yields the same class
# ids and works on old and new versions alike.
test_pred = np.argmax(cnn_model.predict(x_test_paded), axis=-1)

# test_pred already holds one label per row, so it is assigned directly
# rather than being reshaped into an (n, 1) column vector first.
test_df['Sentiment'] = test_pred

# Write only the phrase id and the predicted label, with a header row and
# without the DataFrame index.
test_df.to_csv('cnn-word2vec.csv', columns=['PhraseId', 'Sentiment'], 
               index=False, header=True)

