In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf
import keras

# Location of the tab-separated training file (Kaggle input directory layout).
tsv_file = '../input/movie-review-sentiment-analysis-kernels-only/train.tsv'

# Parse the file as tab-delimited text into a DataFrame.
train_data = pd.read_csv(tsv_file, sep='\t')

In [None]:
# Display the freshly loaded training table as the cell output for a quick visual check.
train_data

In [None]:
# Convert the training table to a plain NumPy array so later cells can index
# it positionally (row, column).
# np.asarray yields the same array as `.values` for a DataFrame and is a no-op
# when `train_data` is already an ndarray, so re-running this cell is safe.
train_data = np.asarray(train_data)

In [None]:
# Collect the value at column index 3 of every training row
# (presumably the sentiment label - confirm against the TSV header).
sentiments = [row[3] for row in train_data]

In [None]:
# Preview only the first few labels; echoing the whole collection would print
# one entry per training row and drown the notebook in output.
sentiments[:10]

In [None]:
# Collect the value at column index 2 of every training row
# (presumably the phrase text - confirm against the TSV header).
features = [row[2] for row in train_data]

In [None]:
# Preview only the first few entries; echoing the whole list would print one
# entry per training row.
features[:10]

In [None]:
# Alias, not a copy: `sentences` refers to the same list object as `features`.
# The tokenizer cell below reads the text through this name.
sentences = features

In [None]:
# Every sequence is padded / cut to this many tokens before it reaches the model.
maxlen = 55

# Vocabulary cap handed to the Tokenizer: only the most frequent words
# (up to this count) are kept when texts are converted to sequences.
max_features = 16467

# NOTE(review): not referenced by the model.fit call visible in this notebook,
# which passes batch_size=1000 directly - confirm whether this is still needed.
batch_size = 32

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Build the vocabulary from the training phrases and encode each phrase as a
# list of integer word ids.
# `num_words` is the current keyword for the vocabulary cap; the old `nb_words`
# spelling is deprecated.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# One-hot encode the integer labels for use with categorical cross-entropy.
# NOTE: this rebinds `sentiments`; run this cell once per fresh label list.
sentiments = to_categorical(sentiments)

In [None]:
from numpy import asarray
import os

# Map each word in the GloVe text file to its embedding vector.
# Each line is parsed as: the word first, then its vector components,
# all separated by whitespace.
embeddings_index = {}
glove_path = os.path.join('../input/glove6b100dtxt', 'glove.6B.100d.txt')

# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps decoding independent of the platform's default locale.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

# One 100-wide row per word id, plus one extra row; every row starts as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
# Fixed positional hold-out split: rows before `split_at` are used for
# training, rows from `split_at` up to `end_at` for validation.
split_at = 134848
end_at = 156060

train_features, val_features = sequences[:split_at], sequences[split_at:end_at]
train_targets, val_targets = sentiments[:split_at], sentiments[split_at:end_at]

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Embedding, LSTM, Bidirectional, Masking

In [None]:
# Bring every training sequence to exactly `maxlen` ids, padding at the front.
print('Pad sequences (samples x time)')
padded_train = sequence.pad_sequences(train_features, padding='pre', maxlen=maxlen)
print('train_features shape:', padded_train.shape)
train_features = np.array(padded_train)

In [None]:
# Bring every validation sequence to exactly `maxlen` ids, padding at the front.
print('Pad sequences (samples x time)')
padded_val = sequence.pad_sequences(val_features, padding='pre', maxlen=maxlen)
print('val_features shape:', padded_val.shape)
val_features = np.array(padded_val)

In [None]:
# Network, in order:
#   frozen embedding initialised from `embedding_matrix` (id 0 is masked)
#   -> bidirectional LSTM -> batch norm
#   -> small sigmoid dense layer -> dropout -> batch norm
#   -> 5-way softmax output.
model = Sequential([
    Embedding(
        len(word_index) + 1,
        100,
        weights=[embedding_matrix],
        input_length=maxlen,
        mask_zero=True,
        trainable=False,
    ),
    Bidirectional(LSTM(40, recurrent_dropout=0.2)),
    BatchNormalization(),
    Dense(20, activation='sigmoid'),
    Dropout(0.15),
    BatchNormalization(),
    Dense(5, activation='softmax'),
])

In [None]:
# Configure training: RMSprop optimiser, categorical cross-entropy loss
# (targets are one-hot), and accuracy reported as a metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Show the layer-by-layer summary of the compiled model as a sanity check
# before training.
model.summary()

In [None]:
# Train for 30 epochs in batches of 1000 samples, evaluating on the held-out
# validation split after each epoch.
model.fit(
    train_features,
    train_targets,
    batch_size=1000,
    epochs=30,
    validation_data=(val_features, val_targets),
)

In [None]:
# Location of the tab-separated test file (Kaggle input directory layout).
tsv_file = '../input/movie-review-sentiment-analysis-kernels-only/test.tsv'

# Parse the file as tab-delimited text into a DataFrame.
test_data = pd.read_csv(tsv_file, sep='\t')

In [None]:
# Convert the test table to a plain NumPy array so later cells can index it
# positionally (row, column).
# np.asarray yields the same array as `.values` for a DataFrame and is a no-op
# when `test_data` is already an ndarray, so re-running this cell is safe.
test_data = np.asarray(test_data)

In [None]:
# Collect the value at column index 2 of every test row
# (presumably the phrase text - confirm against the TSV header).
features = [row[2] for row in test_data]

In [None]:
# Preview only the first few entries; echoing the whole list would print one
# entry per test row.
features[:10]

In [None]:
# Alias, not a copy: `sentences` now refers to the list built from the test
# rows, replacing the earlier binding to the training list.
sentences = features

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Encode the test phrases with the tokenizer that was fitted on the TRAINING
# phrases. The model and `embedding_matrix` are indexed by that tokenizer's
# word ids; fitting a second tokenizer on the test text would assign different
# ids to the same words and feed the network meaningless inputs.
test_tokenizer = tokenizer  # alias kept so existing references to this name still resolve
test_sequences = tokenizer.texts_to_sequences(sentences)

# Pad exactly as the training and validation sequences were padded.
test_sequences = sequence.pad_sequences(test_sequences, maxlen=maxlen, padding='pre')

In [None]:
# Model output for every test phrase: one row per sample, one score per class.
predictions = model.predict(np.array(test_sequences))

# Index of the highest score in each row, computed in one vectorised call.
# Ties resolve to the first maximum, as the previous list.index(max(...)) loop
# did. `.tolist()` keeps the result a plain list of Python ints.
class_predictions = np.argmax(predictions, axis=1).tolist()

In [None]:
# First column of every test row, as a plain Python list
# (used below as the submission's PhraseId column).
ids = [phrase_id for phrase_id in test_data[:, 0]]

In [None]:
# Assemble the submission table column by column. Building it from per-column
# data (instead of stacking both lists into one NumPy array) lets each column
# keep its own dtype rather than being coerced to a common one.
submission = pd.DataFrame(
    {'PhraseId': ids, 'Sentiment': class_predictions},
    columns=['PhraseId', 'Sentiment'],
)

In [None]:
# Name the two columns: the phrase identifier and the predicted class.
submission.columns = ['PhraseId', 'Sentiment']

In [None]:
# Display the submission table as the cell output for a final visual check.
submission

In [None]:
# Write the submission to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
submission.to_csv('Movie-Review-Sentiment-Predictions-1.csv', index=False)

In [None]:
# Preview only the first few phrases; echoing the whole list would print one
# entry per row of the dataset.
sentences[:10]

In [None]:
import allennlp

In [None]:
from allennlp.modules.elmo import Elmo, batch_to_ids

In [None]:
# Remote locations of the ELMo architecture description and pretrained weights.
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Instantiate the ELMo module with two output representations and dropout disabled.
elmo = Elmo(
    options_file,
    weight_file,
    num_output_representations=2,
    dropout=0,
)