In [1]:
# import library
import numpy as np 
import pandas as pd 
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping



Using TensorFlow backend.


In [None]:
# Read the tab-separated train / test splits of the competition data.
train, test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../input/movie-review-sentiment-analysis-kernels-only/train.tsv",
        "../input/movie-review-sentiment-analysis-kernels-only/test.tsv",
    )
)

# Dense (non-sparse) one-hot encoder fitted on the training "Sentiment" labels;
# the encoder expects a 2-D column, hence the reshape. `fit` returns the encoder.
enc = OneHotEncoder(sparse=False).fit(train["Sentiment"].values.reshape(-1, 1))

In [None]:
# Attach the training label to every test phrase that also occurs verbatim in train
# (rows without a match get NaN in "Sentiment").
# The lookup table is de-duplicated on "Phrase" first: a left merge emits one output
# row per matching right-hand row, so a phrase repeated in train would otherwise
# duplicate test rows and leave the submission with repeated PhraseIds.
known_labels = train[["Phrase", "Sentiment"]].drop_duplicates(subset="Phrase")
test = pd.merge(test, known_labels, on="Phrase", how="left")

In [None]:
# Collect the (lower-cased) vocabulary of train and test so that only the
# pretrained vectors that are actually needed get loaded later.
#
# CountVectorizer's default token_pattern only keeps alphanumeric tokens of two or
# more characters, whereas the Keras Tokenizer in this notebook is created with
# filters="" and therefore keeps every space-separated token (single letters,
# punctuation, "n't", ...). Using a whitespace-delimited pattern here keeps both
# vocabularies consistent, so those tokens can receive a pretrained vector too.
cv_1 = CountVectorizer(token_pattern=r"\S+")
cv_1.fit(train["Phrase"])

cv_2 = CountVectorizer(token_pattern=r"\S+")
cv_2.fit(test["Phrase"])

all_words = set(cv_1.vocabulary_.keys()).union(set(cv_2.vocabulary_.keys()))


In [None]:
def transform(df):
    """Add simple count features to ``df`` and lower-case its phrases.

    The frame is modified in place and the same object is returned.

    Columns written:
      * ``phrase_count`` - number of phrases sharing the row's ``SentenceId``
      * ``word_count``   - number of whitespace-separated tokens in ``Phrase``
      * ``Phrase``       - the phrase converted to lower case
    """
    def _token_total(text):
        return len(text.split())

    def _lowered(text):
        return text.lower()

    per_sentence = df.groupby("SentenceId")["Phrase"]
    df["phrase_count"] = per_sentence.transform("count")
    df["word_count"] = df["Phrase"].map(_token_total)
    df["Phrase"] = df["Phrase"].map(_lowered)
    return df

train = transform(train)
test = transform(test)

# Columns fed to the model's numeric (dense) input branch.
# Only numeric columns belong here: the raw "Phrase" text is consumed through the
# tokenised sequence input instead, and a string column cannot be converted to the
# float tensor that the dense input / BatchNormalization branch expects.
dense_features = ["phrase_count", "word_count"]
                 

In [None]:
# Number of cross-validation folds.
NUM_FOLDS = 3

# The fold is derived from SentenceId alone, so every phrase of a given sentence
# ends up in the same fold.
train["fold_id"] = train["SentenceId"] % NUM_FOLDS

# Pretrained word-vector file and the dimensionality of its vectors.
EMBEDDING_DIM = 300
EMBEDDING_FILE = "../input/fatsttext-common-crawl/crawl-300d-2M/crawl-300d-2M.vec"


def get_embedding(embedding_file=None, embedding_dim=None, vocabulary=None):
    """Load pretrained word vectors for the words in ``vocabulary``.

    Each line of the file is split on whitespace; a line is used only when it has
    exactly ``embedding_dim + 1`` fields (word followed by its coefficients) and
    the word is contained in ``vocabulary``. All other lines (blank lines, lines
    with a different field count, unknown words) are skipped.

    Parameters
    ----------
    embedding_file : str, optional
        Path of the vector file. Defaults to the module-level ``EMBEDDING_FILE``.
    embedding_dim : int, optional
        Expected number of coefficients per word. Defaults to ``EMBEDDING_DIM``.
    vocabulary : container of str, optional
        Words to keep. Defaults to the module-level ``all_words``.

    Returns
    -------
    dict
        Maps each retained word to a ``float32`` numpy array of its coefficients.
    """
    # Local import keeps the cell self-contained; io.open accepts `encoding`
    # on both Python 2 and Python 3.
    import io

    # Defaults are resolved at call time so a bare get_embedding() still uses
    # the notebook-level configuration.
    if embedding_file is None:
        embedding_file = EMBEDDING_FILE
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM
    if vocabulary is None:
        vocabulary = all_words

    all_index = {}
    # The context manager closes the file even if parsing raises; the encoding is
    # set explicitly instead of relying on the platform default.
    with io.open(embedding_file, encoding="utf-8") as f:
        for line in f:
            line_values = line.split()
            # Check the field count before indexing so blank lines cannot raise.
            if len(line_values) != embedding_dim + 1:
                continue
            word = line_values[0]
            if word in vocabulary:
                all_index[word] = np.asarray(line_values[1:], dtype="float32")
    return all_index

# word -> float32 coefficient vector, restricted to words present in `all_words`
# (reads the whole EMBEDDING_FILE line by line).
embeddings_index = get_embedding()


In [None]:
# Every phrase is padded / truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 100

# filters="" disables the Tokenizer's character filtering, so punctuation tokens
# are kept. The index is fitted on train and test phrases together.
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.concatenate([train["Phrase"].values, test["Phrase"].values]))
word_index = tokenizer.word_index

# Row i holds the pretrained vector of the word with index i; rows for words
# without a pretrained vector stay all-zero. One extra row because row 0 is
# never assigned by the loop below.
nb_words = len(word_index) + 1
embedding_mat = np.zeros((nb_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if token in embeddings_index:
        embedding_mat[row] = embeddings_index[token]

# Integer-encode the phrases and pad them to a fixed length.
seq, test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(frame["Phrase"]), maxlen=MAX_SEQUENCE_LENGTH)
    for frame in (train, test)
)

In [2]:
def build_model():
    """Create the (uncompiled) two-input classifier.

    Inputs
      * a sequence of ``MAX_SEQUENCE_LENGTH`` int32 token ids
      * a vector with one entry per name in ``dense_features``

    The token ids go through an embedding initialised from ``embedding_mat``
    (trainable), spatial dropout, masking and an LSTM; the dense vector is batch
    normalised. Both branches are concatenated and passed through two ReLU
    layers into a 5-way softmax.

    Returns
      keras ``Model`` taking ``[seq_input, dense_input]``.
    """
    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    dense_input = Input(shape=(len(dense_features),))

    # Numeric branch.
    dense_vector = BatchNormalization()(dense_input)

    # Text branch: embedding -> spatial dropout -> masking -> LSTM.
    embedded = Embedding(
        nb_words,
        EMBEDDING_DIM,
        weights=[embedding_mat],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=True,
    )(seq_input)
    embedded = SpatialDropout1D(0.3)(embedded)
    embedded = Masking()(embedded)
    phrase_vector = LSTM(80)(embedded)

    # Joint head over both branches.
    hidden = concatenate([phrase_vector, dense_vector])
    for units in (80, 40):
        hidden = Dense(units, activation="relu")(hidden)

    output = Dense(5, activation="softmax")(hidden)
    return Model(inputs=[seq_input, dense_input], outputs=output)

In [3]:
# Build one model instance outside the training loop (the loop further down builds
# a fresh model per fold). build_model() reads nb_words, embedding_mat,
# MAX_SEQUENCE_LENGTH and dense_features, so the cells defining them must run first;
# the NameError recorded under this cell indicates it was executed before they were.
model = build_model()

NameError: global name 'nb_words' is not defined

In [None]:
# Accumulator for the per-fold class probabilities on the test set.
test_preds = np.zeros((test.shape[0], 5))

for i in range(NUM_FOLDS):
    # Fold i is held out for validation; the remaining folds are used for training.
    is_val = train["fold_id"] == i
    is_train = ~is_val

    train_seq, val_seq = seq[is_train.values], seq[is_val.values]
    train_dense = train.loc[is_train, dense_features]
    val_dense = train.loc[is_val, dense_features]
    y_train = enc.transform(train.loc[is_train, "Sentiment"].values.reshape(-1, 1))
    y_val = enc.transform(train.loc[is_val, "Sentiment"].values.reshape(-1, 1))

    # A fresh model per fold.
    model = build_model()
    model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["acc"])

    # Stop once validation accuracy has not improved for 2 epochs.
    early_stopping = EarlyStopping(monitor="val_acc", patience=2, verbose=1)

    print("Training the model...")
    model.fit(
        [train_seq, train_dense],
        y_train,
        validation_data=([val_seq, val_dense], y_val),
        epochs=10,
        batch_size=512,
        shuffle=True,
        callbacks=[early_stopping],
        verbose=1,
    )

    print("Predicting...")
    fold_preds = model.predict([test_seq, test[dense_features]], batch_size=1024, verbose=1)
    test_preds += fold_preds
    print()

# Average the summed probabilities over the folds.
test_preds /= NUM_FOLDS

In [None]:
# Predicted class = index of the highest averaged probability.
test["pred"] = np.argmax(test_preds, axis=1)

# Keep the label merged in from train where there is one; otherwise use the
# model's prediction. Cast back to int afterwards (the NaNs made the column float).
test["Sentiment"] = test["Sentiment"].fillna(test["pred"]).astype(int)

test.to_csv("submission.csv", columns=["PhraseId", "Sentiment"], index=False)
