In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

import nltk
import os
import gc
import textblob
import keras.backend as K
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras import initializers, regularizers, constraints
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import *
import matplotlib.pyplot as plt
import warnings
# Silence all library warnings so cell output stays readable.
warnings.filterwarnings("ignore")
# Never truncate cell text when displaying DataFrames (phrases are shown in full).
# `None` is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Loading Data

In [None]:
# Competition data directory; every input file below is resolved against it so
# that changing DATA_DIR relocates all three reads at once.
DATA_DIR = '../input/movie-review-sentiment-analysis-kernels-only'
train_file = os.path.join(DATA_DIR, 'train.tsv.zip')
test_file  = os.path.join(DATA_DIR, 'test.tsv.zip')
sub_file   = os.path.join(DATA_DIR, 'sampleSubmission.csv')

# The zipped TSV files are handed straight to pandas.
df_train = pd.read_table(train_file)
df_test  = pd.read_table(test_file)
# Submission template; its 'Sentiment' column is overwritten with predictions later.
sub = pd.read_csv(sub_file)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

# 2. Preprocessing Data

In [None]:
# Negation handling: rewrite the contraction suffix "n't" as the word "not"
# in both frames (plain substring replacement, no regex involved).
# NOTE(review): on un-tokenized text "don't" would become "donot"; presumably
# the phrases are pre-tokenized as "do n't" -- confirm against the data.
df_train['Phrase'] = df_train['Phrase'].str.replace("n't", 'not', regex=False)
df_test['Phrase'] = df_test['Phrase'].str.replace("n't", 'not', regex=False)

In [None]:
# Replacing numbers: every run of ASCII digits collapses to a single '0',
# so all numeric tokens share one vocabulary entry.
_digit_run = re.compile(r'[0-9]+')


def _mask_numbers(phrase):
    """Return `phrase` with each run of ASCII digits replaced by '0'."""
    return _digit_run.sub('0', phrase)


df_train['Phrase'] = df_train['Phrase'].apply(_mask_numbers)
df_test['Phrase'] = df_test['Phrase'].apply(_mask_numbers)

In [None]:
%%time
# Fix NumPy's global RNG so the split and the random embedding rows repeat.
seed = 101
np.random.seed(seed)

# Labels: one-hot matrix for training, plus the number of distinct classes.
sentiment = df_train['Sentiment']
num_classes = sentiment.nunique()
y = to_categorical(sentiment)

# Inputs: raw phrases for the labelled data (X) and for the competition
# test set (temp).
X, temp = df_train['Phrase'], df_test['Phrase']
print("Number of classes:", num_classes)


In [None]:
# Split the labelled data: 80% for training, 20% held out.
# The held-out part (X_test / y_test) is what `fit` later uses as validation
# data; the competition test set lives in `temp`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
%%time
# Tokenize Text
max_features = 15000  # vocabulary cap handed to the Tokenizer
max_words = 50        # every sequence is padded/truncated to this length

# The vocabulary is learned from the training phrases only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))


def _encode(texts):
    """Map texts to integer sequences and pad them to `max_words` columns."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_words)


# NOTE: the raw-text variables are overwritten with their encoded form, so this
# cell must not be re-run without re-running the split cell first.
X_train = _encode(X_train)
X_test = _encode(X_test)
temp = _encode(temp)
print(X_train.shape, X_test.shape)


In [None]:
%%time
# Construct the Model
# Training hyper-parameters: mini-batch size (also used at prediction time)
# and the number of passes over the training data.
batch_size, epochs = 128, 10
def get_model(max_features, embed_dim, embedding_matrix):
    """Build and compile the bidirectional-LSTM sentiment classifier.

    Architecture: Embedding -> Bidirectional(LSTM(100)) -> Dense(50, relu)
    -> Dropout(0.1) -> Dense(num_classes, softmax), compiled with categorical
    cross-entropy and the Adam optimizer.

    Args:
        max_features: Number of rows in the embedding table (input vocabulary
            size of the Embedding layer).
        embed_dim: Width of each embedding vector.
        embedding_matrix: Initial weights for the Embedding layer; presumably
            of shape (max_features, embed_dim) -- verify against the caller.

    Returns:
        The compiled ``Sequential`` model.

    Note:
        Reads the module-level ``seed``, ``X_train`` (for the input sequence
        length) and ``num_classes``. It also reseeds NumPy and clears the
        Keras session, so calling it discards any previously built graph.
    """
    np.random.seed(seed)
    K.clear_session()
    model = Sequential()
    # The supplied matrix only initialises the layer: `trainable=False` is not
    # set, so the `trainable` flag is left at the layer default.
    model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1], weights=[embedding_matrix]))
    model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(num_classes, activation='softmax'))
    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    model.summary()
    return model


In [None]:
%%time
# Transfer Learning Using GLOVE Embeddings

def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: The token the vector belongs to.
        *arr: The vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return (word, vector)
    
def get_embed_mat(EMBEDDING_FILE, max_features=20000):
    """Build the initial embedding matrix from a pre-trained vector file.

    Each line of the file is split on single spaces into a word followed by
    its vector components. Words in the tokenizer vocabulary that appear in
    the file get their pre-trained vector; every other row keeps a random
    draw from a normal distribution with the mean and standard deviation of
    all loaded vectors.

    Args:
        EMBEDDING_FILE: Path to the UTF-8 text file of word vectors.
        max_features: Upper bound on the number of rows in the matrix; words
            whose tokenizer index is >= this value are skipped.

    Returns:
        A ``(num_rows, embedding_matrix)`` tuple, where ``num_rows`` is
        ``embedding_matrix.shape[0]``.

    Note:
        Reads the module-level ``tokenizer`` (for ``word_index``) and
        ``embed_dim`` (matrix width), and prints the number of vectors loaded.
    """
    # word vectors -- the context manager closes the file once parsing is done
    with open(EMBEDDING_FILE, encoding='utf8') as embedding_fh:
        embeddings_index = dict(
            get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_fh
        )
    print('Found %s word vectors.' % len(embeddings_index))

    # embedding matrix
    word_index = tokenizer.word_index
    # +1 because the loop below writes rows at the tokenizer's own indices,
    # so row len(word_index) must exist when the vocabulary is below the cap
    num_words = min(max_features, len(word_index) + 1)
    # np.stack needs a real sequence; a dict view is rejected by modern NumPy
    all_embs = np.stack(list(embeddings_index.values()))  # for random init
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (num_words, embed_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix.shape[0], embedding_matrix
    
# embedding matrix
# Alternative pre-trained file (embed_dim must match the vectors in the file):
#EMBEDDING_FILE = "../input/globalvectorsforwordrepresentation/glove.6B.100d.txt"
EMBEDDING_FILE = "../input/globalvectorsforwordrepresentation/glove.6B.200d.txt"
embed_dim = 200 #word vector dim
# Pass the tokenizer's vocabulary cap explicitly: the tokenizer was built with
# num_words=max_features, so relying on the function default (20000) would
# allocate embedding rows for word indices that are never looked up.
max_features, embedding_matrix = get_embed_mat(EMBEDDING_FILE, max_features)

In [None]:
%%time
# Train the model: build it from the embedding matrix, then fit on the
# training split while reporting metrics on the held-out split each epoch.
model = get_model(max_features, embed_dim, embedding_matrix)
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    verbose=2,
)


In [None]:
# Submission
# `Sequential.predict_classes` is gone from current Keras/TensorFlow; taking
# the argmax over the softmax outputs yields the same class ids and works on
# old and new versions alike.
class_probabilities = model.predict(temp, batch_size=batch_size, verbose=0)
sub['Sentiment'] = np.argmax(class_probabilities, axis=-1)
sub.to_csv("submission.csv", index=False)
sub.head()
