In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split


from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Bidirectional,LSTM,Activation,Conv1D,MaxPool1D,Dropout
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils import np_utils
# Seed NumPy's global random generator with a fixed value.
np.random.seed(1)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
# Training split: a zipped, tab-separated file. Its 'Phrase' and 'Sentiment'
# columns are the ones read by later cells.
t = pd.read_csv(
    '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
    sep='\t',
)
t.head()

In [None]:
# Test split: same zipped, tab-separated layout; its 'Phrase' column is
# cleaned and encoded by later cells.
te = pd.read_csv(
    '../input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    sep='\t',
)
te.head()

In [None]:
# Summarise both splits with DataFrame.info().
# info() writes its report itself and returns None, so it is called directly:
# wrapping it in print() would append a stray "None" line to the output.
print("Train Data\n")
t.info()
print("***************************************")
print("Test Data\n")
te.info()

In [None]:
def clean_sentences(df):
    """Turn each phrase in ``df['Phrase']`` into a list of cleaned tokens.

    For every phrase: replace each non-letter character with a space,
    lower-case, tokenize with ``word_tokenize``, drop English stop words and
    lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : mapping with a 'Phrase' entry
        Anything where ``df['Phrase']`` iterates over strings (for example a
        pandas DataFrame with a 'Phrase' column).

    Returns
    -------
    list of list of str
        One token list per phrase, in input order. A phrase made up only of
        stop words or non-letters yields an empty list.
    """
    # Build the stop-word lookup once. Calling stopwords.words('english') for
    # every token rebuilds the whole list each time, and a set gives O(1)
    # membership tests instead of a list scan.
    stop_words = set(stopwords.words('english'))
    # One lemmatizer is enough for the whole run.
    lem = WordNetLemmatizer()

    reviews = []
    for sent in tqdm(df['Phrase']):
        # Replace every non-alphabetical character with a space.
        text = re.sub("[^a-zA-Z]", " ", sent)

        # Tokens are already lower-case because the text is lowered first.
        words = word_tokenize(text.lower())

        reviews.append([lem.lemmatize(word) for word in words if word not in stop_words])

    return reviews

In [None]:
%%time
train_sentences = clean_sentences(t)
test_sentences = clean_sentences(te)

print(len(train_sentences))
print(len(test_sentences))

In [None]:
# One-hot encode the 'Sentiment' column of the training frame.
sentiment_labels = t['Sentiment'].values
y_target = to_categorical(sentiment_labels)
y_target.shape

In [None]:

# Hold out 20% of the cleaned training sentences for validation, stratified
# on the one-hot labels.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
)

# Scan the training split once, collecting its distinct tokens and the length
# of its longest token list.
unique_words = set()
len_max = -1
for tokens in tqdm(X_train):
    unique_words |= set(tokens)
    len_max = max(len_max, len(tokens))

print('Words in vocab : ' , len(unique_words))
print('Max_length : ' , len_max)



In [None]:
# Settings shared by the tokenizer, the padding step and the model.
v = len(unique_words)        # number of distinct training tokens
embedding_dim = 300          # width of the embedding vectors
max_length = len_max         # pad/truncate to the longest training sentence
trunc_type = padding_type = 'post'
oov_tok = '<OOV>'            # placeholder for out-of-vocabulary tokens


In [None]:
%%time
tokenizer = Tokenizer(num_words = v,
                      # filters = '#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      oov_token = oov_tok,
                      # lower = True,
                      char_level = False)

tokenizer.fit_on_texts(list(X_train))

# Training
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,
                        maxlen = max_length,
                        padding = padding_type,
                        truncating = trunc_type)

# Validation
X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,
                      maxlen = max_length,
                      padding = padding_type,
                      truncating = trunc_type)

# Testing
X_test = tokenizer.texts_to_sequences(test_sentences)
X_test = pad_sequences(X_test,
                       maxlen = max_length,
                       padding = padding_type,
                       truncating = trunc_type)

In [None]:
# Show the shape of each padded matrix.
for label, matrix in (("X_training shape   : ", X_train),
                      ("X_validation shape : ", X_val),
                      ("X_testing shape    : ", X_test)):
    print(label, matrix.shape)

In [None]:
# Embedding -> two stacked bidirectional LSTMs -> dense ReLU layer -> softmax
# with one unit per one-hot label column.
model = Sequential([
    Embedding(v, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=False)),
    Dense(128, activation='relu'),
    Dense(y_target.shape[1], activation='softmax'),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
from keras.utils.vis_utils import plot_model

# Write a diagram of the network, including layer names and shapes, to disk.
plot_model(
    model,
    show_layer_names=True,
    show_shapes=True,
    to_file='./model_plot.png',
)

In [None]:
# Stop training once validation accuracy has not improved by at least 0.001
# for 2 consecutive epochs.
# The model is compiled with metrics=['accuracy'] and the training history is
# read through the 'accuracy' key, so the validation metric is logged as
# 'val_accuracy'. Monitoring 'val_acc' would leave the callback without a
# metric to watch, so it would never stop training.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               mode='max',
                               min_delta=0.001,
                               patience=2)
callback = [early_stopping]



In [None]:
%%time

num_epochs = 10

history = model.fit(X_train,y_train,
                    validation_data = (X_val, y_val),
                    epochs = num_epochs,
                    batch_size = 256,
                    verbose = 1,
                    callbacks = callback)



In [None]:
from matplotlib import pyplot as plt

# Training accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
# Save before show(): once show() has rendered the figure it is released, and
# a later savefig() would write an empty canvas.
plt.savefig('./accurac2y.png')
plt.show()

In [None]:
# Training loss per epoch.
plt.plot(history.history['loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# Save before show(): once show() has rendered the figure it is released, and
# a later savefig() would write an empty canvas.
plt.savefig('./loss.png')
plt.show()

In [None]:
#plt.savefig('./books_read.png')
# model.save('./new_model.h5')
# pickle is imported here because no earlier cell imports it; without this the
# dump raises NameError on a fresh kernel.
import pickle

# Persist the fitted tokenizer so it can be reloaded without refitting.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# test_id = te['PhraseId']

# y_pred = np.argmax(model.predict(X_test), axis = -1)

# submission_df = pd.DataFrame({'PhraseId': test_id, 'Sentiment': y_pred})
# submission_df.to_csv('submission_.csv', index=False)
# submission_df.head()

import pickle

# Read the pickled tokenizer back from the working directory.
with open('./tokenizer.pickle', 'rb') as pickled_tokenizer:
    tokenizer = pickle.load(pickled_tokenizer)

In [None]:
import pickle
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): max_length is fixed at 30 here, while the training cells set
# it from the data (len_max) - confirm the two agree.
max_length, trunc_type, padding_type = 30, 'post', 'post'

with open('./tokenizer.pickle', 'rb') as pickled_tokenizer:
    tokenizer = pickle.load(pickled_tokenizer)

# NOTE(review): this rebinds X_test, replacing the padded test-set matrix
# built earlier with a single encoded sentence.
sample_ids = tokenizer.texts_to_sequences(['Very good movie'])
X_test = pad_sequences(sample_ids,
                       maxlen=max_length,
                       padding=padding_type,
                       truncating=trunc_type)

# Index of the highest-scoring class for the sample sentence.
np.argmax(model.predict(X_test), axis=-1)

In [None]:
# Highest-scoring class index for each row currently held in X_test.
class_scores = model.predict(X_test)
np.argmax(class_scores, axis=-1)

In [None]:
# By this point X_train holds the padded integer matrix produced by the
# encoding cell, not the original token lists. Fitting a fresh Tokenizer on it
# cannot rebuild the word vocabulary and would replace the fitted tokenizer
# the model was trained with. Restore the tokenizer that was pickled earlier
# instead.
import pickle

with open('./tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Padding settings used by the inference cells.
# NOTE(review): 30 is hard-coded; the training cells set max_length from the
# data (len_max) - confirm the two agree.
max_length, trunc_type, padding_type = 30, 'post', 'post'

In [None]:
from keras.models import load_model
import pickle
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
import os
from keras.models import load_model

# The only model.save('./new_model.h5') call in this notebook is commented
# out, so the file may not exist. Load it when present; otherwise leave the
# current `model` binding untouched instead of failing on a missing file.
saved_model_path = './new_model.h5'
if os.path.exists(saved_model_path):
    model = load_model(saved_model_path)
else:
    print('No saved model at ' + saved_model_path + '; keeping the model already in memory.')
# with open('../input/models/tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [None]:
# Sample input passed to sentiment() in a later cell.
Sentence = 'worst movie'

In [None]:
def sentiment(Sentence, tokenizer = tokenizer, model = model):
    """Return the predicted class index for a single raw sentence.

    The sentence is converted to integer ids with ``tokenizer``, padded or
    truncated using the notebook-level ``max_length``, ``padding_type`` and
    ``trunc_type`` settings, and scored by ``model``.

    Note: the ``tokenizer`` and ``model`` defaults are bound when this cell
    runs. Rebinding those notebook variables later does not change the
    defaults unless this cell is run again.

    Parameters
    ----------
    Sentence : str
        Text to classify.
    tokenizer : object with ``texts_to_sequences``
        Defaults to the notebook's tokenizer at definition time.
    model : object with ``predict``
        Defaults to the notebook's model at definition time.

    Returns
    -------
    The index of the highest-scoring class for the sentence.
    """
    token_ids = tokenizer.texts_to_sequences([Sentence])
    padded_ids = pad_sequences(token_ids,
                               maxlen=max_length,
                               padding=padding_type,
                               truncating=trunc_type)
    class_scores = model.predict(padded_ids)
    return np.argmax(class_scores, axis=-1)[0]

In [None]:
# Classify the sample sentence with the default tokenizer and model.
sentiment(Sentence=Sentence)

In [None]:
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the model from the ../input/models directory.
model = load_model('../input/models/model.h5')

# NOTE(review): max_length is fixed at 30 - presumably the sequence length the
# loaded model was trained with; confirm.
max_length, trunc_type, padding_type = 30, 'post', 'post'

# NOTE(review): unpickling can execute arbitrary code; only load tokenizer
# files from a source you trust.
with open('../input/models/tokenizer.pickle', 'rb') as pickled_tokenizer:
    tokenizer = pickle.load(pickled_tokenizer)

# Encode one sample sentence and pad it to the fixed length.
sample_ids = tokenizer.texts_to_sequences(['Very good movie'])
X_test = pad_sequences(sample_ids,
                       maxlen=max_length,
                       padding=padding_type,
                       truncating=trunc_type)

# Index of the highest-scoring class for the sample sentence.
np.argmax(model.predict(X_test), axis=-1)

In [None]:
# NOTE(review): prints a constant; looks like a leftover scratch cell.
print(1)