## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from collections import Counter
import tqdm as tqdm
from keras.layers import LSTM,Bidirectional,Flatten,Conv1D,Dense,Dropout,Embedding,MaxPooling1D
from keras.models import Sequential

# Reading Data 

In [None]:
# Load the train / validation / test splits of the IMDB sentiment dataset.
df_train = pd.read_csv(
    '../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv'
)
df_val = pd.read_csv(
    '../input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv'
)
df_test = pd.read_csv(
    '../input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv'
)

# Cell output: the (rows, columns) shape of each split, in train/val/test order.
tuple(split.shape for split in (df_train, df_val, df_test))

In [None]:
# Preview the first few rows of the training frame to check its columns.
df_train.head()

In [None]:
# Pull the raw review text (X_*) and its label (Y_*) out of each split
# as plain arrays, one (text, label) pair of arrays per split.
X_train, Y_train = df_train['text'].values, df_train['label'].values
X_val, Y_val = df_val['text'].values, df_val['label'].values
X_test, Y_test = df_test['text'].values, df_test['label'].values

# Understanding Data

In [None]:
# Number of whitespace-separated tokens in every review, per split.
length_of_individual_review_train = [
    len(review.split()) for review in X_train
]
length_of_individual_review_val = [
    len(review.split()) for review in X_val
]
length_of_individual_review_test = [
    len(review.split()) for review in X_test
]

## Histogram of Length of Reviews 

In [None]:
# One histogram of review lengths (in words) per split, stacked vertically.
# figsize is passed to plt.subplots so it applies to the figure that holds the
# histograms; a separate plt.figure(figsize=...) call would only open a
# second, empty figure and leave the histograms at the default size.
fig, axs = plt.subplots(3, figsize=(10, 10))

axs[0].hist(length_of_individual_review_train)
axs[0].set_title('Train')

axs[1].hist(length_of_individual_review_val)
axs[1].set_title('Validation')

axs[2].hist(length_of_individual_review_test)
axs[2].set_title('Test')

# Label every panel so the figure can be read on its own.
for ax in axs:
    ax.set_xlabel('Review length (words)')
    ax.set_ylabel('Number of reviews')

fig.tight_layout()
plt.show()

## Length of reviews in Training Data

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the training review lengths.
df_review = pd.DataFrame({'train': length_of_individual_review_train})
df_review.describe()

## Length of Reviews in Validation and Test Data

In [None]:
# Summary statistics of the validation and test review lengths, side by side.
# Both columns share one frame, so the two splits must have the same number of rows.
df_reviews = pd.DataFrame(
    {
        'val': length_of_individual_review_val,
        'test': length_of_individual_review_test,
    }
)
df_reviews.describe()

In [None]:
# Count how often each whitespace-separated token occurs across all training reviews.
temp = ' '.join(X_train)
words_count = Counter(temp.split())

# Keep only the frequencies, most frequent first.
words_count = sorted(words_count.values(), reverse=True)

# Show only the head of the list: it has one entry per distinct token, so
# displaying it in full floods the notebook output.
words_count[:20]

## Looking at all distinct Characters present in reviews

Apart from the alphanumeric characters, we can pass all the remaining characters as the `filters` argument of our Tokenizer object.

In [None]:
# Lower-case the concatenated training reviews and print every distinct
# character once, in sorted order, on a single line.
temp = ' '.join(X_train).lower()
print(''.join(sorted(set(temp))), end='')

# Tokenizing

In [None]:
# Hyper-parameters shared by the tokenizer, padding and model cells below.
vocab_size=15000  # num_words of the Tokenizer and first argument (input size) of the Embedding layer
embedding_dimension=32  # second argument (output size) of the Embedding layer
max_length=120  # maxlen used when padding/truncating sequences; also the Embedding input_length
# Passed as `truncating=` to pad_sequences.
# NOTE(review): the name looks like a typo of "trunc"; it is left as-is because later cells reference it.
turnc='post'
oov_tok='<OOV>'  # passed as oov_token to the Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the tokenizer from the training reviews only.
# `filters` lists the characters to strip from the text before tokenizing.
# The backslash is written as `\\` because `\]` is an invalid escape sequence
# in a normal string literal (Python warns about it); the resulting string is unchanged.
# NOTE(review): unlike the Keras default filters, this list contains no tab or
# newline character -- confirm that is intended.
tokenizer = Tokenizer(
    filters='''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~¡¢£¤¦§¨«­®°³´·º»½¾¿ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþğıōżאגויכלמןר–‘’“”…″₤★、''',
    num_words=vocab_size,
    oov_token=oov_tok,
)

tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping produced by the fit above.
word_index = tokenizer.word_index

# Encode the training reviews as integer sequences ...
X_train_sequences = tokenizer.texts_to_sequences(X_train)

# ... and pad/truncate each one at the end so all have length max_length.
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=max_length,
    padding='post',
    truncating=turnc,
)

In [None]:
# Encode the test reviews with the tokenizer fitted on the training data,
# then pad/truncate every sequence to max_length with the training settings.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_sequences, maxlen=max_length, padding='post', truncating=turnc
)

In [None]:
# Encode the validation reviews with the tokenizer fitted on the training data,
# then pad/truncate every sequence to max_length with the training settings.
X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(
    X_val_sequences, maxlen=max_length, padding='post', truncating=turnc
)

In [None]:
# Binary sentiment classifier, assembled layer by layer:
# embedding -> dropout -> bidirectional LSTM -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dimension, input_length=max_length))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(120, return_sequences=False)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 4 epochs, evaluating on the validation split after each epoch;
# the returned History object is kept for the learning-curve plots.
history = model.fit(
    x=X_train_padded,
    y=Y_train,
    epochs=4,
    validation_data=(X_val_padded, Y_val),
)

In [None]:
# Loss and accuracy of the trained model on the test split.
model.evaluate(x=X_test_padded, y=Y_test)

In [None]:
# Inspect the first encoded and padded test review.
X_test_padded[0]

In [None]:
# Training vs. validation accuracy per epoch. The curves are labelled and the
# axes titled so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='validation')
ax.set_title('Accuracy per epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch. The curves are labelled and the
# axes titled so the figure can be read without looking at the code.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set_title('Loss per epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Binary cross-entropy loss')
ax.legend()
plt.show()

## Predicting from Model

In [None]:
# Classify a hand-written positive review with the trained model.
test_case_1 = ['lovely movie']

# Encode with the tokenizer exactly as it was fitted on the training data.
# Calling fit_on_texts again here would update its word counts and index
# after the model has already been trained on the old mapping.
test_case_1_sequences = tokenizer.texts_to_sequences(test_case_1)

# Pad the same way as the training inputs (post-padding / post-truncation to
# max_length); pad_sequences defaults to 'pre' for both.
test_case_1_padded = pad_sequences(
    test_case_1_sequences,
    maxlen=max_length,
    padding='post',
    truncating=turnc,
)

# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output, thresholding the predicted probability at 0.5 gives the same
# class label (1 = positive, 0 = negative).
(model.predict(test_case_1_padded) > 0.5).astype('int32')

In [None]:
# Classify a hand-written negative review with the trained model.
test_case_2 = ['boring movie']

# Encode with the tokenizer exactly as it was fitted on the training data.
# Calling fit_on_texts again here would update its word counts and index
# after the model has already been trained on the old mapping.
test_case_2_sequences = tokenizer.texts_to_sequences(test_case_2)

# Pad the same way as the training inputs (post-padding / post-truncation to
# max_length); pad_sequences defaults to 'pre' for both.
test_case_2_padded = pad_sequences(
    test_case_2_sequences,
    maxlen=max_length,
    padding='post',
    truncating=turnc,
)

# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output, thresholding the predicted probability at 0.5 gives the same
# class label (1 = positive, 0 = negative).
(model.predict(test_case_2_padded) > 0.5).astype('int32')

Thank you 