# IMDB Review Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the IMDB reviews CSV into a DataFrame and preview the first rows.
# Later cells read its 'review' and 'sentiment' columns.
data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
data.head()

In [None]:
# Count how many reviews carry each sentiment label (class-balance check).
data['sentiment'].value_counts()

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

# Data Cleaning

In [None]:
import nltk
# Fetch the NLTK resources referenced by the following cells: the stopword
# lists and the WordNet corpus (needed by WordNetLemmatizer).
nltk.download("stopwords")
nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NOTE(review): `lm` is instantiated here but none of the visible cells call
# it, so no lemmatization is actually applied -- confirm whether that is
# intended.
lm = WordNetLemmatizer()


In [None]:
# Normalize all review text to lower case before cleaning and tokenizing.
data['review'] = data['review'].str.lower()

In [None]:
#data['review'].replace("[^a-zA-Z]"," ",regex=True,inplace=True)

In [None]:
# Strip noise from the review text, one regex at a time. Every match is
# replaced by a single space so neighbouring words do not get glued together.
#
# The result is assigned back to data['review'] instead of calling
# replace(..., inplace=True) on the selected column: that form mutates a
# temporary object, which pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is active.
#
# Patterns are raw strings so that \S and \w reach the regex engine verbatim
# instead of being treated as (invalid) Python string escapes.
cleaning_patterns = [
    r'https?://\S+|www\.\S+',  # URLs
    r'<.*?>',                  # HTML tags such as <br />
    r'@\w+',                   # @mentions
    r'#\w+',                   # hashtags
    r'[^\w\s\d]',              # punctuation and other symbols
    r'[^a-zA-Z]',              # anything that is not an ASCII letter
    r'( +)',                   # collapse runs of spaces; kept last because
                               # the substitutions above introduce new spaces
]
for pattern in cleaning_patterns:
    data['review'] = data['review'].replace(pattern, " ", regex=True)

In [None]:
#train = data['review'][:40000]
#test = data['review'][40000:]

# Data Processing

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Drop English stopwords from every review and rebuild each review as a
# single space-joined string.
#
# The stopword list is materialized once as a set: calling
# stopwords.words("english") inside the inner loop re-evaluates it for every
# word of every review, and testing membership against a list is a linear
# scan each time.
#
# NOTE(review): despite the name `data_lem`, no lemmatization is performed
# here; only lower-casing and stopword filtering.
english_stopwords = set(stopwords.words("english"))
data_lem = [
    " ".join(
        word
        for word in (token.lower() for token in review.split())
        if word not in english_stopwords
    )
    for review in data['review']
]

In [None]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the input
# dimension of the Embedding layer.
voc_size = 90000

In [None]:
from keras.preprocessing.text import Tokenizer
# Fit a word-level tokenizer on the stopword-filtered reviews. num_words caps
# the ids used when converting text; '<OOV>' is the out-of-vocabulary token.
t = Tokenizer(num_words=voc_size,oov_token='<OOV>')
t.fit_on_texts(data_lem)
# word_index is the word -> integer id mapping learned during fitting;
# total_vocab is the number of entries in it.
word_index=t.word_index
total_vocab=len(word_index)

In [None]:
# Display the number of entries in the tokenizer's word index.
total_vocab

In [None]:
# Convert every filtered review into a list of integer word ids. Despite the
# name, `train` covers the whole dataset; the train/test split happens later.
train = t.texts_to_sequences(data_lem)

In [None]:
# Fixed sequence length fed to the network (also the Embedding input_length).
set_length = 700
# Bring every id sequence to exactly set_length entries; padding='pre' puts
# the padding in front of shorter sequences.
embedded_docs_train = pad_sequences(train,padding='pre',maxlen =set_length)

In [None]:
from tensorflow.keras.layers import Flatten,Dropout
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Bidirectional

In [None]:
import keras
# NOTE(review): this Adam instance (learning_rate=0.01) is not referenced by
# the visible model.compile call, which passes the string 'adam' instead --
# confirm which optimizer configuration is intended.
opt = keras.optimizers.Adam(learning_rate=0.01)

# LSTM Model 

In [None]:
import keras
# Length of each word-embedding vector.
vector_feature = 200
# Binary sentiment classifier:
# Embedding -> Dropout -> LSTM (per-timestep outputs) -> Flatten -> sigmoid.
model = Sequential()
model.add(Embedding(voc_size,vector_feature,input_length=set_length))
# A layer only becomes part of a Sequential model through model.add(); a bare
# Dropout(0.20) expression builds a layer object and discards it.
model.add(Dropout(0.20))
# return_sequences=True keeps the output of every timestep, which Flatten
# then concatenates into one feature vector for the final Dense unit.
model.add(LSTM(64,return_sequences=True))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
from sklearn import preprocessing
# NOTE(review): label_encoder is created but not used in the visible cells;
# the labels come from get_dummies below.
label_encoder = preprocessing.LabelEncoder()
# One-hot encode the sentiment column and drop the first category, leaving a
# single indicator column (read as 'positive' by the following cells).
labels = pd.get_dummies(data['sentiment'],drop_first=True)


In [None]:
# Cast the 'positive' indicator column to integers for use as the target.
labels['positive'] = labels['positive'].astype(int)

In [None]:
# Model inputs are the padded id sequences; targets are the integer
# 'positive' indicator column.
x_final, y_final = embedded_docs_train, labels['positive']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for evaluation.
# random_state pins the shuffle so reruns produce the same split (and thus
# comparable metrics); stratify keeps the positive/negative ratio equal in
# both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    test_size=0.33,
    random_state=42,
    stratify=y_final,
)

In [None]:
# Display the shape of the held-out label vector.
y_test.shape

In [None]:
# Train for 4 epochs in batches of 64 and keep the per-epoch metrics in
# `history`. Note that the held-out split doubles as validation data here and
# is the same split scored in the results cells below.
history = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=4,batch_size=64,verbose=1)

# Results

In [None]:
import matplotlib.pyplot as plt

# Draw the training and validation loss curves, one point per epoch.
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:

import matplotlib.pyplot as plt

# Draw the training and validation accuracy curves, one point per epoch.
for history_key, curve_label in (
    ('accuracy', 'train accuracy'),
    ('val_accuracy', 'val accuracy'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Run the trained model on the held-out inputs and show how many predictions
# were returned.
predict = model.predict(X_test)
len(predict)

In [None]:
# Inspect the model output for the first held-out review.
predict[0]

In [None]:
# Turn each model output into a hard label: 1 when the score reaches 0.5,
# otherwise 0.
sent = [1 if score >= 0.5 else 0 for score in predict]


In [None]:
# Side-by-side table of thresholded predictions and true labels.
result = pd.DataFrame({
    'predict': sent,
    'actual': np.array(y_test),
})

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true held-out labels against the thresholded
# predictions.
confusion_matrix(y_true=np.array(y_test), y_pred=sent)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out reviews whose thresholded prediction matches the label.
accuracy_score(y_true=np.array(y_test), y_pred=sent)