In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Importing the data-handling, NLP and plotting libraries (the Keras model imports are in a later cell)
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the review CSV from the Kaggle input directory into a DataFrame.
DATASET_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
raw_data = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first five rows of the freshly loaded data.
raw_data.head(n=5)

In [None]:
# List the column labels of the loaded DataFrame.
raw_data.columns

In [None]:
# Count the non-null entries in each column.
raw_data.count(axis=0)

In [None]:
# Lookup table from the textual sentiment label to its binary class id.
sentiment = dict(positive=1, negative=0)

In [None]:
# Encode the sentiment labels as integers using the `sentiment` lookup.
# Values that are not keys of the lookup are passed through unchanged, so
# re-running this cell on already-encoded labels is a no-op instead of turning
# the whole column into NaN. The final cast to int fails loudly if any
# unexpected (non-numeric) label is left over.
raw_data['sentiment'] = (
    raw_data['sentiment']
    .map(lambda label: sentiment.get(label, label))
    .astype(int)
)

In [None]:
# Preview the first five rows after the sentiment column has been mapped.
raw_data.head(n=5)

In [None]:
# Number of rows in each sentiment class (1 and 0 respectively).
is_positive = raw_data['sentiment'] == 1
is_negative = raw_data['sentiment'] == 0
pos_sent_count = int(is_positive.sum())
neg_sent_count = int(is_negative.sum())

In [None]:
# Visualizing the class distribution: one bar per sentiment class.
plt.figure(figsize=(6,4))
plt.bar(['Positive Sentiment','Negative Sentiment'],[pos_sent_count,neg_sent_count])
plt.xlabel("Class")
# The rows are movie reviews, not tweets.
plt.ylabel("Number of Reviews")
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [None]:
# Matches <br>, <br/> and the spaced <br /> form (which the pattern "<br/?>" misses).
_BR_TAG_RE = re.compile(r"<br\s*/?>")
# Matches any single character from string.punctuation.
_PUNCTUATION_RE = re.compile("[%s]" % re.escape(string.punctuation))
# Matches any run of whitespace (spaces, newlines, tabs, ...).
_WHITESPACE_RE = re.compile(r"\s+")


def dataCleaning(text):
    """Normalise a raw review string for tokenization.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML line-break tags with a space, so the words on either
         side of the tag are not glued together;
      3. delete every punctuation character;
      4. collapse any run of whitespace (including newlines and tabs) into a
         single space and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text with single-space word separators.
    """
    text = text.lower()
    # Tags must go before punctuation stripping; otherwise "<", "/" and ">"
    # are removed and a stray "br" token is left behind.
    text = _BR_TAG_RE.sub(" ", text)
    text = _PUNCTUATION_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Clean every review in place by running dataCleaning over the 'review' column.
raw_data['review'] = raw_data['review'].apply(dataCleaning)

In [None]:
# Preview the first five rows after the reviews have been cleaned.
raw_data.head(n=5)

In [None]:
# Split each cleaned review into word tokens with NLTK's word_tokenize.
raw_data['tokenized_review'] = raw_data['review'].apply(word_tokenize)

In [None]:
# Preview the first five rows, now including the 'tokenized_review' column.
raw_data.head(n=5)

In [None]:
# Drop English stopwords from every tokenized review.
# The stopword list is loaded once and stored as a set: the original expression
# re-evaluated stopwords.words('english') for every single token and searched
# the resulting list linearly.
stop_words = set(stopwords.words('english'))
raw_data['tokenized_review'] = raw_data['tokenized_review'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

In [None]:
# Preview the first five rows after stopwords were filtered out of the tokens.
raw_data.head(n=5)

In [None]:
# Number of tokens in each review, as a plain list (one entry per row).
# Computed directly from the column instead of using DataFrame.apply purely
# for its list.append side effect, which also made the cell display a Series
# of None values.
review_length = raw_data['tokenized_review'].apply(len).tolist()

In [None]:
# Histogram of review lengths (token counts) across the dataset.
plt.figure(figsize=(8,6))
plt.hist(review_length,bins=40)
plt.xlabel("Review Length (Number of Words)")
# The rows are movie reviews, not tweets.
plt.ylabel("Number of Reviews")
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [None]:
# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so the split -- and therefore every downstream
# metric -- is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    raw_data['tokenized_review'],
    raw_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four split parts, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Keras tokenizer with its vocabulary size option (num_words) set to 20,000.
tokenizer = Tokenizer(num_words=20_000)

In [None]:
# Build the word index from the training reviews only.
tokenizer.fit_on_texts(texts=X_train)

In [None]:
# Convert the training reviews into sequences using the fitted tokenizer.
train_data = tokenizer.texts_to_sequences(texts=X_train)

In [None]:
# Show the sequence produced for the first training review.
train_data[0]

In [None]:
# Convert the test reviews with the same tokenizer, then show the first
# resulting sequence.
test_data = tokenizer.texts_to_sequences(texts=X_test)
test_data[0]

In [None]:
# Pad/truncate every sequence to a common length of max_len so the train and
# test sets can be fed to the model as fixed-width arrays.
max_len = 200
train_data, test_data = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_data, test_data)
)

In [None]:
# Convert the label Series to NumPy arrays.
# Keras' fit/evaluate expect array-like targets (NumPy arrays or tensors);
# plain Python lists of ints are not accepted by the TF2 data adapters or by
# validation_split. np.asarray is also safe to re-run on an existing array.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
from tensorflow.keras.layers import Dense,LSTM,Dropout,Input,Activation,Embedding,GlobalMaxPool1D,GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import Sequential, Model
from sklearn.metrics import roc_auc_score

In [None]:
# Creating LSTM Model
# The network is a straight pipeline, so the layers are listed in order and
# applied one after another to the (max_len,) input.
inp = Input(shape=(max_len,))
layer_stack = [
    Embedding(20000, 64),
    LSTM(64, return_sequences=True),   # keep per-timestep outputs for pooling
    GlobalMaxPool1D(),                 # max over the time axis
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),    # single sigmoid output unit
]
x = inp
for layer in layer_stack:
    x = layer(x)
model = Model(inputs=inp, outputs=x)

In [None]:
# Compile with SGD + momentum, binary cross-entropy loss, and accuracy as the
# reported metric.
model.compile(
    optimizer=SGD(learning_rate=0.005, momentum=0.9),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Defining some hyperparameters
batch_size = 32
# The training call in this notebook runs for 20 epochs; keep the declared
# value in agreement with that instead of the stale value of 5.
epochs = 20

In [None]:
# Train the model, holding out the last 20% of the training data for
# validation. batch_size and epochs come from the hyperparameter variables
# rather than a hardcoded epoch count, so the declared settings are the ones
# actually used.
history = model.fit(
    train_data,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
# Make sure the test labels are a NumPy array before evaluation.
y_test = np.asarray(y_test)

In [None]:
# Evaluate on the held-out test set; returns the loss followed by the
# compiled metric (accuracy).
loss, accuracy = model.evaluate(x=test_data, y=y_test)

In [None]:
# Report test accuracy as a percentage (2 decimals) and test loss (5 decimals).
accuracy_pct = round(accuracy*100, 2)
rounded_loss = round(loss, 5)
print("Your model attained an accuracy of {}% and a loss of {} on the test dataset".format(accuracy_pct, rounded_loss))

In [None]:
# Score the test set with the model's predictions and compute the ROC AUC
# against the true labels.
pred = model.predict(test_data)
roc_auc_score(y_true=y_test, y_score=pred)

In [None]:
# Plot training & validation curves, one figure per metric.
# The second curve in each figure comes from the validation split used during
# fit (the `val_*` history keys), so it is labelled "Validation" -- it is not
# the held-out test set.
for metric, axis_label in (("accuracy", "Accuracy"), ("loss", "Loss")):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label="Train")
    ax.plot(history.history["val_" + metric], label="Validation")
    ax.set(title="Model " + metric, ylabel=axis_label, xlabel="Epoch")
    ax.legend(loc="upper left")
    plt.show()