In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing rapids libraries
import cudf
import cupy

#importing important libraries
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
import random
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy

from keras.models import Sequential


In [None]:
# Load the tab-separated train and test splits of the movie-review sentiment data.
train = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
    sep='\t',
)
test = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    sep='\t',
)

**Understanding the given dataset**

The next few steps explore the data (its shape, a preview of its rows, the label distribution, and missing values) so that the preprocessing that follows is easier to design.

In [None]:
# Report the shape of each split, then preview the first ten training rows.
print(*(frame.shape for frame in (train, test)))
train.head(n=10)

The sentiment labels are:

0 - negative

1 - somewhat negative

2 - neutral

3 - somewhat positive

4 - positive

In [None]:
# Bar chart of the number of phrases carrying each sentiment label.
Sentiment_count = train.groupby('Sentiment').count()
plt.bar(
    x=Sentiment_count.index.values,
    height=Sentiment_count['Phrase'],
)
plt.xlabel('Review Sentiments')
plt.ylabel('Number of Review')
plt.show()

In [None]:
# Number of training rows per sentiment label.
train['Sentiment'].value_counts()

In [None]:
# True if any cell anywhere in the training frame is missing.
train.isna().any().any()


In [None]:
# True if any cell anywhere in the test frame is missing.
test.isna().any().any()

**Data Preprocessing**

This stage cleans the data by removing HTML tags, non-alphabetic characters, etc., in order to make the data:

a. consistent and efficient

b. easier for the learning algorithm to parse

In [None]:
#Function for cleaning the reviews, tokenizing and lemmatizing them.

def clean_sentences(df, column='Phrase'):
    """Clean, tokenize and lemmatize every text entry in one column of ``df``.

    Each entry goes through four steps:

    1. HTML markup is stripped with BeautifulSoup.
    2. Every character outside ``a-z`` / ``A-Z`` is replaced by a space.
    3. The lower-cased text is split into tokens with NLTK's ``word_tokenize``.
    4. Each token is passed through the module-level ``lemmatizer``.

    Args:
        df: Frame (or any mapping of column name to iterable) holding the raw text.
        column: Name of the column to clean. Defaults to ``'Phrase'``, the
            text column used throughout this notebook.

    Returns:
        A list with one element per entry of ``df[column]``, in the same
        order; each element is the list of lemmatized tokens for that entry
        (possibly empty when the entry contains no alphabetic characters).
    """
    # Compile once instead of re-parsing the pattern for every row.
    non_alphabetic = re.compile("[^a-zA-Z]")
    reviews = []

    for sent in tqdm(df[column]):
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns about it), so results could
        # differ between environments. "html.parser" ships with Python.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace anything that is not a letter with a space.
        review_text = non_alphabetic.sub(" ", review_text)

        # Tokenize the lower-cased text, then reduce each token to its lemma.
        words = word_tokenize(review_text.lower())
        reviews.append([lemmatizer.lemmatize(word) for word in words])

    return reviews

In [None]:
# Run the same cleaning pipeline over both splits, then report how many
# cleaned reviews each split produced (train first, test second).
train_sentences, test_sentences = (
    clean_sentences(frame) for frame in (train, test)
)

print(len(train_sentences), len(test_sentences), sep='\n')

In [None]:
# One-hot encode the sentiment labels with to_categorical; the width of the
# encoded matrix is the number of classes.

target = train['Sentiment'].values
y_target = to_categorical(target)
num_classes = y_target.shape[-1]

In [None]:
# Hold out 20% of the cleaned training reviews for validation, stratified on
# the labels so both parts keep the same class proportions. random_state is
# fixed so that the split -- and everything derived from it (vocabulary,
# maximum length, reported accuracies) -- is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
    random_state=42,
)

In [None]:
# Get the number of unique words and the length of the longest review among
# the cleaned training reviews.

unique_words = set()
len_max = 0

for tokens in tqdm(X_train):
    unique_words.update(tokens)
    len_max = max(len_max, len(tokens))

print(len(unique_words))
print(len_max)

In [None]:
# Fit the word -> integer index mapping on the training reviews only.
# NOTE(review): per the Keras Tokenizer docs only the num_words-1 most frequent
# words are kept, so the rarest training word appears to be dropped here --
# confirm. Raising num_words would also require a larger Embedding input_dim
# in the model cells.
vocab_size = len(unique_words)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(X_train))

# Convert every split to integer sequences and pad them to the longest
# training review (len_max) so all inputs share one width.
X_train, X_val, X_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(split), maxlen=len_max)
    for split in (X_train, X_val, test_sentences)
)

print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
# Stop training once validation accuracy has failed to improve by at least
# 0.001 for two consecutive epochs.
# The models are compiled with metrics=['accuracy'], which TF2 Keras logs for
# the validation set as 'val_accuracy'. The previous 'val_acc' key is not
# produced, so the callback could never trigger.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=2,
)
callback = [early_stopping]

In [None]:
# LSTM classifier: 300-d embeddings -> two stacked LSTM layers -> dense head
# with a softmax over the sentiment classes.

model = Sequential([
    Embedding(len(unique_words), 300, input_length=len_max),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True),
    LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy', 'mean_squared_error'],
)

model.summary()

In [None]:
# Train the LSTM for up to six epochs, validating on the held-out split after
# each epoch; the callbacks in `callback` are invoked during training.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=6,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=callback,
)

In [None]:
#Model using a 1D CNN
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# CNN classifier: 300-d embeddings -> two width-2 convolutions -> max pooling
# -> dense head with a softmax over the sentiment classes.
# The vocabulary size and sequence length match the LSTM model so both models
# consume the same padded inputs.
model_CNN = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(unique_words), 300, input_length=len_max),
    tf.keras.layers.Conv1D(128, 2, padding='same', activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 2, padding='same', activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # Output width follows the one-hot label matrix instead of a hard-coded 5.
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Summarise the CNN itself (previously this printed the LSTM's summary).
model_CNN.summary()

In [None]:
# y_train / y_val are one-hot encoded (built with to_categorical), so the
# matching loss is categorical_crossentropy; the sparse variant expects
# integer class ids instead.
model_CNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the CNN. Previously this cell called model.fit and therefore kept
# training the LSTM, leaving the CNN untrained.
# NOTE: `history` is reused here and now holds the CNN's training history.
history = model_CNN.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=256,
    verbose=1,
    callbacks=callback,
)

Therefore, we can clearly see that the accuracy of the CNN model (76.15%) is higher than that of the LSTM model (72.45%).