# Movie Review Sentiment Analysis

### Loading Libraries

In [None]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer, LancasterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy, re, string, unicodedata
from nltk.tokenize import ToktokTokenizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob, Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
warnings.filterwarnings('ignore')

### Importing Dataset

In [None]:
# Load the review dataset from a CSV in the working directory,
# report its (rows, columns) shape and preview the first rows.
imdb_data = pd.read_csv("IMDB Dataset.csv")
print(imdb_data.shape)
imdb_data.head()

FileNotFoundError: ignored

### EDA

In [None]:
# Summary statistics of the loaded DataFrame.
imdb_data.describe()

In [None]:
# Number of reviews per sentiment label (class balance check).
imdb_data['sentiment'].value_counts()

### Split Training and Testing Data

In [None]:
# Sequential hold-out split on the raw frame: the first 45000 rows are used
# for training and the remaining rows for testing.
train_review, test_review = imdb_data['review'].iloc[:45000], imdb_data['review'].iloc[45000:]
train_sentiment, test_sentiment = imdb_data['sentiment'].iloc[:45000], imdb_data['sentiment'].iloc[45000:]

print(train_review.shape, train_sentiment.shape, test_review.shape, test_sentiment.shape)

### Text Normalization

In [None]:
# English stop-word list from the NLTK corpus, plus the tokenizer that the
# stop-word removal step uses later in the notebook.
stopwords_list = stopwords.words('english')
tokenizer = ToktokTokenizer()
print(stopwords_list)

### Remove HTML and Noise

>> #### Looking from scratch

In [None]:
# Walk through the cleaning on a single review (index 14):
# parse it as HTML and keep only the visible text.
doc = BeautifulSoup(imdb_data['review'][14], 'html.parser').get_text()
print(doc)

In [None]:
# Demonstrate square-bracket removal: "[a]" is appended to the sample text so
# the effect of the pattern is visible. The pattern is a raw string so that
# `\[` and `\]` reach the regex engine instead of being treated as (invalid)
# Python string escapes.
re.sub(r'\[[^]]*\]', '', doc+"[a]") #removing the square

In [None]:
def remove_html_and_noise(doc):
    """Strip HTML markup and square-bracketed fragments from a review.

    Parameters
    ----------
    doc : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The visible text of ``doc`` with every ``[...]`` span removed.
    """
    text = BeautifulSoup(doc, 'html.parser').get_text()
    # Raw-string pattern: `\[` / `\]` are regex escapes, not Python escapes.
    # Nothing is appended to the text before matching: a trailing "[a]" would
    # pair up with any unclosed "[" in the review and delete everything after it.
    return re.sub(r'\[[^]]*\]', '', text)

In [None]:
# Overwrites the 'review' column in place with the HTML/bracket-stripped text.
imdb_data['review'] = imdb_data['review'].apply(remove_html_and_noise)

### Removing Special Character

In [None]:
def removing_special_character(doc):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Whitespace is deliberately kept so that word boundaries survive.
    """
    non_alphanumeric = re.compile(r"[^A-Za-z0-9\s]")
    return non_alphanumeric.sub('', doc)

In [None]:
# Overwrites the 'review' column in place with the punctuation-free text.
imdb_data['review'] = imdb_data['review'].apply(removing_special_character)

### Text Stemming

In [None]:
def simple_potter_stemmer(doc):
    """Return ``doc`` with each whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    stemmer = PorterStemmer()
    stemmed_words = map(stemmer.stem, doc.split())
    return ' '.join(stemmed_words)

In [None]:
# Overwrites the 'review' column in place with the stemmed text.
imdb_data['review'] = imdb_data['review'].apply(simple_potter_stemmer)

### Removing Stop Words

In [None]:
def removing_stop_words(doc):
    """Remove English stop words from ``doc``.

    The text is tokenized with the notebook-level ``tokenizer``; tokens are
    stripped of surrounding whitespace and dropped when they appear in the
    NLTK English stop-word list. The comparison is case-sensitive.

    Parameters
    ----------
    doc : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Keep the stop words in a set: membership tests are O(1), whereas the
    # previous list conversion made every token lookup a linear scan.
    stop_words = set(stopwords.words('english'))

    # strip() removes any extra whitespace around a token
    tokens = (token.strip() for token in tokenizer.tokenize(doc))

    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# Overwrites the 'review' column in place with the stop-word-free text.
imdb_data['review'] = imdb_data['review'].apply(removing_stop_words)

### Normalized train and test dataset

In [None]:
# Same positional split as before (first 45000 rows / the rest),
# taken from the normalized 'review' column.
normalized_train_review = imdb_data['review'].iloc[:45000]
normalized_test_review = imdb_data['review'].iloc[45000:]

In [None]:
# Sentiment column split at the same row boundary as the reviews.
train_sentiment_labels = imdb_data['sentiment'].iloc[:45000]
test_sentiment_labels = imdb_data['sentiment'].iloc[45000:]

<h1 style="color:green">Text to Vector Transformation</h1>

### `Bag of Words Model`

In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams.
# min_df=1 / max_df=1.0 keep every term: an integer max_df is an absolute
# document count, so the previous `max_df=1` discarded every term that occurs
# in more than one review, and an integer `min_df=0` is rejected by recent
# scikit-learn parameter validation.
cv = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1,3))
# Learn the vocabulary on the training reviews only, then reuse it for the test set.
cv_train_review = cv.fit_transform(normalized_train_review)
cv_test_review = cv.transform(normalized_test_review)

print("Train Shape : ", cv_train_review.shape,"\nTest Shape : ",cv_test_review.shape)

In [None]:
# Number of distinct n-grams in the fitted vocabulary.
len(cv.vocabulary_)

In [None]:
# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [None]:
# Vectorize a single review with the already-fitted vocabulary.
# `transform` (not `fit_transform`) is used so the vectorizer fitted on the
# training set is not silently re-fitted on this one document.
temp = cv.transform([normalized_train_review[0]])

In [None]:
# Shape of the single-review matrix.
temp.shape

In [None]:
# Dense view of the single-review matrix.
temp.toarray()

### `Term Frequency and Inverse Document Frequency Model`

In [None]:
# TF-IDF weights over unigrams, bigrams and trigrams.
# min_df=1 / max_df=1.0 keep every term: an integer max_df is an absolute
# document count, so the previous `max_df=1` discarded every term that occurs
# in more than one review, and an integer `min_df=0` is rejected by recent
# scikit-learn parameter validation.
tv = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1,3))

# Learn vocabulary and IDF on the training reviews only, then reuse them for the test set.
tv_train_review = tv.fit_transform(normalized_train_review)
tv_test_review = tv.transform(normalized_test_review)

print("Train Shape : ", tv_train_review.shape,"\nTest Shape : ",tv_test_review.shape)

### `Labelling the sentiment texts`

In [None]:
# Binarize the sentiment column into a 0/1 indicator array.
lb = LabelBinarizer()
sentiment_label = lb.fit(imdb_data['sentiment']).transform(imdb_data['sentiment'])

print("Sentiment Label Shape :", sentiment_label.shape)

### `Split the sentiment labels`

In [None]:
# Split the binarized labels at the same row boundary as the reviews.
train_sentiment_labels, test_sentiment_labels = sentiment_label[:45000], sentiment_label[45000:]

<h1 style="color:green">Logistic Regression</h1>

### `Logistic Regression` in `BOW` and `Tfidf`

In [None]:
# Train one independent classifier per feature space. `fit` returns the
# estimator itself, so fitting a single shared instance twice would leave both
# names bound to the same model, trained on whichever features came last.

#Fitting for Bag of Words
lr_bow = LogisticRegression(max_iter=500, random_state=42).fit(cv_train_review, train_sentiment)
#Fitting for Tfidf (`lr` stays bound to the TF-IDF model, as it was before)
lr = LogisticRegression(max_iter=500, random_state=42)
lr_tfidf = lr.fit(tv_train_review, train_sentiment)

print(lr_bow)
print(lr_tfidf)

### `Logistic Regression` in `BOW` and `Tfidf` - `Prediction`

In [None]:
# Accuracy of `lr_bow` on the bag-of-words test matrix.
accuracy_score(test_sentiment,lr_bow.predict(cv_test_review))

In [None]:
# Accuracy of `lr_tfidf` on the TF-IDF test matrix. The model must be scored
# on features from the same vectorizer it was trained with, not the
# bag-of-words counts.
accuracy_score(test_sentiment,lr_tfidf.predict(tv_test_review))

In [None]:
# (n_test_reviews, n_features) of the bag-of-words test matrix.
cv_test_review.shape

In [None]:
import pickle
# Persist the bag-of-words model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("lr_bow.pkl", "wb") as model_file:
    pickle.dump(lr_bow, model_file)

In [None]:
# Predict with the model that belongs to each feature space: the BOW model on
# BOW features and the TF-IDF model on TF-IDF features.
lr_bow_predict = lr_bow.predict(cv_test_review)
lr_tfidf_predict = lr_tfidf.predict(tv_test_review)

### `Logistic Regression` in `BOW` and `Tfidf` - `Accuracy Score`

In [None]:
# Accuracy of each set of predictions against the held-out sentiments.
lr_bow_score, lr_tfidf_score = [
    accuracy_score(test_sentiment, predictions)
    for predictions in (lr_bow_predict, lr_tfidf_predict)
]

print("BOW Accuracy : ", lr_bow_score, "\nTfidf Accuracy : ", lr_tfidf_score)

### `Classification Report`

In [None]:
# Per-class precision/recall/F1. `labels` pins the row order explicitly so each
# display name in `target_names` is attached to the matching class; without it
# the classes are reported in sorted order ('negative' first) and the names
# 'Positive'/'Negative' end up swapped.
lr_bow_report = classification_report(test_sentiment, lr_bow_predict,
                                      labels=['positive', 'negative'],
                                      target_names=['Positive', 'Negative'])
lr_tfidf_report = classification_report(test_sentiment, lr_tfidf_predict,
                                        labels=['positive', 'negative'],
                                        target_names=['Positive', 'Negative'])

print("BOW Report : \n", lr_bow_report,"\nTfidf Report : \n", lr_tfidf_report)

### `Confusion Matrix`

In [None]:
# Confusion matrices with rows/columns ordered as ('positive', 'negative').
lr_bow_counfusion_matrix, lr_tfidf_counfusion_matrix = [
    confusion_matrix(test_sentiment, predictions, labels=['positive','negative'])
    for predictions in (lr_bow_predict, lr_tfidf_predict)
]

print("BOW :\n", lr_bow_counfusion_matrix,"\nTfidf : \n", lr_tfidf_counfusion_matrix)

<h1 style="color:green">Multinomial Naive Bayes</h1>

In [None]:
# Train one independent Multinomial Naive Bayes model per feature space.
# `fit` returns the estimator itself, so fitting a single shared instance twice
# would leave both names bound to the same model and the bag-of-words
# predictions would come from the TF-IDF-trained model.

#fitting naive bayes for bag of words
mnb_bow = MultinomialNB().fit(cv_train_review, train_sentiment)
print(mnb_bow)
#fitting naive bayes for tfidf features (`mnb` stays bound to the TF-IDF model, as before)
mnb = MultinomialNB()
mnb_tfidf = mnb.fit(tv_train_review, train_sentiment)
print(mnb_tfidf)

#Predicting with the bag of words model on bag of words features
mnb_bow_predict = mnb_bow.predict(cv_test_review)
#Predicting with the tfidf model on tfidf features
mnb_tfidf_predict = mnb_tfidf.predict(tv_test_review)

#Accuracy score for bag of words
mnb_bow_score = accuracy_score(test_sentiment, mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score = accuracy_score(test_sentiment, mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

<div>
    <h1 style="color:green">BERT : <span style="color:blue">Bidirectional Encoder Representations from Transformers</span></h1>
</div>

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [None]:
# Read the CSV again into `df`, separate from `imdb_data`, whose 'review'
# column was overwritten by the normalization cells above.
df = pd.read_csv("IMDB Dataset.csv")

In [None]:
# Preview the freshly loaded frame.
df.head()

In [None]:
# Binary target column: 1 where the sentiment is 'positive', 0 otherwise.
df['positive'] = (df['sentiment'] == 'positive').astype('int64')

In [None]:
# Preview again to check the new 'positive' column.
df.head()

In [None]:
# Class balance of the binary target.
df.positive.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# 90/10 hold-out split. `random_state` makes the shuffle reproducible across
# kernel restarts, and `stratify` keeps the positive/negative ratio the same
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['positive'],
    test_size=0.1,
    random_state=42,
    stratify=df['positive'],
)

In [None]:
# Sizes of the shuffled train/test partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Sizes of the earlier positional split, for comparison.
normalized_test_review.shape,normalized_train_review.shape,test_sentiment.shape,train_sentiment.shape

In [None]:
# Inspect the training targets.
y_train

In [None]:
# Encode the sentiment column as 1 (positive) / 0 (anything else).
# Guarded so the cell is idempotent: once the column is numeric, comparing it
# with 'positive' again would turn every label into 0 on a re-run.
if not pd.api.types.is_numeric_dtype(imdb_data['sentiment']):
    imdb_data['sentiment'] = (imdb_data['sentiment'] == 'positive').astype('int64')

In [None]:
# Split the encoded labels at row 45000, the same boundary used for
# normalized_train_review / normalized_test_review, so that features and
# labels passed to `model.fit` have the same number of rows.
sentiment_train = imdb_data['sentiment'][:45000]
sentiment_test = imdb_data['sentiment'][45000:]

In [None]:
# Inspect the frame after the in-place normalization and label encoding.
imdb_data

In [None]:
# Wrap the TF Hub BERT text-preprocessing model and the uncased
# L-12 / H-768 / A-12 encoder as Keras layers.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
def get_sentence_embeding(sentences):
    """Return the encoder's ``pooled_output`` for ``sentences``.

    The input is first passed through ``bert_preprocess`` and the result is
    fed to ``bert_encoder``.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

In [None]:
# BERT part of the graph: raw string input -> preprocessing -> encoder
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout on the pooled output, then one sigmoid unit
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Tie the input and the head output together into the final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [None]:
# Metrics tracked during training, each under a short display name.
METRICS = [
    metric_class(name=metric_name)
    for metric_class, metric_name in (
        (tf.keras.metrics.BinaryAccuracy, 'accuracy'),
        (tf.keras.metrics.Precision, 'precision'),
        (tf.keras.metrics.Recall, 'recall'),
    )
]

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

In [None]:
# Train for 10 epochs on the normalized training reviews.
# NOTE(review): inputs and labels must have the same number of rows — verify
# that normalized_train_review and sentiment_train come from the same split.
model.fit(normalized_train_review, sentiment_train, epochs=10)

In [None]:
import pickle

In [None]:
# Persist the trained model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): pickling a Keras model that wraps TF Hub layers may not
# round-trip reliably — consider `model.save(...)` if loading this file fails.
with open("model1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)