In [50]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [51]:
data = pd.read_excel("imdb_processeddata.xlsx")
data.head()

Unnamed: 0.1,Unnamed: 0,Movie_Title,Movie_Release_Year,Movie_Genres,Review_User,Review_Date,Review_Rating,Review_Title,Review_Content,Review_Sentiment
0,0,The Shawshank Redemption,1994,['Drama'],hitchcockthelegend,2010-07-24,10,Some birds aren't meant to be caged.,shawshank redempt written direct frank darabon...,Positive
1,1,The Shawshank Redemption,1994,['Drama'],Sleepin_Dragon,2021-02-17,10,An incredible movie. One that lives with you.,wonder high rate quit liter breathtak say hasn...,Positive
2,2,The Shawshank Redemption,1994,['Drama'],EyeDunno,2005-11-21,10,Don't Rent Shawshank.,im tri save money last titl consid borrow rent...,Positive
3,3,The Shawshank Redemption,1994,['Drama'],alexkolokotronis,2008-02-18,10,This is How Movies Should Be Made,ordinari hollywood flick great deep messag fou...,Positive
4,4,The Shawshank Redemption,1994,['Drama'],kaspen12,2006-02-10,10,A classic piece of unforgettable film-making.,oscar year shawshank redempt written direct fr...,Positive


In [52]:
data['Movie_Title'] = data['Movie_Title'].astype('string')
data['Movie_Genres'] = data['Movie_Genres'].astype('string')
data['Review_User'] = data['Review_User'].astype('string')
data['Review_Title'] = data['Review_Title'].astype('string')
data['Review_Content'] = data['Review_Content'].astype('string')
data['Review_Sentiment'] = data['Review_Sentiment'].astype('string')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8970 entries, 0 to 8969
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Unnamed: 0          8970 non-null   int64         
 1   Movie_Title         8970 non-null   string        
 2   Movie_Release_Year  8970 non-null   int64         
 3   Movie_Genres        8970 non-null   string        
 4   Review_User         8970 non-null   string        
 5   Review_Date         8970 non-null   datetime64[ns]
 6   Review_Rating       8970 non-null   int64         
 7   Review_Title        8970 non-null   string        
 8   Review_Content      8970 non-null   string        
 9   Review_Sentiment    8970 non-null   string        
dtypes: datetime64[ns](1), int64(3), string(6)
memory usage: 700.9 KB


In [53]:
data = data[['Review_Content','Review_Sentiment']]
data.head()

Unnamed: 0,Review_Content,Review_Sentiment
0,shawshank redempt written direct frank darabon...,Positive
1,wonder high rate quit liter breathtak say hasn...,Positive
2,im tri save money last titl consid borrow rent...,Positive
3,ordinari hollywood flick great deep messag fou...,Positive
4,oscar year shawshank redempt written direct fr...,Positive


In [54]:
encoder = preprocessing.LabelEncoder()
data['Review_Sentiment'] = encoder.fit_transform(data.Review_Sentiment.values)

print(encoder.transform(["Positive", "Negative", "Neutral"]))
data.head()

[2 0 1]


Unnamed: 0,Review_Content,Review_Sentiment
0,shawshank redempt written direct frank darabon...,2
1,wonder high rate quit liter breathtak say hasn...,2
2,im tri save money last titl consid borrow rent...,2
3,ordinari hollywood flick great deep messag fou...,2
4,oscar year shawshank redempt written direct fr...,2


In [55]:
tf_idf_vectorizer = TfidfVectorizer()
X = tf_idf_vectorizer.fit_transform(data['Review_Content'])
y = data['Review_Sentiment']


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
model = LogisticRegression(random_state=7)
model.fit(X_train,y_train)

# model = MLPClassifier(random_state=7, max_iter=300).fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.82      0.88      0.85       622
           1       0.64      0.14      0.23       182
           2       0.87      0.95      0.90       990

    accuracy                           0.84      1794
   macro avg       0.77      0.66      0.66      1794
weighted avg       0.83      0.84      0.82      1794



In [58]:
def review_preprocessor(review):
    #Delete the "Was this review helpful? Sign in to vote."
    review = review.replace("Was this review helpful? Sign in to vote.","")

    # Remove zero or more occurrences of anything between "<" and ">" and new line "\n"
    html_capture = re.compile(r'<.*?>|\n')
    review = html_capture.sub(' ', review)

    # Remove punctutation and symbols
    # We basically find only English Word, Space and number characters and join them
    # Convert all characters to lowercase
    all_word_space_num = re.findall(r'[\w\s\d]', review, re.MULTILINE)
    all_word_space_num_lowered = [i.lower() for i in all_word_space_num]
    review = "".join(all_word_space_num_lowered)
    review = review.split(" ")

    #Removing stopwords
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.add("film")
    stopwords_set.add("films")
    stopwords_set.add("movie")
    stopwords_set.add("movies")
    stopwords_set.add("one")
    removed_stopwords = []
    for i in review:
        if i  not in stopwords_set:
            removed_stopwords.append(i)

    #Applying Porter stemmer and joining all the words after removing stopwords
    st = PorterStemmer()
    review = ""
    for i in removed_stopwords:
        temp = st.stem(i)
        review += temp + " "

    return review

In [59]:
test_review = "Worst movie I have every watched. I will not reccomend this to anyone"

# Preprocess the query
sample = review_preprocessor(test_review)
print("After preprocessing the Query:", sample)
print("")

# Transform the query into tf-idf vector
sample = tf_idf_vectorizer.transform([sample])

# Precdict the sentiment of the query
prediction = model.predict(sample)
print("Model Prediction -", encoder.inverse_transform(prediction)[0])

After preprocessing the Query: worst everi watch reccomend anyon 

Model Prediction - Negative


Positive example - This was a great movie, I really liked the action scenes.<br>
<br>Neutral example - This was okay, and I  might watch it again<br>
<br>Negative example - Worst movie I have every watched. I will not reccomend this to anyone