# Predicting Movie review sentiments using Natural Language Processing

In [30]:
import pandas as pd
import numpy as np
import re

# step 1:

In [31]:
df=pd.read_csv("IMDB Dataset.csv") # reading csv file
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [32]:
df.review.shape

(50000,)

# step 2:

In [33]:
def reg(text):
    pattern=r"[^\w\s]"
    processed=re.sub(pattern,'',text)  #removing the punctuations using regex
    return processed

reg_review=df.review.apply(reg)

In [34]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [35]:
def preprocessed_text(datf):
    datf=datf.apply(lambda x: x.lower() if isinstance(x, str) else x) # making all the words in lower case
    datf=datf.apply(word_tokenize)        # tokenization using nltk
    stop_word=stopwords.words("english")  # storing all the stopwords
    datf=datf.apply(lambda x:[i for i in x if i not in stop_word])  #removing the stopwords
    stemr=PorterStemmer()
    datf=datf.apply(lambda x: [stemr.stem(i) for i in x])  # stemming using Porterstemmer

    datf=datf.apply(lambda x: " ".join(x))  # after all the preprocessing task putting it as string
    return datf

In [36]:
df["processed_review"]=preprocessed_text(reg_review)

In [38]:
df.head(5)

Unnamed: 0,review,sentiment,processed_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


In [39]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [40]:
df["sentiment_label"]=df["sentiment"].map({
    "positive":1,
    "negative":0
})                                          # creating new column and putting "1" for "positive" and "0" for "negative"

In [41]:
df.head()

Unnamed: 0,review,sentiment,processed_review,sentiment_label
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 oz episod youll hoo...,1
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu unass...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...,1
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...,1


# step 3:

In [42]:
X=df.processed_review
y=df.sentiment_label
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test=train_test_split(X, y, test_size=0.5) # splitting into train and test datasets

# step 4:

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [50]:
# Vectorization using tfidf

X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)

# step 5:

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report

In [52]:
dt=DecisionTreeClassifier()
dt.fit(X_train_tfidf ,y_train)   # training the datasets 
y_pred_dt=dt.predict(X_test_tfidf)   # prediction of the test data set
print("Classification report:\n",classification_report(y_test,y_pred_dt))  # showing the classification report


Classification report:
               precision    recall  f1-score   support

           0       0.72      0.72      0.72     12509
           1       0.72      0.72      0.72     12491

    accuracy                           0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000



In [53]:
gb=GradientBoostingClassifier()
gb.fit(X_train_tfidf ,y_train)    # training the datasets
y_pred_gb=gb.predict(X_test_tfidf)   # prediction of the test data set
print("Classification report:\n",classification_report(y_test,y_pred_gb))  # showing the classification report



Classification report:
               precision    recall  f1-score   support

           0       0.85      0.76      0.80     12509
           1       0.78      0.86      0.82     12491

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000



In [54]:
rf=RandomForestClassifier()
rf.fit(X_train_tfidf ,y_train)   # training the datasets
y_pred_rf=rf.predict(X_test_tfidf) # prediction of the test data set
print("Classification report:\n",classification_report(y_test,y_pred_rf)) # showing the classification report



Classification report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.85     12509
           1       0.85      0.84      0.84     12491

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

