# Sentiment Analysis using Bag of Words vectorization

In [59]:
#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [60]:
# 1. Load the IMDB reviews CSV that sits next to this notebook into `df`.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

# 2. Report the (rows, columns) shape of the loaded frame.
print(df.shape)

# 3. Show the first five rows (head() defaults to 5).
print(df.head())

(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [61]:
# Encode the label as a numeric "Category" column:
# 1 when the sentiment is "positive", 0 for anything else.
is_positive = df["sentiment"] == "positive"
df["Category"] = is_positive.astype("int64")
print(df.head())

                                              review sentiment  Category
0  One of the other reviewers has mentioned that ...  positive         1
1  A wonderful little production. <br /><br />The...  positive         1
2  I thought this was a wonderful way to spend ti...  positive         1
3  Basically there's a family where a little boy ...  negative         0
4  Petter Mattei's "Love in the Time of Money" is...  positive         1


In [62]:
# Check whether the target labels are balanced by counting the reviews per
# sentiment label ("Category" is derived directly from this column).
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [63]:
# Hold out 20% of the reviews for testing.
# A fixed random_state makes the shuffle deterministic, so every re-run
# (e.g. Restart Kernel -> Run All) yields the same split and the same metrics.
X_train, X_test, y_train, y_test = train_test_split(
    df.review, df.Category, test_size=0.2, random_state=42
)

## 1. Create a classification pipeline using Random Forest to classify movie reviews as positive or negative

In [None]:
# 1. Turn the raw review text into bag-of-words count vectors.
#    The vocabulary is learned from the training reviews only; the test
#    reviews are transformed with that same vocabulary.
vectorizer = CountVectorizer()

X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

# 2. Fit a random forest on the vectorized training reviews.
#    random_state pins the forest's internal randomness so the fitted model
#    and its predictions are identical on every run.
model = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=42)
model.fit(X_train_cv, y_train)

# 3. Predict the sentiment of the vectorized test reviews and keep the
#    result in `predictions`.
predictions = model.predict(X_test_cv)

In [None]:
# Test 1: spot-check the first held-out review.
# Print the review text followed by its true label ...
print(X_test.iloc[0], y_test.iloc[0], sep="\n")
# ... then display the model's prediction for the same review so the two
# can be compared (the verdict is not hard-coded in a comment).
model.predict(X_test_cv[0])

In [None]:
# Test 2: spot-check the second held-out review.
# Print the review text followed by its true label ...
print(X_test.iloc[1], y_test.iloc[1], sep="\n")
# ... then display the model's prediction for the same review so the two
# can be compared (the verdict is not hard-coded in a comment).
model.predict(X_test_cv[1])

In [None]:
# 4. Print the classification report (precision / recall / F1 per class)
#    comparing the true test labels with the current `predictions`.
print(classification_report(y_true=y_test, y_pred=predictions))

## 2. Create a classification pipeline using KNN to classify movie reviews as positive or negative

In [54]:
# 1. Reuse the bag-of-words features (X_train_cv / X_test_cv) built in the
#    random forest section. No new CountVectorizer is created here: a fresh,
#    unfitted one would go unused and would overwrite the fitted `vectorizer`.

# 2. Fit a 10-nearest-neighbours classifier (Euclidean distance) on the
#    vectorized training reviews.
knn_model = KNeighborsClassifier(n_neighbors=10, metric="euclidean")
knn_model.fit(X_train_cv, y_train)

# 3. Predict the sentiment of the vectorized test reviews and keep the
#    result in `predictions`.
predictions = knn_model.predict(X_test_cv)

'              precision    recall  f1-score   support\n\n           0       0.64      0.67      0.65      4877\n           1       0.67      0.64      0.65      5123\n\n    accuracy                           0.65     10000\n   macro avg       0.65      0.65      0.65     10000\nweighted avg       0.65      0.65      0.65     10000\n'

In [55]:
# 4. Print the classification report (precision / recall / F1 per class)
#    comparing the true test labels with the current `predictions`.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.64      0.67      0.65      4877
           1       0.67      0.64      0.65      5123

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



## 3. Create a classification pipeline using the Naive Bayes algorithm to classify movie reviews as positive or negative

In [57]:
# 1. Reuse the bag-of-words features (X_train_cv / X_test_cv) built in the
#    random forest section. No new CountVectorizer is created here: a fresh,
#    unfitted one would go unused and would overwrite the fitted `vectorizer`.

# 2. Fit a multinomial Naive Bayes classifier on the vectorized training
#    reviews. It gets its own name (`nb_model`) so it no longer overwrites
#    the KNN classifier stored in `knn_model`.
nb_model = MultinomialNB()
nb_model.fit(X_train_cv, y_train)

# 3. Predict the sentiment of the vectorized test reviews and keep the
#    result in `predictions`.
predictions = nb_model.predict(X_test_cv)


In [58]:
# 4. Print the classification report (precision / recall / F1 per class)
#    comparing the true test labels with the current `predictions`.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4877
           1       0.88      0.81      0.84      5123

    accuracy                           0.84     10000
   macro avg       0.85      0.85      0.84     10000
weighted avg       0.85      0.84      0.84     10000



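# Naive Bayes and Random Forest work better on this data than KNN because bag-of-words vectors are very high-dimensional and sparse: Euclidean distances between such vectors are dominated by review length and common words, so the nearest neighbours carry little sentiment signal, whereas Naive Bayes and Random Forest weigh the evidence of individual words directly.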