<a href="https://colab.research.google.com/github/sanislearning/ml_playground/blob/main/IMDBRatings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report,accuracy_score,f1_score

In [2]:
# Colab-only step: ask the user to upload the dataset archive into the
# runtime's working directory (the next cell expects 'aclImdb.zip' there).
from google.colab import files

uploaded = files.upload()

Saving aclImdb.zip to aclImdb.zip


In [3]:
import zipfile

# Unpack the uploaded archive into the current working directory.
with zipfile.ZipFile('aclImdb.zip', mode='r') as archive:
    archive.extractall()

In [4]:
import os

# Sanity check: show the top-level entries of the extracted dataset folder.
dataset_entries = os.listdir('aclImdb')
print(dataset_entries)

['README', 'test', 'imdbEr.txt', 'train', 'imdb.vocab']


In [5]:
def load_reviews(directory, label):
    """Read every review file in `directory` and pair its text with `label`.

    Args:
        directory: Path of a folder that holds one review per file.
        label: Value attached to every review read from `directory`.

    Returns:
        A list of `(text, label)` tuples ordered by file name. The list is
        empty when the folder contains no regular files.
    """
    reviews = []
    # os.listdir() yields entries in arbitrary order; sorting gives the same
    # ordering on every run and on every machine.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories and other non-files, which open() cannot read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding='utf-8') as file:
            reviews.append((file.read(), label))
    return reviews

# Load the training reviews: positive ones get label 1, negative ones label 0.
pos_reviews, neg_reviews = (
    load_reviews(folder, sentiment)
    for folder, sentiment in (('aclImdb/train/pos', 1), ('aclImdb/train/neg', 0))
)

In [6]:
import random

# Combine both classes into one list of (text, label) pairs.
all_reviews = pos_reviews + neg_reviews

# Interleave positive and negative reviews. A dedicated, seeded generator
# produces the same permutation on every run for a given input order and
# leaves the global `random` state untouched.
random.Random(42).shuffle(all_reviews)

In [7]:
from sklearn.model_selection import train_test_split

# One row per review: the raw text plus its sentiment label.
df = pd.DataFrame(all_reviews, columns=['review', 'label'])
print(df.head())

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable for a given row order.
x_train, x_val, y_train, y_val = train_test_split(
    df['review'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

                                              review  label
0  I am normally a Spike Lee fan. It takes some t...      0
1  I had to walk out on this film fifteen minutes...      0
2  Saw it as critic at the 49. Internationales Fi...      1
3  If this is based on the true-life relationship...      0
4  An older man touches a flower in his wife's gr...      0


# Training the Naive Bayes Classifier

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Models need numbers rather than raw text, so every review is encoded as a
# TF-IDF (term frequency - inverse document frequency) vector:
#   TF     = how often a word occurs in the review
#   IDF    = log(total reviews / reviews containing the word)
#   TF-IDF = TF * IDF
# This is the textbook form; scikit-learn's defaults add smoothing and
# normalise each vector, so the exact values differ slightly.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary and IDF weights from the training reviews only, then
# encode the validation reviews with that same fitted vocabulary.
x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec = vectorizer.transform(x_val)


In [9]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(x_train_vec, y_train)

# Score the held-out validation reviews.
y_pred = model.predict(x_val_vec)
print("Accuracy: ", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy:  0.8474
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2505
           1       0.85      0.85      0.85      2495

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words alternative to TF-IDF: each feature is the number of times a
# vocabulary word occurs in a review.
# NOTE: this rebinds vectorizer, x_train_vec and x_val_vec, so any code that
# reads these names afterwards works on count features.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=10000,
)
x_train_vec = vectorizer.fit_transform(x_train)  # vocabulary comes from the training reviews
x_val_vec = vectorizer.transform(x_val)  # reuse that same vocabulary

In [11]:
# Multinomial Naive Bayes trained on whatever features x_train_vec currently
# holds; fit() returns the estimator, so the calls are chained.
nbmodel = MultinomialNB().fit(x_train_vec, y_train)

# Score the held-out validation reviews.
y_pred = nbmodel.predict(x_val_vec)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.843
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      2505
           1       0.85      0.83      0.84      2495

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000



# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression on the current x_train_vec features (the count vectors
# when the cells are run top to bottom). max_iter is raised to 1000 to give
# the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000).fit(x_train_vec, y_train)

# Score the held-out validation reviews.
y_pred = lr_model.predict(x_val_vec)
print("Logistic Regression Accuracy: ", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Logistic Regression Accuracy:  0.8574
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      2505
           1       0.86      0.85      0.86      2495

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

