# Data Preparation

In [1]:
%load_ext autoreload

In [29]:
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import BernoulliNB

import os
import re

from data_preprocessing import PreprocessingSteps,CustomTextPreprocessor

In [3]:
data_dir='data/aclImdb'

In [4]:
def load_sentiment_data(data_type,data_dir):
    data = []
    for category in ["pos", "neg"]:
        category_dir = os.path.join(data_dir, data_type, category)
        for filename in os.listdir(category_dir):
            with open(os.path.join(category_dir, filename), 'r', encoding="utf-8") as file:
                numbers = re.findall(r'\d+', filename)
                rec_id = int(numbers[0])
                rating = int(numbers[1])
                text = file.read()
                data.append([rec_id,text, category,rating])
    return pd.DataFrame(data, columns=["rec_id","text", "sentiment","rating"])

In [5]:
train_df=load_sentiment_data("train",data_dir)

In [6]:
len(train_df)

25000

In [7]:
train_df.head()

Unnamed: 0,rec_id,text,sentiment,rating
0,4715,For a movie that gets no respect there sure ar...,pos,9
1,12390,Bizarre horror movie filled with famous faces ...,pos,8
2,8329,"A solid, if unremarkable film. Matthau, as Ein...",pos,7
3,9063,It's a strange feeling to sit alone in a theat...,pos,8
4,3092,"You probably all already know this by now, but...",pos,10


In [9]:
obj=PreprocessingSteps(train_df['text'])

In [None]:
train_df['clean_text']=obj.pre_process_all_steps() # Run time around 7hrs due to lemmatise and POS tagging

In [11]:
train_df.to_csv("data/clean_data/train_clean.csv")

In [12]:
train_df.head()

Unnamed: 0,rec_id,text,sentiment,rating,clean_text
0,4715,For a movie that gets no respect there sure ar...,pos,9,movie get respect sure lot memorable quote lis...
1,12390,Bizarre horror movie filled with famous faces ...,pos,8,bizarre horror movie fill famous face steal cr...
2,8329,"A solid, if unremarkable film. Matthau, as Ein...",pos,7,solid remarkable film matthau einstein wonderf...
3,9063,It's a strange feeling to sit alone in a theat...,pos,8,strange feeling sit alone theater occupy paren...
4,3092,"You probably all already know this by now, but...",pos,10,probably already know additional episode never...


#  Naive Bayes - BOW

In [15]:
# prediction_pipeline = Pipeline(steps=[
#            ('text_preproc', CustomTextPreprocessor()),
#            ('bow', CountVectorizer()),
#            ('bernoulli', BernoulliNB())])

In [16]:
prediction_pipeline = Pipeline(steps=[
           ('bow', CountVectorizer()),
           ('bernoulli', BernoulliNB())])

In [17]:
prediction_pipeline.fit(train_df['clean_text'],train_df['sentiment'])

# Loading Test Data

In [19]:
test_df=pd.read_csv("data/clean_data/test_clean.csv")

In [20]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,rec_id,text,sentiment,rating,clean_text
0,0,4715,"Based on an actual story, John Boorman shows t...",pos,9,base actual story john footman show struggle a...
1,1,1930,This is a gem. As a Film Four production - the...,pos,9,gem film production anticipated quality indeed...
2,2,3205,"I really like this show. It has drama, romance...",pos,9,really like show drama romance comedy roll one...
3,3,10186,This is the best 3-D experience Disney has at ...,pos,10,best experience kidney themeparks certainly go...
4,4,147,"Of the Korean movies I've seen, only three had...",pos,10,organ move see three really stick first excell...


In [21]:
y_pred = prediction_pipeline.predict(test_df['clean_text'])

In [23]:
accuracy_bow = accuracy_score(test_df['sentiment'], y_pred)

In [26]:
print("Accuracy (BoW):", accuracy_bow)

Accuracy (BoW): 0.82096


In [31]:
confusion_matrix_bow = confusion_matrix(test_df['sentiment'], y_pred,labels=["neg", "pos"])
print("Confusion Matrix (BoW):\n", confusion_matrix_bow)

Confusion Matrix (BoW):
 [[11024  1476]
 [ 3000  9500]]


In [32]:
classification_report_bow = classification_report(test_df['sentiment'], y_pred, labels=["neg", "pos"],target_names=['Negative', 'Positive'])
print("Classification Report (BoW):\n", classification_report_bow)

Classification Report (BoW):
               precision    recall  f1-score   support

    Negative       0.79      0.88      0.83     12500
    Positive       0.87      0.76      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000

