In [1]:
%load_ext autoreload

In [41]:
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import BernoulliNB

import os
import re

from data_preprocessing import PreprocessingSteps,CustomTextPreprocessor

In [3]:
data_dir='data/aclImdb'

In [4]:
def load_sentiment_data(data_type,data_dir):
    data = []
    for category in ["pos", "neg"]:
        category_dir = os.path.join(data_dir, data_type, category)
        for filename in os.listdir(category_dir):
            with open(os.path.join(category_dir, filename), 'r', encoding="utf-8") as file:
                numbers = re.findall(r'\d+', filename)
                rec_id = int(numbers[0])
                rating = int(numbers[1])
                text = file.read()
                data.append([rec_id,text, category,rating])
    return pd.DataFrame(data, columns=["rec_id","text", "sentiment","rating"])

In [5]:
train_df=load_sentiment_data("train",data_dir)

In [6]:
train_df.head()

Unnamed: 0,rec_id,text,sentiment,rating
0,4715,For a movie that gets no respect there sure ar...,pos,9
1,12390,Bizarre horror movie filled with famous faces ...,pos,8
2,8329,"A solid, if unremarkable film. Matthau, as Ein...",pos,7
3,9063,It's a strange feeling to sit alone in a theat...,pos,8
4,3092,"You probably all already know this by now, but...",pos,10


In [34]:
train_df['label']=train_df['sentiment'].apply(lambda x: 1 if x=='pos' else -1)

In [46]:
sample_data = pd.concat([train_df['text'].head(5), train_df['text'].tail(5)])

In [47]:
sample_labels = pd.concat([train_df['label'].head(5), train_df['label'].tail(5)])

In [48]:
obj=PreprocessingSteps(sample_data)

In [49]:
clean_data=obj.pre_process_all_steps()

In [50]:
clean_data.iloc[0]

'movie get respect sure lot memorable quote list gem imagine movie toe piscopo actually funny maureen stapleton scene steamer corona character absolute scream watch plan skipper pale jr police'

In [51]:
pure_transformation_pipeline = Pipeline(steps=[
           ('text_preproc', CustomTextPreprocessor()),('bow', CountVectorizer()),('tfidf', TfidfTransformer())])

In [28]:
tf_idf=pure_transformation_pipeline.fit_transform(sample_data)

In [29]:
tf_idf.shape

(10, 629)

In [33]:
tf_idf[0].toarray()

array([[0.        , 0.19797653, 0.        , 0.        , 0.        ,
        0.        , 0.16829814, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.14724099, 0.        , 0.        , 0.  

# Naive Bayes on sample Data

In [54]:
prediction_pipeline = Pipeline(steps=[
           ('text_preproc', CustomTextPreprocessor()),
           ('bow', CountVectorizer()),
           ('tfidf', TfidfTransformer()),
           ('bernoulli', BernoulliNB())])

In [55]:
prediction_pipeline.fit(sample_data,sample_labels)

In [56]:
y_pred = prediction_pipeline.predict(sample_data)

In [57]:
y_pred

array([ 1,  1,  1,  1,  1, -1, -1, -1, -1, -1])