In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

### Loading

In [3]:
DATA_ROOT = Path("../../data")
TARGET_DATA = "IMDB Dataset.csv"

data = pd.read_csv(DATA_ROOT / "raw/imdb" / TARGET_DATA)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
data["sentiment"] = data["sentiment"].replace({
  "negative": 0,
  "positive": 1,
})

  data["sentiment"] = data["sentiment"].replace({


### Processing

In [5]:
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [6]:
import nltk
from nltk.tokenize import ToktokTokenizer

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [7]:
import re
from bs4 import BeautifulSoup


def denoise_text(text: str) -> str:
    soup = BeautifulSoup(text, "html.parser")

    tmp = soup.get_text()
    return re.sub(r"\[[^]]*\]", '', tmp)


data["review"] = data["review"].apply(denoise_text)

In [8]:
def remove_special_characters(text: str) -> str:
    pattern=r'[^a-zA-z\s]'
    return re.sub(pattern, '', text)


data["review"] = data["review"].apply(remove_special_characters)

In [9]:
def simple_stemmer(text: str) -> str:
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

data["review"] = data["review"].apply(simple_stemmer)

In [10]:
data

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,1
1,a wonder littl product the film techniqu is ve...,1
2,i thought thi wa a wonder way to spend time on...,1
3,basic there a famili where a littl boy jake th...,0
4,petter mattei love in the time of money is a v...,1
...,...,...
49995,i thought thi movi did a down right good job i...,1
49996,bad plot bad dialogu bad act idiot direct the ...,0
49997,i am a cathol taught in parochi elementari sch...,0
49998,im go to have to disagre with the previou comm...,0


In [11]:
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
print(stop)

def remove_stopwords(text: str) -> str:
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]

    return ' '.join(filtered_tokens)    


data["review"] = data["review"].apply(remove_stopwords)

{"couldn't", 'of', 'with', 'down', 'isn', 'mightn', 'have', 'herself', 'didn', "they'd", 'hadn', 'its', "he's", 'does', 'me', 'under', 'then', 'up', 'only', "weren't", 'what', 'will', 'below', 'wouldn', 'are', 'out', 'hers', 'o', "she's", "you'll", 'yours', 'had', 'between', 'her', "isn't", 'now', 'we', 'from', 've', 'at', "they've", 'whom', 'wasn', 'before', 'is', 'just', 'as', 'him', 'shan', 'yourself', 'm', "she'd", 'he', 'weren', 'again', 'that', 'theirs', "it'll", "we've", 'but', 'll', 'few', 'each', 'or', "we'll", "he'd", 'no', 'than', 'y', 'more', 'don', 'during', "he'll", "i'll", 'own', 'against', 're', 'haven', 'having', 'his', "i'd", 'over', 'further', 'yourselves', 'when', 'a', 'same', 'there', "hadn't", 'can', 'above', 'was', "mustn't", 'won', 'for', 'you', 'itself', "i've", 'why', "they're", 'do', 'once', 'be', 'couldn', "wasn't", "you're", 'i', 'ours', 'while', "mightn't", 'about', 'being', "you'd", 'all', 'doing', 'into', 'such', 'if', 'too', 'himself', "you've", 'other'

In [12]:
#normalized train reviews
# norm_train_reviews=imdb_data.review[:40000]
# norm_train_reviews[0]

#convert dataframe to string
# norm_train_string=norm_train_reviews.to_string()

#Spelling correction using Textblob
# norm_train_spelling=TextBlob(norm_train_string)
#norm_train_spelling.correct()

#Tokenization using Textblob
#norm_train_words=norm_train_spelling.words
#norm_train_words

data

Unnamed: 0,review,sentiment
0,one review ha mention watch oz episod youll ho...,1
1,wonder littl product film techniqu veri unassu...,1
2,thought thi wa wonder way spend time hot summe...,1
3,basic famili littl boy jake think zombi hi clo...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,thought thi movi right good job wasnt creativ ...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,im go disagre previou comment side maltin thi ...,0


### Vectorization

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(
    # min_df=0.0, 
    # max_df=1.0,
    # ngram_range=(1,3),
)
embedded = vect.fit_transform(data["review"])

In [14]:
vect.get_feature_names_out()[600:700]

array(['abrupt', 'abruptli', 'abruptlydirect', 'abruptnessther',
       'absalom', 'abscbn', 'abscess', 'abscond', 'abseil', 'absenc',
       'absencein', 'absenceth', 'absens', 'absent', 'absentalic',
       'absentbut', 'absente', 'absenther', 'absentia', 'absentiaand',
       'absentmi', 'absentmind', 'absentminded', 'absentmindedli',
       'absentmindedyetcap', 'absentoh', 'absentsubplot', 'absentth',
       'absinth', 'abskani', 'absolom', 'absolout', 'absolu', 'absolut',
       'absolutelli', 'absolutelyconfus', 'absolutelyfantast',
       'absolutelysham', 'absolutelyugh', 'absolutey', 'absolutl',
       'absolutley', 'absolutli', 'absolv', 'absorb', 'absorbedlik',
       'absorbingli', 'absorpt', 'absoul', 'absoulutley', 'absout',
       'absoutley', 'abstain', 'abstin', 'abstinencethi', 'abstract',
       'abstractsurrealparallel', 'abstrus', 'absurd', 'absurda',
       'absurdabout', 'absurdand', 'absurdanesthesiologist',
       'absurdanoth', 'absurdbut', 'absurddiari', 'ab

In [15]:
embedded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4915462 stored elements and shape (50000, 174913)>

### Data dump

In [21]:
# data = embedded.toarray()
target = data["sentiment"]

with open(DATA_ROOT / "processed/imdb" / TARGET_DATA.replace(".csv", ".npz"), "xb") as f:
    np.savez(
        f,
        data=embedded,
        target=target,
        spatial=embedded,
    ) 

In [17]:
embedded.shape

(50000, 174913)

In [18]:
data

Unnamed: 0,review,sentiment
0,one review ha mention watch oz episod youll ho...,1
1,wonder littl product film techniqu veri unassu...,1
2,thought thi wa wonder way spend time hot summe...,1
3,basic famili littl boy jake think zombi hi clo...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,thought thi movi right good job wasnt creativ ...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,im go disagre previou comment side maltin thi ...,0


In [None]:
embedded.shape

(50000, 174913)