In [1]:
DATA_PATH: str = '../data/imdb'
SAMPLE_SIZE: float = 0.05 # set sample size as fraction

In [2]:
from typing import Dict
import pandas as pd


# load raw dataset (not included, please use notebook 00 first)
datasets: Dict[str, pd.DataFrame] = {
    'train': pd.read_csv(f'{DATA_PATH}._raw.train.csv'),
    'test': pd.read_csv(f'{DATA_PATH}._raw.test.csv')
}

In [3]:
for label, data in datasets.items():
    # format: remove html tags, convert sentiment to category
    datasets[label]: pd.DataFrame = (
        pd.DataFrame()
        .assign(
            text=data['text']
            .str.replace(r'<[^<]+?>', '', regex=True),
            sentiment=data['sentiment']
            .astype('category')
        )
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(frac=1) # shuffling
    )
    display(data)
    display(data['sentiment'].value_counts(normalize=True))


Unnamed: 0,text,sentiment
0,The location of the shop around the corner is ...,positive
1,As a horse lover one can only appreciate this ...,positive
2,"As gently as I can, I sincerely believe this m...",negative
3,I saw a preview of Freebird at the Isle of Man...,positive
4,"""Chinese Ghost Story"" is one of the most amazi...",positive
...,...,...
24995,I screamed my head off because seeing this mov...,negative
24996,I really enjoyed The 60's. Not being of that g...,positive
24997,"Level One, Horror.<br /><br />When I saw this ...",positive
24998,Notice that all those that did not like and en...,positive


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
0,The Movie is okay. Meaning that I don't regret...,negative
1,Albert Pyun presents his vision of the lost ci...,negative
2,This movie will undoubtably not go over well w...,positive
3,"While watching this film recently, I constantl...",positive
4,The cast was well picked. Pauly Shore is hilar...,positive
...,...,...
24995,scarlet coat like most revolution flicks wasnt...,positive
24996,The initiation to the local sport team involve...,negative
24997,Very poor quality and the acting is equally as...,negative
24998,If You can watch a film without worrying about...,negative


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [4]:
# save processed data to csv
for label, data in datasets.items():
    data.to_csv(f'{DATA_PATH}._prepped.{label}.csv', index=False)

In [5]:
# create, log and save sample to csv
for label, data in datasets.items():
    sample: pd.DataFrame = data.sample(frac=SAMPLE_SIZE)
    display(sample)
    display(sample['sentiment'].value_counts(normalize=True))

    sample.to_csv(f'{DATA_PATH}.sample.{label}.csv', index=False)

Unnamed: 0,text,sentiment
18954,"And it falls squarely into the category of ""aw...",negative
9428,This is one seriously disturbed movie. Even Th...,negative
2506,"Basically this is an overlong, unfunny, action...",negative
22571,Hey if you have a little over an hour to kill ...,negative
12076,Did anyone read the script. This has to be som...,negative
...,...,...
9051,"First of all, Jenna Jameson is the best actres...",negative
18335,"I didnt think it was possible, but i have foun...",negative
4134,"OK, I taped this off TV and missed the very st...",negative
20869,"Okay, okay, maybe not THE greatest. I mean, Th...",positive


negative    0.503614
positive    0.496386
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
17948,This...... Movie.... Is..... Horrible!!!!!! Yo...,negative
7385,At the same time John Russell was playing ranc...,positive
18262,This is the best version of Gypsy that has bee...,positive
14364,"It's just stories, some we wish happen to us, ...",positive
11275,"This film, without doubt, is the clearest exam...",positive
...,...,...
22266,"""Cooley High"" is one of my favorite movies EVE...",positive
24441,The Comic Strip featured actors from 'The Youn...,negative
11776,I suppose you could say this film has a grain ...,negative
23526,"Having just watched Acacia, I find that I have...",negative


negative    0.520968
positive    0.479032
Name: sentiment, dtype: float64