In [1]:
DATA_PATH: str = '../data/imdb'
SAMPLE_SIZE: float = 0.05 # set sample size as fraction

In [2]:
from typing import Dict
import pandas as pd


# load raw dataset (not included, please use notebook 00 first)
datasets: Dict[str, pd.DataFrame] = {
    'train': pd.read_csv(f'{DATA_PATH}._raw.train.csv'),
    'test': pd.read_csv(f'{DATA_PATH}._raw.test.csv')
}

In [3]:
for label, data in datasets.items():
    # format: remove html tags, convert sentiment to category
    datasets[label]: pd.DataFrame = (
        pd.DataFrame()
        .assign(
            text=data['text']
            .str.replace(r'<[^<]+?>', '', regex=True),
            sentiment=data['sentiment']
            .astype('category')
        )
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(frac=1) # shuffling
    )
    display(data)
    display(data['sentiment'].value_counts(normalize=True))


Unnamed: 0,text,sentiment
0,I love this show and my 11 year-old daughter a...,positive
1,I watched the movie while recovering from majo...,positive
2,blows my mind how this movie got made. i watch...,negative
3,Soapdish may go down as one of the single most...,positive
4,I have to say that Grand Canyon is one of the ...,positive
...,...,...
24995,"Well, I had seen ""They all laughed"" when it ca...",positive
24996,I love all his work but this looks like nothin...,negative
24997,This film was on last week and although at tha...,positive
24998,A lot of themes or parts of the story is the s...,negative


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
0,"Awful, dreadful, terrible. The actors are bad,...",negative
1,"Okay, so I love silly movies. If you enjoy sil...",positive
2,This film should never have been made! It stin...,negative
3,For those of you unfamiliar with Alisdair Sims...,positive
4,The film was half over before I managed to fig...,negative
...,...,...
24995,"Based on actual events of 1905, silent film TH...",positive
24996,This is one of the better sci-fi series. It in...,positive
24997,"A lovely librarian, played by Playboy model Kr...",positive
24998,This era was not just the dawn of sound in car...,positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [4]:
# save processed data to csv
for label, data in datasets.items():
    data.to_csv(f'{DATA_PATH}._prepped.{label}.csv', index=False)

In [5]:
# create, log and save sample to csv
for label, data in datasets.items():
    sample: pd.DataFrame = data.sample(frac=SAMPLE_SIZE)
    display(sample)
    display(sample['sentiment'].value_counts(normalize=True))

    sample.to_csv(f'{DATA_PATH}.sample.{label}.csv', index=False)

Unnamed: 0,text,sentiment
12115,Aside from the horrendous acting and the ridic...,negative
13716,Can such an ambient production have failed its...,positive
5278,Oh-so-familiar comedy story about low-key nice...,negative
4213,"In the beginning of this film, one of the comm...",positive
4790,"Yes, some plots are a bit hard to follow, and ...",positive
...,...,...
12693,"I have watched 3 episodes of Caveman, and I ha...",negative
3595,"It's a very good movie, not only for the fans ...",positive
9869,"Modern, original, romantic story.Very good act...",positive
9320,The Straight Story is a multilevel exploration...,positive


positive    0.519679
negative    0.480321
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
2916,"Out of the top 24 lesbian films in my library,...",positive
16921,I went to see this movie with a lady freind of...,positive
14291,I have to say that I really liked UNDER SIEGE ...,negative
8422,I paid attention and enjoyed the very rich exp...,positive
24670,"A bad Quentin Tarantino rip off, at least I ho...",negative
...,...,...
20187,A group of obnoxious teens go to a former fune...,positive
2092,This is one of the movies having made signific...,positive
1420,I approached this movie with the understanding...,negative
11113,I watched fantabulosa! because over the last f...,positive


positive    0.504839
negative    0.495161
Name: sentiment, dtype: float64