In [1]:
DATA_PATH: str = '../data/imdb'
SAMPLE_SIZE: float = 0.05 # set sample size as fraction

In [2]:
from typing import Dict
import pandas as pd


# load raw dataset (not included, please use notebook 00 first)
datasets: Dict[str, pd.DataFrame] = {
    'train': pd.read_csv(f'{DATA_PATH}._raw.train.csv'),
    'test': pd.read_csv(f'{DATA_PATH}._raw.test.csv')
}

In [3]:
for label, data in datasets.items():
    # format: remove html tags, convert sentiment to category
    datasets[label]: pd.DataFrame = (
        pd.DataFrame()
        .assign(
            text=data['text']
            .str.replace(r'<[^<]+?>', '', regex=True),
            sentiment=data['sentiment']
            .astype('category')
        )
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(frac=1) # shuffling
    )
    display(data)
    display(data['sentiment'].value_counts(normalize=True))


Unnamed: 0,text,sentiment
0,Here are the matches . . . (adv. = advantage)<...,negative
1,STAR RATING: ***** Saturday Night **** Friday ...,negative
2,The traditional Western is synonymous with wid...,positive
3,"Brilliant actor as he is, Al Pacino completely...",negative
4,This is the most recent addition to a new wave...,positive
...,...,...
24995,This film is a portrait of the half-spastic te...,negative
24996,A warning to you not to be seduced by the name...,negative
24997,I just saw this film last night at Toronto Fil...,positive
24998,Sadly it was misguided. This movie stunk from ...,negative


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
0,"Most likely ""Cleopatra 2525"" will be of little...",negative
1,Rita Hayworth plays a Brooklyn nightclub dance...,negative
2,"This film was okay, but like most TV series it...",negative
3,I saw the film yesterday and really enjoyed it...,positive
4,"I saw the movie in 1972, and like other people...",positive
...,...,...
24995,Mom has to be one of the all time uncomfortabl...,positive
24996,"At last, a film to rival 'El Padrino' and 'Dar...",negative
24997,If this film had been made in the 50's or 60's...,positive
24998,I opted to watch this film for one reason and ...,positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [4]:
# save processed data to csv
for label, data in datasets.items():
    data.to_csv(f'{DATA_PATH}._prepped.{label}.csv', index=False)

In [5]:
# create, log and save sample to csv
for label, data in datasets.items():
    sample: pd.DataFrame = data.sample(frac=SAMPLE_SIZE)
    display(sample)
    display(sample['sentiment'].value_counts(normalize=True))

    sample.to_csv(f'{DATA_PATH}.sample.{label}.csv', index=False)

Unnamed: 0,text,sentiment
13935,The acting in this movie was superb. As an ama...,positive
8327,Watching That Lady In Ermine I was wondering w...,negative
8657,"Why?!! This was an insipid, uninspired and emb...",negative
21062,"Although the plot was a bit sappy at times, an...",positive
20445,This is one of the few comedies I can watch ag...,positive
...,...,...
21972,This film has a special place in my heart as t...,negative
20389,OK me and a friend rented this a few days ago ...,negative
2598,Five-year-old Michael sees his mother getting ...,negative
14955,Every movie critic and metal head hated this m...,positive


positive    0.515663
negative    0.484337
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
19259,one of my favorite lines in Shakespeare.i.e. *...,positive
17256,A poor basketball movie. A gruff coach with a ...,negative
9728,"Based on the book ""Space Vampires"" by Colin Wi...",positive
24096,There are about ten minutes about half way thr...,negative
22344,>>> Great News there is a BBC DVD release sche...,positive
...,...,...
14852,Sheba Shayne (Pam Grier) receives a telegram i...,negative
14899,"On one level, Hari Om is a film using a famili...",positive
17820,"OK, so I gotta start this review by saying i w...",negative
6409,My daughter already wrote a review of this mov...,positive


negative    0.514516
positive    0.485484
Name: sentiment, dtype: float64