In [1]:
DATA_PATH: str = '../data/imdb'
SAMPLE_SIZE: float = 0.05 # set sample size as fraction

In [2]:
from typing import Dict
import pandas as pd


# load raw dataset (not included, please use notebook 00 first)
datasets: Dict[str, pd.DataFrame] = {
    'train': pd.read_csv(f'{DATA_PATH}._raw.train.csv'),
    'test': pd.read_csv(f'{DATA_PATH}._raw.test.csv')
}

In [3]:
for label, data in datasets.items():
    # format: remove html tags, convert sentiment to category
    datasets[label]: pd.DataFrame = (
        pd.DataFrame()
        .assign(
            text=data['text']
            .str.replace(r'<[^<]+?>', '', regex=True),
            sentiment=data['sentiment']
            .astype('category')
        )
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(frac=1) # shuffling
    )
    display(data)
    display(data['sentiment'].value_counts(normalize=True))


Unnamed: 0,text,sentiment
0,Manoj Agrawal after the failure of PARDESI BAB...,negative
1,"OK.... I just have 3 words - cheesy, cheesy an...",negative
2,The main reason I wanted to see this movie was...,positive
3,"From the Q & A before and after, this is what ...",negative
4,That was definitely the case with Angels in th...,positive
...,...,...
24995,"I'm not a huge Star Trek fan, but I was lookin...",negative
24996,I expected to enjoy a romantic comedy featurin...,negative
24997,Some comments here on IMDb have likened Dog Bi...,positive
24998,"I agree with all the accolades, I went through...",positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
0,This movie was astonishing how good it was! Th...,positive
1,"Normally, I don't like Chuck Norris films. I a...",negative
2,Le conseguenze dell'amore (2004)is a beautiful...,positive
3,Don't watch this movie expecting the Jane Aust...,positive
4,This is one of the funniest movies I have ever...,positive
...,...,...
24995,There were very few good moments in this film....,negative
24996,...and I'm so disappointed because I can't see...,positive
24997,Tim Robbins did a masterful job directing this...,positive
24998,"When i went to see this i thought, i liked the...",negative


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

In [4]:
# save processed data to csv
for label, data in datasets.items():
    data.to_csv(f'{DATA_PATH}._prepped.{label}.csv', index=False)

In [5]:
# create, log and save sample to csv
for label, data in datasets.items():
    sample: pd.DataFrame = data.sample(frac=SAMPLE_SIZE)
    display(sample)
    display(sample['sentiment'].value_counts(normalize=True))

    sample.to_csv(f'{DATA_PATH}.sample.{label}.csv', index=False)

Unnamed: 0,text,sentiment
6139,Please Note: I see from the various posts that...,negative
22546,This is going to be the most useless comment I...,negative
20983,I was excited to hear that Cesar Montano had d...,negative
11664,No sense going over the story since enough rev...,positive
24020,One of my best friends brought this movie over...,negative
...,...,...
11925,"Hello there,This is my first post in IMDb even...",negative
2233,"On the surface, this movie would appear to dea...",negative
17733,'Iphigenia' is the great achievement of Michae...,positive
23008,I wanted to see Valentine ever since I saw tha...,positive


negative    0.505221
positive    0.494779
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
5821,It is extremely rare that I see a movie from 1...,negative
1884,Let's cut through everything in the first para...,negative
22281,This for one has nothing to do with the absolu...,negative
9605,When I saw this movie i expected it to be a ch...,positive
539,We sat through this movie thinking why is this...,negative
...,...,...
326,Excellent entry in the RKO Saint series with w...,positive
382,"THE SEA INSIDE (2004) **** Javier Bardem, Bele...",positive
16178,Francis Ford Coppola wrote and directed this s...,positive
5224,"Well, the movie was no terrible, but whomever ...",negative


positive    0.516935
negative    0.483065
Name: sentiment, dtype: float64