In [1]:
import pandas as pd

# Shared path prefix; each CSV is addressed as '<prefix>.<suffix>.csv'.
DATA_PATH: str = '../data/imdb'

# Read the unprocessed reviews into memory.
raw: pd.DataFrame = pd.read_csv(
    filepath_or_buffer=f'{DATA_PATH}._raw.csv',
)

In [2]:
# Clean-up stage: keep only the two columns of interest, strip HTML tags from
# the review text, store the sentiment label as a categorical, then drop
# duplicate rows and renumber the index from zero.
formatted: pd.DataFrame = (
    raw[['review', 'sentiment']]
    .assign(
        review=lambda frame: frame['review'].str.replace(
            r'<[^<]+?>', '', regex=True
        ),
        sentiment=lambda frame: frame['sentiment'].astype('category'),
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
formatted

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49576,I thought this movie did a down right good job...,positive
49577,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49578,I am a Catholic taught in parochial elementary...,negative
49579,I'm going to have to disagree with the previou...,negative


# Split into train, eval, test

In [3]:
def split(data: pd.DataFrame, sizes: list) -> dict:
    """Cut ``data`` into consecutive, non-overlapping slices.

    Args:
        data: Frame (or any other sliceable sequence) to partition. Rows are
            taken in their existing order; nothing is shuffled here.
        sizes: Ordered ``(label, fraction)`` pairs, where each fraction is the
            share of ``len(data)`` the labelled slice should receive.

    Returns:
        A dict mapping each label to its slice, in the order given by
        ``sizes``. Every slice starts exactly where the previous one ended,
        so no row can land in more than one slice.
    """
    total = len(data)
    parts = {}
    start = 0
    cumulative = 0.0
    for label, size in sizes:
        # The boundary must come from the running sum of ALL earlier
        # fractions. Using only the previous fraction as the offset makes the
        # third and later slices restart near the top of the data and overlap
        # the first slice.
        cumulative += size
        # Round away floating-point drift (e.g. 7.999999999 -> 8) before
        # truncating, and never run past the end of the data.
        stop = min(total, int(round(cumulative * total, 6)))
        parts[label] = data[start:stop]
        start = stop
    return parts

In [4]:
# Partition the cleaned reviews using fractions of 0.8 / 0.1 / 0.1.
datasets = split(
    formatted,
    [('train', 0.8), ('eval', 0.1), ('test', 0.1)],
)
datasets

{'train':                                                   review sentiment
 0      One of the other reviewers has mentioned that ...  positive
 1      A wonderful little production. The filming tec...  positive
 2      I thought this was a wonderful way to spend ti...  positive
 3      Basically there's a family where a little boy ...  negative
 4      Petter Mattei's "Love in the Time of Money" is...  positive
 ...                                                  ...       ...
 39659  Dr. Lucio Fulci (Lucio Fulci) is a director of...  negative
 39660  Yes I admit I cried during this movie. It was ...  negative
 39661  I've bought certain films on disc even though ...  positive
 39662  The final pairing of Nelson Eddy and Jeanette ...  negative
 39663  I watch romantic comedies with some hesitation...  positive
 
 [39664 rows x 2 columns],
 'eval':                                                   review sentiment
 39664  I usually check out the MTV movie awards to wa...  negative
 3

In [5]:
# Write each split to its own CSV next to the raw file, without the index column.
for label in datasets:
    datasets[label].to_csv(f'{DATA_PATH}.{label}.csv', index=False)