In [1]:
import pandas as pd

# Shared prefix for every dataset file; a '.<split>.csv' suffix is appended to it.
DATA_PATH: str = '../data/imdb'

# Read the unprocessed reviews into memory.
raw: pd.DataFrame = pd.read_csv(filepath_or_buffer=f'{DATA_PATH}.train.csv')

In [2]:
# Clean the raw frame: strip HTML tags from the review text, store the
# sentiment label as a categorical, drop duplicate rows and renumber the index.
formatted: pd.DataFrame = (
    pd.DataFrame({
        'review': raw['review'].str.replace(r'<[^<]+?>', '', regex=True),
        'sentiment': raw['sentiment'].astype('category'),
    })
    .drop_duplicates()
    .reset_index(drop=True)
)
formatted

Unnamed: 0,review,sentiment
0,For a movie that gets no respect there sure ar...,positive
1,Bizarre horror movie filled with famous faces ...,positive
2,"A solid, if unremarkable film. Matthau, as Ein...",positive
3,It's a strange feeling to sit alone in a theat...,positive
4,"You probably all already know this by now, but...",positive
...,...,...
24898,"My comments may be a bit of a spoiler, for wha...",negative
24899,"The ""saucy"" misadventures of four au pairs who...",negative
24900,"Oh, those Italians! Assuming that movies about...",negative
24901,Eight academy nominations? It's beyond belief....,negative


# Split into train and eval

In [3]:
def split(data: pd.DataFrame, sizes: list) -> dict:
    """Partition ``data`` into consecutive, non-overlapping row slices.

    Rows are taken in their existing order; nothing is shuffled.

    Args:
        data: Frame to partition.
        sizes: Ordered ``(label, fraction)`` pairs, where ``fraction`` is the
            share of ``len(data)`` assigned to that label.

    Returns:
        Dict mapping each label to its slice of ``data``, in the order the
        labels appear in ``sizes``.
    """
    n_rows = len(data)

    def boundary(fraction: float) -> int:
        # The tiny epsilon keeps float round-off (e.g. 0.7 + 0.2 evaluating to
        # 0.8999999999999999) from truncating one row short of the intended cut.
        return int(fraction * n_rows + 1e-9)

    parts = {}
    consumed = 0.0  # running total of the fractions already handed out
    for label, size in sizes:
        # Every slice begins where the previous one ended, so the start must be
        # the cumulative fraction, not just the preceding entry's own size;
        # otherwise the third and later slices would overlap earlier ones.
        start = boundary(consumed)
        consumed += size
        parts[label] = data[start:boundary(consumed)]
    return parts

In [4]:
# The first 90% of the rows become the training set, the remaining 10% the eval set.
datasets = split(
    data=formatted,
    sizes=[('train', 0.9), ('eval', 0.1)],
)
datasets

{'train':                                                   review sentiment
 0      For a movie that gets no respect there sure ar...  positive
 1      Bizarre horror movie filled with famous faces ...  positive
 2      A solid, if unremarkable film. Matthau, as Ein...  positive
 3      It's a strange feeling to sit alone in a theat...  positive
 4      You probably all already know this by now, but...  positive
 ...                                                  ...       ...
 22407  Stifler, has finished running his naked mile a...  negative
 22408  Nowadays it is sort of a trend to look upon al...  negative
 22409  This is without a doubt one of the worst movie...  negative
 22410  There's a lot the matter with Helen and none o...  negative
 22411  I ordered this extremely rare and highly overr...  negative
 
 [22412 rows x 2 columns],
 'eval':                                                   review sentiment
 22412  This was a better than average movie I thought...  negative
 2

In [5]:
# Persist every split as '<DATA_PATH>.<label>.csv', without the index column.
# NOTE(review): the 'train' split is written to the same '.train.csv' path the
# raw data is read from, so a re-run re-splits an already reduced file --
# confirm this overwrite is intended.
for label, subset in datasets.items():
    target = f'{DATA_PATH}.{label}.csv'
    subset.to_csv(target, index=False)

# Split into train, eval, test

In [3]:
def split(data: pd.DataFrame, sizes: list) -> dict:
    """Partition ``data`` into consecutive, non-overlapping row slices.

    Rows are taken in their existing order; nothing is shuffled.

    Args:
        data: Frame to partition.
        sizes: Ordered ``(label, fraction)`` pairs, where ``fraction`` is the
            share of ``len(data)`` assigned to that label.

    Returns:
        Dict mapping each label to its slice of ``data``, in the order the
        labels appear in ``sizes``.
    """
    n_rows = len(data)

    def boundary(fraction: float) -> int:
        # The tiny epsilon keeps float round-off (e.g. 0.7 + 0.2 evaluating to
        # 0.8999999999999999) from truncating one row short of the intended cut.
        return int(fraction * n_rows + 1e-9)

    parts = {}
    consumed = 0.0  # running total of the fractions already handed out
    for label, size in sizes:
        # Every slice begins where the previous one ended, so the start must be
        # the cumulative fraction, not just the preceding entry's own size;
        # otherwise the third and later slices would overlap earlier ones.
        start = boundary(consumed)
        consumed += size
        parts[label] = data[start:boundary(consumed)]
    return parts

In [4]:
# Request an 80/10/10 train/eval/test partition of the cleaned reviews.
datasets = split(
    data=formatted,
    sizes=[('train', 0.8), ('eval', 0.1), ('test', 0.1)],
)
datasets

{'train':                                                   review sentiment
 0      One of the other reviewers has mentioned that ...  positive
 1      A wonderful little production. The filming tec...  positive
 2      I thought this was a wonderful way to spend ti...  positive
 3      Basically there's a family where a little boy ...  negative
 4      Petter Mattei's "Love in the Time of Money" is...  positive
 ...                                                  ...       ...
 39659  Dr. Lucio Fulci (Lucio Fulci) is a director of...  negative
 39660  Yes I admit I cried during this movie. It was ...  negative
 39661  I've bought certain films on disc even though ...  positive
 39662  The final pairing of Nelson Eddy and Jeanette ...  negative
 39663  I watch romantic comedies with some hesitation...  positive
 
 [39664 rows x 2 columns],
 'eval':                                                   review sentiment
 39664  I usually check out the MTV movie awards to wa...  negative
 3

In [5]:
# Persist every split as '<DATA_PATH>.<label>.csv', without the index column.
# NOTE(review): if the raw data was read from '<DATA_PATH>.train.csv', writing
# the 'train' split here overwrites that input -- confirm this is intended.
for label, subset in datasets.items():
    target = f'{DATA_PATH}.{label}.csv'
    subset.to_csv(target, index=False)