In [1]:
import math

import pandas as pd

# Shared path prefix of the dataset files; a '.<name>.csv' suffix is appended
# wherever a file is read or written.
DATA_PATH: str = '../data/imdb'

# load raw dataset
# NOTE(review): the final cell writes the 'train' split to this same
# '<DATA_PATH>.train.csv' path, so the file read here is replaced after a
# full run -- confirm that this is intended.
raw: pd.DataFrame = pd.read_csv(f'{DATA_PATH}.train.csv')

In [3]:
# Seed for the shuffle below, so that re-running the notebook on a fresh kernel
# yields the same row order and therefore the same downstream split membership.
RANDOM_SEED: int = 42

# format: remove html tags, convert sentiment to category
formatted: pd.DataFrame = (
    pd.DataFrame()
    .assign(
        # drop every '<...>' span (the regex matches '<', then one or more
        # non-'<' characters, up to the nearest '>')
        review=raw['review']
        .str.replace(r'<[^<]+?>', '', regex=True),
        sentiment=raw['sentiment']
        .astype('category')
    )
    # duplicates are detected on the tag-stripped text plus its label
    .drop_duplicates()
    .reset_index(drop=True)
    # shuffle all rows; each row keeps the index label it had before shuffling
    .sample(frac=1, random_state=RANDOM_SEED)
)
formatted

Unnamed: 0,review,sentiment
10449,John Carpenter's Halloween is quite frankly a ...,positive
17483,This is a strange sex comedy because there`s v...,negative
7717,My friends and I were just discussing how frus...,positive
19830,"The premise is ridiculous, the characters unbe...",negative
381,I enjoyed every moment of this beautiful film ...,positive
...,...,...
4998,"""Ah Ritchie's made another gangster film with ...",positive
20363,I always thought people were a little too cyni...,negative
13720,PROBLEM CHILD is one of the worst movies I hav...,negative
18306,I saw this movie a few months ago in the town ...,negative


# Split into train and eval sets

In [4]:
def split(data: pd.DataFrame, sizes: list) -> dict:
    """Cut ``data`` into consecutive, non-overlapping row ranges.

    Args:
        data: Frame to partition. Rows are taken by position in their current
            order; nothing is shuffled here.
        sizes: ``(label, fraction)`` pairs in the order the slices should be
            taken, e.g. ``[('train', 0.8), ('eval', 0.1), ('test', 0.1)]``.
            Each fraction is relative to ``len(data)``.

    Returns:
        Dict mapping every label to its slice of ``data``, in the order the
        labels were given. An empty ``sizes`` yields an empty dict.
    """
    total = len(data)
    fractions = [size for _, size in sizes]

    parts = {}
    start = 0
    for n, (label, _) in enumerate(sizes):
        # The end of slice n is the sum of ALL fractions up to and including
        # n. Using only the previous fraction as the offset (as the earlier
        # version did) misplaces every slice after the second one.
        # math.fsum avoids accumulated float error in that running sum.
        stop = int(math.fsum(fractions[:n + 1]) * total)
        # a plain integer slice on a DataFrame selects rows by position
        parts[label] = data[start:stop]
        # the next slice starts exactly where this one ended: no gap, no overlap
        start = stop
    return parts

In [5]:
# Leading 90 % of the shuffled rows become the training set, the remaining
# 10 % the evaluation set.
datasets = split(
    data=formatted,
    sizes=[('train', 0.9), ('eval', 0.1)],
)
datasets

{'train':                                                   review sentiment
 10449  John Carpenter's Halloween is quite frankly a ...  positive
 17483  This is a strange sex comedy because there`s v...  negative
 7717   My friends and I were just discussing how frus...  positive
 19830  The premise is ridiculous, the characters unbe...  negative
 381    I enjoyed every moment of this beautiful film ...  positive
 ...                                                  ...       ...
 21133  The plot of 'Edison' was decent, but one actor...  negative
 20333  Amateurism best describes the film adaptation ...  negative
 14285  After having seen the movie the first question...  negative
 2207   I was taken to this film by a friend and was s...  positive
 14535  Sholay: Considered to be one of the greatest f...  negative
 
 [20170 rows x 2 columns],
 'eval':                                                   review sentiment
 22224  I very nearly walked out, but I'd paid my mone...  negative
 2

In [10]:
# `IPython.display` is the public entry point for `display`; the
# `IPython.core.display_functions` module is an internal location.
from IPython.display import display

# show the class balance (relative frequency of each sentiment) per split;
# the outputs appear in the order of the keys in `datasets`
for label, data in datasets.items():
    display(data['sentiment'].value_counts(normalize=True))

positive    0.557362
negative    0.442638
Name: sentiment, dtype: float64

positive    0.548617
negative    0.451383
Name: sentiment, dtype: float64

In [11]:
# Persist every split as '<DATA_PATH>.<label>.csv', leaving out the index column.
# NOTE(review): for the 'train' label this is the same path the raw data was
# loaded from at the top of the notebook, so that file gets overwritten --
# confirm this is intended.
for label, data in datasets.items():
    data.to_csv(
        path_or_buf=f'{DATA_PATH}.{label}.csv',
        index=False,
    )