In [1]:
# The original IMDB review corpus is not included in this repository;
# please download and extract it first.
# src: ai.stanford.edu/~amaas/data/sentiment/
INPUT_DIR: str = '../data/_aclImdb/'
# NOTE: despite the name, the export cell uses this value as a filename prefix
# (files are written as '<EXPORT_DIR>._raw.<split>.csv'), not as a directory.
EXPORT_DIR: str = '../data/imdb'

In [2]:
import os
from typing import Dict, Optional

import numpy as np
import pandas as pd


def load_train_test_imdb_data(
    data_dir: str, seed: Optional[int] = None
) -> Dict[str, pd.DataFrame]:
    """Loads the IMDB train/test datasets from a folder path.
    src: Shiva Krishna Gajavelli
    https://github.com/shivakrishna2497/Sentiment-Analysis-of-IMDB-Movie-Reviews/blob/master/

    Every file found in ``<data_dir>/<split>/<neg|pos>`` (for the splits
    "train" and "test") is read as one review and labelled "negative" or
    "positive" according to the folder it came from. The rows of each split
    are shuffled before the dataframe is built.

    Args:
        data_dir: path to the "aclImdb" folder.
        seed: optional seed for the shuffle. When None (default) the global
            NumPy random state is used, as before; when given, a dedicated
            ``RandomState(seed)`` is used so repeated calls return the same
            row order.

    Returns:
        dict containing train/test datasets as pandas dataframes with the
        columns 'text' and 'sentiment'.

    Raises:
        OSError: if one of the expected split/sentiment folders is missing
            or a file cannot be read.
        UnicodeDecodeError: if a review file is not valid UTF-8.

    """
    # Fall back to the module-level RNG so callers that seed NumPy globally
    # keep working exactly as they did before the `seed` parameter existed.
    rng = np.random if seed is None else np.random.RandomState(seed)

    folder_to_label = {"neg": "negative", "pos": "positive"}
    datasets: Dict[str, pd.DataFrame] = {}

    for split in ("train", "test"):
        records = []

        for sentiment, score in folder_to_label.items():
            path = os.path.join(data_dir, split, sentiment)

            # os.listdir returns entries in arbitrary order; sorting makes the
            # pre-shuffle order (and thus a seeded shuffle) deterministic.
            for f_name in sorted(os.listdir(path)):
                # Explicit encoding: the platform default is locale dependent
                # and can fail or mis-decode non-ASCII reviews.
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    records.append([f.read(), score])

        rng.shuffle(records)
        datasets[split] = pd.DataFrame(records, columns=['text', 'sentiment'])

    return datasets

In [3]:
# Load both splits of the raw corpus into memory, keyed by split name.
datasets: Dict[str, pd.DataFrame] = load_train_test_imdb_data(data_dir=INPUT_DIR)

In [4]:
# Show each split followed by its class balance (relative label frequencies).
for split_name, split_df in datasets.items():
    display(split_name, split_df)
    class_balance = split_df['sentiment'].value_counts(normalize=True)
    display(class_balance)

'train'

Unnamed: 0,text,sentiment
0,Here are the matches . . . (adv. = advantage)<...,negative
1,STAR RATING: ***** Saturday Night **** Friday ...,negative
2,The traditional Western is synonymous with wid...,positive
3,"Brilliant actor as he is, Al Pacino completely...",negative
4,This is the most recent addition to a new wave...,positive
...,...,...
24995,This film is a portrait of the half-spastic te...,negative
24996,A warning to you not to be seduced by the name...,negative
24997,I just saw this film last night at Toronto Fil...,positive
24998,Sadly it was misguided. This movie stunk from ...,negative


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

'test'

Unnamed: 0,text,sentiment
0,"Most likely ""Cleopatra 2525"" will be of little...",negative
1,Rita Hayworth plays a Brooklyn nightclub dance...,negative
2,"This film was okay, but like most TV series it...",negative
3,I saw the film yesterday and really enjoyed it...,positive
4,"I saw the movie in 1972, and like other people...",positive
...,...,...
24995,Mom has to be one of the all time uncomfortabl...,positive
24996,"At last, a film to rival 'El Padrino' and 'Dar...",negative
24997,If this film had been made in the 50's or 60's...,positive
24998,I opted to watch this film for one reason and ...,positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [5]:
# Persist every raw split as CSV without the dataframe index.
# EXPORT_DIR acts as a filename prefix here: '<EXPORT_DIR>._raw.<split>.csv'.
for label, data in datasets.items():
    export_path = f'{EXPORT_DIR}._raw.{label}.csv'
    data.to_csv(export_path, index=False)