In [1]:
# The original dataset is not included; please download it first.
# Source: ai.stanford.edu/~amaas/data/sentiment/
INPUT_DIR: str = "../data/_aclImdb/"
# Used as the path prefix for the exported CSV files.
EXPORT_DIR: str = "../data/imdb"

In [2]:
import os
from typing import Dict, Optional

import numpy as np
import pandas as pd


def load_train_test_imdb_data(data_dir: str, seed: Optional[int] = None) -> Dict[str, pd.DataFrame]:
    """Loads the IMDB train/test datasets from a folder path.
    src: Shiva Krishna Gajavelli
    https://github.com/shivakrishna2497/Sentiment-Analysis-of-IMDB-Movie-Reviews/blob/master/

    Every file found under ``<data_dir>/<split>/<neg|pos>`` is read as one
    review and labelled "negative" or "positive" according to its folder.
    The rows of each split are shuffled before the dataframe is built.

    Args:
        data_dir: path to the "aclImdb" folder.
        seed: optional seed for the shuffle. When given, a dedicated NumPy
            generator is used so repeated calls return the same row order.
            When None (default), the global NumPy random state is used,
            exactly as before.

    Returns:
        dict with the keys "train" and "test", each mapping to a pandas
        dataframe with the columns "text" and "sentiment".

    Raises:
        FileNotFoundError: if an expected split/sentiment folder is missing.
        UnicodeDecodeError: if a review file is not valid UTF-8.

    """
    # Seeded calls get a private generator; unseeded calls keep using the
    # global NumPy state so callers relying on np.random.seed() still work.
    shuffle = np.random.shuffle if seed is None else np.random.default_rng(seed).shuffle

    data: Dict[str, pd.DataFrame] = {}

    for split in ["train", "test"]:
        rows = []

        for sentiment in ["neg", "pos"]:
            score = "positive" if sentiment == "pos" else "negative"

            path = os.path.join(data_dir, split, sentiment)

            # os.listdir returns entries in arbitrary order; sorting makes the
            # pre-shuffle order (and thus a seeded shuffle) reproducible.
            for f_name in sorted(os.listdir(path)):
                # Decode explicitly as UTF-8 instead of relying on the
                # platform default encoding, which differs between systems.
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    rows.append([f.read(), score])

        shuffle(rows)
        data[split] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return data

In [3]:
# Read both splits from disk; the result is keyed by "train" and "test".
datasets: Dict[str, pd.DataFrame] = load_train_test_imdb_data(data_dir=INPUT_DIR)

In [4]:
# Show every split followed by the relative frequency of each sentiment label.
for split_name, split_frame in datasets.items():
    display(split_name, split_frame)
    label_shares = split_frame['sentiment'].value_counts(normalize=True)
    display(label_shares)

'train'

Unnamed: 0,text,sentiment
0,I love this show and my 11 year-old daughter a...,positive
1,I watched the movie while recovering from majo...,positive
2,blows my mind how this movie got made. i watch...,negative
3,Soapdish may go down as one of the single most...,positive
4,I have to say that Grand Canyon is one of the ...,positive
...,...,...
24995,"Well, I had seen ""They all laughed"" when it ca...",positive
24996,I love all his work but this looks like nothin...,negative
24997,This film was on last week and although at tha...,positive
24998,A lot of themes or parts of the story is the s...,negative


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

'test'

Unnamed: 0,text,sentiment
0,"Awful, dreadful, terrible. The actors are bad,...",negative
1,"Okay, so I love silly movies. If you enjoy sil...",positive
2,This film should never have been made! It stin...,negative
3,For those of you unfamiliar with Alisdair Sims...,positive
4,The film was half over before I managed to fig...,negative
...,...,...
24995,"Based on actual events of 1905, silent film TH...",positive
24996,This is one of the better sci-fi series. It in...,positive
24997,"A lovely librarian, played by Playboy model Kr...",positive
24998,This era was not just the dawn of sound in car...,positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [5]:
# Write each split to its own CSV file, leaving out the dataframe index.
for split_name, split_frame in datasets.items():
    csv_path = f'{EXPORT_DIR}._raw.{split_name}.csv'
    split_frame.to_csv(csv_path, index=False)