In [1]:
# The original dataset is not included here; please download it first.
# Source: ai.stanford.edu/~amaas/data/sentiment/
INPUT_DIR: str = "../data/_aclImdb/"
# NOTE: despite its name, this value is used as a filename prefix for the
# exported CSV files (f'{EXPORT_DIR}._raw.<split>.csv'), not as a directory.
EXPORT_DIR: str = "../data/imdb"

In [2]:
import os
from typing import Dict, Optional

import numpy as np
import pandas as pd


def load_train_test_imdb_data(data_dir: str, seed: Optional[int] = None) -> Dict[str, pd.DataFrame]:
    """Loads the IMDB train/test datasets from a folder path.
    src: adapted from Shiva Krishna Gajavelli
    https://github.com/shivakrishna2497/Sentiment-Analysis-of-IMDB-Movie-Reviews/blob/master/

    Every regular file found in `<data_dir>/<split>/<neg|pos>` is read as one
    review; reviews from "neg" are labelled "negative", reviews from "pos"
    are labelled "positive". The rows of each split are shuffled.

    Args:
        data_dir: path to the "aclImdb" folder.
        seed: optional seed for the row shuffle. When given, repeated calls on
            the same files return identically ordered dataframes. When None
            (default), NumPy's global random state is used, as before.

    Returns:
        dict containing train/test datasets as pandas dataframes, keyed by
        "train" and "test", each with the columns 'text' and 'sentiment'.

    Raises:
        OSError: if one of the expected `<split>/<sentiment>` folders is
            missing or a file cannot be read.
        UnicodeDecodeError: if a review file cannot be decoded as UTF-8.

    """

    # Without a seed, keep using the global NumPy RNG so callers that rely on
    # np.random.seed(...) are unaffected; otherwise use a private generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data: Dict[str, pd.DataFrame] = {}

    for split in ["train", "test"]:
        rows = []

        for sentiment in ["neg", "pos"]:
            score = "positive" if sentiment == "pos" else "negative"

            path = os.path.join(data_dir, split, sentiment)

            # os.listdir returns entries in arbitrary order; sorting makes the
            # pre-shuffle order (and thus a seeded shuffle) reproducible.
            for f_name in sorted(os.listdir(path)):
                file_path = os.path.join(path, f_name)

                # Skip sub-directories and other non-file entries.
                if not os.path.isfile(file_path):
                    continue

                # Decode explicitly as UTF-8 instead of relying on the
                # platform's default encoding.
                with open(file_path, "r", encoding="utf-8") as f:
                    rows.append([f.read(), score])

        rng.shuffle(rows)
        data[split] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return data

In [3]:
# Read both splits from disk; the dataset must already exist at INPUT_DIR.
datasets: Dict[str, pd.DataFrame] = load_train_test_imdb_data(data_dir=INPUT_DIR)

In [4]:
# Preview each split, then show the relative frequency of each sentiment label.
for label, data in datasets.items():
    display(label)
    display(data)
    display(data["sentiment"].value_counts(normalize=True))

'train'

Unnamed: 0,text,sentiment
0,The location of the shop around the corner is ...,positive
1,As a horse lover one can only appreciate this ...,positive
2,"As gently as I can, I sincerely believe this m...",negative
3,I saw a preview of Freebird at the Isle of Man...,positive
4,"""Chinese Ghost Story"" is one of the most amazi...",positive
...,...,...
24995,I screamed my head off because seeing this mov...,negative
24996,I really enjoyed The 60's. Not being of that g...,positive
24997,"Level One, Horror.<br /><br />When I saw this ...",positive
24998,Notice that all those that did not like and en...,positive


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

'test'

Unnamed: 0,text,sentiment
0,The Movie is okay. Meaning that I don't regret...,negative
1,Albert Pyun presents his vision of the lost ci...,negative
2,This movie will undoubtably not go over well w...,positive
3,"While watching this film recently, I constantl...",positive
4,The cast was well picked. Pauly Shore is hilar...,positive
...,...,...
24995,scarlet coat like most revolution flicks wasnt...,positive
24996,The initiation to the local sport team involve...,negative
24997,Very poor quality and the acting is equally as...,negative
24998,If You can watch a film without worrying about...,negative


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [5]:
# Write each split to its own CSV file, without the dataframe index.
# EXPORT_DIR acts as a filename prefix here, e.g. '<EXPORT_DIR>._raw.train.csv'.
for label, data in datasets.items():
    csv_path = f'{EXPORT_DIR}._raw.{label}.csv'
    data.to_csv(csv_path, index=False)