In [1]:
# Folder holding the extracted "aclImdb" dataset; passed to the loader below.
INPUT_DIR: str = "../data/_aclImdb/"
# Path prefix for the exported CSVs; the export cell appends "._<split>.csv" to it.
EXPORT_DIR: str = "../data/imdb"

In [2]:
import os
from typing import Dict, Optional

import numpy as np
import pandas as pd


def load_train_test_imdb_data(
    data_dir: str, seed: Optional[int] = None
) -> Dict[str, pd.DataFrame]:
    """Loads the IMDB train/test datasets from a folder path.

    src: Shiva Krishna Gajavelli
    https://github.com/shivakrishna2497/Sentiment-Analysis-of-IMDB-Movie-Reviews/blob/master/

    The folder is read as ``<data_dir>/{train,test}/{neg,pos}/<file>``; every
    file found in a sentiment folder is loaded as one review.

    Args:
        data_dir: path to the "aclImdb" folder.
        seed: optional seed for the row shuffle. When given, the shuffle is
            driven by a dedicated generator and is reproducible across calls.
            When None (default), NumPy's global random state is used, exactly
            as before this parameter existed.

    Returns:
        dict mapping "train" and "test" to a shuffled pandas DataFrame with
        the columns ``text`` (file content) and ``sentiment``
        ("positive" or "negative").

    Raises:
        OSError: if one of the split/sentiment folders cannot be listed or a
            file cannot be opened (e.g. FileNotFoundError).

    """
    # A private generator keeps a seeded shuffle independent of global state.
    rng = np.random.default_rng(seed) if seed is not None else None

    data: Dict[str, pd.DataFrame] = {}

    for split in ["train", "test"]:
        records = []

        for sentiment in ["neg", "pos"]:
            score = "positive" if sentiment == "pos" else "negative"
            path = os.path.join(data_dir, split, sentiment)

            # os.listdir returns entries in arbitrary order; sorting gives the
            # shuffle a stable starting point so a seed reproduces the result.
            for f_name in sorted(os.listdir(path)):
                # Explicit encoding: without it the platform default is used,
                # which is not UTF-8 everywhere.
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    records.append([f.read(), score])

        # Mix negative and positive reviews, which were appended in two runs.
        if rng is None:
            np.random.shuffle(records)
        else:
            rng.shuffle(records)

        data[split] = pd.DataFrame(records, columns=['text', 'sentiment'])

    return data

In [3]:
# Load both splits from disk; the loader returns a dict with one DataFrame per split.
datasets: dict = load_train_test_imdb_data(INPUT_DIR)

In [4]:
# Render each split's name followed by its DataFrame.
for label, data in datasets.items():
    display(label, data)

'train'

Unnamed: 0,text,sentiment
0,"*Wonderland SPOILERS* <br /><br />July 1st, 19...",positive
1,"Neil Simon has quite a body of work, but it is...",positive
2,This movie is a riot. I cannot remember the la...,positive
3,I created my own reality by walking out of the...,negative
4,Spacecamp is one of the movies that kids just ...,positive
...,...,...
24995,I've always knew Anne DeSalvo was a great char...,positive
24996,I've read one comment which labeled this film ...,positive
24997,"Who would think Andy Griffith's ""Helen Crump"" ...",positive
24998,Watching this I mainly noticed the ad placemen...,negative


'test'

Unnamed: 0,text,sentiment
0,If you can watch a Bond film from 1983 that is...,positive
1,"""More"", maybe, is mostly remembered for the ex...",positive
2,What a cast of actors and actresses in this Co...,positive
3,I will admit I possibly missed tiny moments wh...,positive
4,his has to surely be one of the worst gay-them...,negative
...,...,...
24995,It amazes me that production companies will su...,positive
24996,In short this movie was awful.<br /><br />I un...,negative
24997,having never actually seen anything by this be...,positive
24998,"Okay, I haven't read the book yet but I have t...",negative


In [5]:
# Write every split to its own CSV file, leaving out the DataFrame index.
# NOTE(review): EXPORT_DIR is concatenated as a plain prefix, so the files are
# named '<EXPORT_DIR>._<label>.csv' (e.g. '../data/imdb._train.csv') instead of
# being placed inside an EXPORT_DIR folder — confirm this naming is intended.
for label, data in datasets.items():
    export_path = f'{EXPORT_DIR}._{label}.csv'
    data.to_csv(export_path, index=False)