In [1]:
# The raw IMDB review corpus is not shipped with this project; download it
# first from: ai.stanford.edu/~amaas/data/sentiment/
INPUT_DIR: str = "../data/_aclImdb/"
# Used by the export cell as a filename prefix ("<EXPORT_DIR>._raw.<split>.csv").
EXPORT_DIR: str = "../data/imdb"

In [2]:
import os
from typing import Dict, Optional

import numpy as np
import pandas as pd


def load_train_test_imdb_data(
    data_dir: str, seed: Optional[int] = None
) -> Dict[str, pd.DataFrame]:
    """Loads the IMDB train/test datasets from a folder path.

    Adapted from Shiva Krishna Gajavelli:
    https://github.com/shivakrishna2497/Sentiment-Analysis-of-IMDB-Movie-Reviews/blob/master/

    Every file found under ``<data_dir>/<split>/<neg|pos>`` is read as one
    review and labelled "negative" or "positive" after the folder it sits in.
    The reviews of each split are then shuffled.

    Args:
        data_dir: path to the "aclImdb" folder.
        seed: optional seed for the shuffle. When given, a dedicated NumPy
            generator is used, so the row order is reproducible and the global
            NumPy random state is left untouched. When ``None`` (default) the
            shuffle uses NumPy's global random state.

    Returns:
        dict with the keys "train" and "test", each holding a pandas
        dataframe with the columns "text" and "sentiment".

    Raises:
        OSError: if a split/sentiment folder cannot be listed or a review
            file cannot be opened.
        UnicodeDecodeError: if a review file is not valid UTF-8.

    """

    data: Dict[str, pd.DataFrame] = {}
    rng = None if seed is None else np.random.default_rng(seed)

    for split in ["train", "test"]:
        rows = []

        for sentiment in ["neg", "pos"]:
            score = "positive" if sentiment == "pos" else "negative"

            path = os.path.join(data_dir, split, sentiment)
            # os.listdir returns entries in arbitrary order; sorting gives the
            # shuffle a stable starting point on every machine.
            file_names = sorted(os.listdir(path))

            for f_name in file_names:
                # Explicit encoding: without it the platform's locale default
                # is used, which is not UTF-8 everywhere.
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    rows.append([f.read(), score])

        # Mix negative and positive reviews; only the order of the rows changes.
        if rng is None:
            np.random.shuffle(rows)
        else:
            rng.shuffle(rows)

        data[split] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return data

In [3]:
# Seed NumPy's global RNG before loading: the loader shuffles each split with
# np.random.shuffle, so without a seed the row order (and therefore the
# exported CSV files) changes on every "Restart & Run All".
SEED: int = 42
np.random.seed(SEED)

# Maps the split name ("train" / "test") to its dataframe of reviews.
datasets: Dict[str, pd.DataFrame] = load_train_test_imdb_data(INPUT_DIR)

In [4]:
# Preview every split, followed by the relative share of each sentiment label.
for split_name, frame in datasets.items():
    display(split_name, frame)
    label_shares = frame['sentiment'].value_counts(normalize=True)
    display(label_shares)

'train'

Unnamed: 0,text,sentiment
0,Manoj Agrawal after the failure of PARDESI BAB...,negative
1,"OK.... I just have 3 words - cheesy, cheesy an...",negative
2,The main reason I wanted to see this movie was...,positive
3,"From the Q & A before and after, this is what ...",negative
4,That was definitely the case with Angels in th...,positive
...,...,...
24995,"I'm not a huge Star Trek fan, but I was lookin...",negative
24996,I expected to enjoy a romantic comedy featurin...,negative
24997,Some comments here on IMDb have likened Dog Bi...,positive
24998,"I agree with all the accolades, I went through...",positive


negative    0.5
positive    0.5
Name: sentiment, dtype: float64

'test'

Unnamed: 0,text,sentiment
0,This movie was astonishing how good it was! Th...,positive
1,"Normally, I don't like Chuck Norris films. I a...",negative
2,Le conseguenze dell'amore (2004)is a beautiful...,positive
3,Don't watch this movie expecting the Jane Aust...,positive
4,This is one of the funniest movies I have ever...,positive
...,...,...
24995,There were very few good moments in this film....,negative
24996,...and I'm so disappointed because I can't see...,positive
24997,Tim Robbins did a masterful job directing this...,positive
24998,"When i went to see this i thought, i liked the...",negative


positive    0.5
negative    0.5
Name: sentiment, dtype: float64

In [5]:
# Write each raw split to its own CSV file, leaving out the row index.
for label, frame in datasets.items():
    out_path = f'{EXPORT_DIR}._raw.{label}.csv'
    frame.to_csv(out_path, index=False)