# IMDB

In [38]:
import os
import tarfile
import numpy as np
import pandas as pd


# Paths and Variables

In [39]:
dataset_name = "imdb"

In [40]:
input_dir = './data'
output_dir = f'./../../processed/{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

In [41]:
def extract_tar_gz(tar_path, destination):
    """
    Extracts a .tar.gz archive to a specified destination directory.
    
    Parameters:
        tar_path (str): The path to the .tar.gz file.
        destination (str): The directory where the contents will be extracted.
    """
    # Open the tar.gz file
    with tarfile.open(tar_path, 'r:gz') as tar:
        # Extract all contents to the destination directory
        tar.extractall(path=destination)
        print(f'Extracted all contents to {destination}')


In [42]:
extract_tar_gz(os.path.join(input_dir, 'aclImdb_v1.tar.gz'), input_dir)

Extracted all contents to ./data


In [43]:
classes = ['pos', 'neg'] 

In [44]:
id_col = "id"
target_col = "class"
text_col = "text"

# Read data into a DataFrame

In [45]:
train_dir = os.path.join(input_dir, 'aclimdb', 'train')
test_dir = os.path.join(input_dir, 'aclimdb', 'test')

dirs = [train_dir, test_dir]
for dir in dirs:
    all_data = []
    for class_ in classes:
        dir_path = os.path.join(dir, class_)
        files = os.listdir(dir_path)
        print(class_, len(files))
        
        for f in files: 
            file_input_path = os.path.join(dir, class_, f)
            
            with open(file_input_path, encoding="utf-8") as inp_f:
                text = inp_f.read()
            all_data.append([f, class_, text])

    print(len(all_data))

    if dir == train_dir:
        data_train = pd.DataFrame(all_data, columns=[id_col, target_col, text_col])
    else:
        data_test = pd.DataFrame(all_data, columns=[id_col, target_col, text_col])

print(data_train.head())

pos 12500
neg 12500
25000
pos 12500
neg 12500
25000
            id class                                               text
0   4715_9.txt   pos  For a movie that gets no respect there sure ar...
1  12390_8.txt   pos  Bizarre horror movie filled with famous faces ...
2   8329_7.txt   pos  A solid, if unremarkable film. Matthau, as Ein...
3   9063_8.txt   pos  It's a strange feeling to sit alone in a theat...
4  3092_10.txt   pos  You probably all already know this by now, but...


# Shuffle Data

In [46]:
# shuffle data
data = data_train.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,class,text
6868,11620_8.txt,pos,Great little thriller. I was expecting some ty...
24016,9004_1.txt,neg,"Nothing could have saved this movie, not even ..."
9668,3718_8.txt,pos,This was a good movie. It wasn't your typical ...
13640,5836_2.txt,neg,From the pen of Richard Condon (The Manchurian...
14018,902_4.txt,neg,I suppose that today this film has relevance b...


# Insert Id Column

In [47]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())

# Utility to Save DF as a zipped file

In [48]:
def save_df_to_zipped_csv(df, ftype=None): 
    if ftype is not None: 
        suffix = f'_{ftype}'
    else: 
        suffix = ''
        
    zipped_f_name = f'{dataset_name}{suffix}.zip'
    archive_f_name = f'{dataset_name}{suffix}.csv'   
    compression_opts = dict(method='zip',
                        archive_name=archive_f_name)      
    df.to_csv(os.path.join(output_dir, zipped_f_name), index=False, compression=compression_opts )

In [49]:
data = pd.concat([data_train, data_test], axis=0)
data_test_key = data_test[[id_col, target_col]].copy()
data_test = data_test.drop(columns=[target_col])

# Save Main Data File

In [50]:
# # save original file as csv
# data.to_csv(outp_fname, index=False)

# save as zipped file 
save_df_to_zipped_csv(data)

In [51]:
# zip files
save_df_to_zipped_csv(data_train, "train")
save_df_to_zipped_csv(data_test, "test")
save_df_to_zipped_csv(data_test_key, "test_key")