In [1]:
import os
from pathlib import Path
import sys
import tarfile
import time
import numpy as np
import pandas as pd
from packaging import version
from tqdm import tqdm
import urllib

from local_dataset_utilities import reporthook

In [4]:
# pathlib.Path(__file__).resolve() # For python file
NBS_DIR = Path().resolve()
# REPO_DIR = NBS_DIR.parent
REPO_DIR = NBS_DIR
REPO_DIR
DATA_DIR = REPO_DIR / 'assets'
IMDB_DIR = DATA_DIR / 'IMDB'
# INPUT_DIR = DATA_DIR / 'inputs'
# READY_DIR = DATA_DIR / 'ready'
# OUTPUT_DIR = DATA_DIR / 'outputs'
# GENERATED_DIR = DATA_DIR / 'generated'

In [2]:
def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    if duration == 0:
        return
    progress_size = int(count * block_size)
    speed = progress_size / (1024.0**2 * duration)
    percent = count * block_size * 100.0 / total_size

    sys.stdout.write(
        f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
        f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
    )
    sys.stdout.flush()

In [5]:
# ASSETS_DIR = './assets/'
# IMDB_DIR = ASSETS_DIR + 'IMDB/'
TARGET = IMDB_DIR / 'Imdb_v1.tar.gz'

In [8]:

source = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = TARGET

# Create target directory if it doesn't exist
if not os.path.exists(os.path.dirname(target)):
    os.makedirs(os.path.dirname(target))

if os.path.exists(target):
    os.remove(target)

if not os.path.isdir(IMDB_DIR) and not os.path.isfile(target):
    urllib.request.urlretrieve(source, target, reporthook)

if not os.path.isdir(IMDB_DIR):
    with tarfile.open(target, "r:gz") as tar:
        tar.extractall(IMDB_DIR)


100% | 80.23 MB | 1.78 MB/s | 45.10 sec elapsed

In [7]:
os.path.isdir("IMDB")

False

In [9]:
if not os.path.exists(os.path.dirname(target)):
    os.makedirs(os.path.dirname(IMDB_DIR))

In [11]:
if not os.path.isdir(IMDB_DIR):
    with tarfile.open(target, "r:gz") as tar:
        tar.extractall(IMDB_DIR)

Let's fix the extraction path issue by checking the actual directory structure and modifying the extraction code

In [12]:
print("Current directory:", os.getcwd())
print("IMDB_DIR path:", os.path.abspath(IMDB_DIR))
print("Target file exists:", os.path.exists(target))

Current directory: /Users/slackroo/Data_science/Deeplearning/seb_pytorch
IMDB_DIR path: /Users/slackroo/Data_science/Deeplearning/seb_pytorch/assets/IMDB
Target file exists: True


In [13]:
# Create IMDB_DIR if it doesn't exist
# **secure tar file extraction** to prevent directory traversal attacks (also known as "Zip Slip" or "Tar Slip" attacks).

if not os.path.exists(IMDB_DIR):
    os.makedirs(IMDB_DIR)

# Extract to IMDB_DIR
if not os.path.isdir(os.path.join(IMDB_DIR, 'aclImdb')):
    with tarfile.open(target, "r:gz") as tar:
        # Extract all files while preserving directory structure
        def is_within_directory(directory, target):
            abs_directory = os.path.abspath(directory)
            abs_target = os.path.abspath(target)
            prefix = os.path.commonprefix([abs_directory, abs_target])
            return prefix == abs_directory


        def safe_extract(tar, path):
            for member in tar.getmembers():
                member_path = os.path.join(path, member.name)
                if not is_within_directory(path, member_path):
                    continue
                tar.extract(member, path)


        safe_extract(tar, IMDB_DIR)

In [15]:
os.listdir(IMDB_DIR+'aclImdb/train/pos/')

['4715_9.txt',
 '12390_8.txt',
 '8329_7.txt',
 '9063_8.txt',
 '3092_10.txt',
 '9865_8.txt',
 '6639_10.txt',
 '10460_10.txt',
 '10331_10.txt',
 '11606_10.txt',
 '6168_10.txt',
 '2712_10.txt',
 '3225_10.txt',
 '3574_10.txt',
 '3192_10.txt',
 '716_10.txt',
 '2612_10.txt',
 '5568_8.txt',
 '6554_7.txt',
 '1807_7.txt',
 '3474_10.txt',
 '11057_10.txt',
 '10231_10.txt',
 '11706_10.txt',
 '11167_9.txt',
 '803_10.txt',
 '5245_8.txt',
 '7935_8.txt',
 '835_8.txt',
 '6970_8.txt',
 '9533_9.txt',
 '5393_10.txt',
 '3384_8.txt',
 '6935_8.txt',
 '4342_10.txt',
 '9576_9.txt',
 '11913_10.txt',
 '5124_10.txt',
 '10961_9.txt',
 '2835_7.txt',
 '9919_9.txt',
 '9177_10.txt',
 '6762_9.txt',
 '12091_8.txt',
 '4414_9.txt',
 '9491_10.txt',
 '6115_10.txt',
 '7422_10.txt',
 '3258_10.txt',
 '6727_9.txt',
 '9327_8.txt',
 '4451_9.txt',
 '3509_10.txt',
 '2389_10.txt',
 '1936_10.txt',
 '6284_7.txt',
 '7727_9.txt',
 '1794_7.txt',
 '2902_9.txt',
 '3409_10.txt',
 '9591_10.txt',
 '2379_8.txt',
 '6655_7.txt',
 '58_9.txt',
 '1

In [16]:
# Verify extraction
print("Contents of IMDB_DIR:")
print(os.listdir(IMDB_DIR))


Contents of IMDB_DIR:
['.DS_Store', 'aclImdb', 'Imdb_v1.tar.gz']


In [7]:
basepath = IMDB_DIR/'aclImdb'
labels = {"pos":1, "neg":0}
df = pd.DataFrame()
with tqdm(total=50000) as pbar:
    for s in ("train",'test'):
        for l in ("pos","neg"):
            path = os.path.join(basepath,s,l)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path,file), 'r', encoding="utf-8") as infile:
                    txt = infile.read()

                if version.parse(pd.__version__) >= version.parse("1.3.2"):
                    x = pd.DataFrame(
                        [[txt, labels[l]]], columns=['review','sentiment']
                    )
                    df = pd.concat([df,x], ignore_index=False)
                else:
                    df = df.append([[txt, labels[l]]], ignore_index=True)
                pbar.update()

df.columns = ['text','label']

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))


print("class distribution:", np.bincount(df['label'].values))

100%|██████████| 50000/50000 [00:51<00:00, 974.29it/s] 

class distribution: [25000 25000]





In [21]:
pd.__version__

'2.3.0'

In [8]:
len(df)

50000

In [28]:
df[:20]

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
0,Homelessness (or Houselessness as George Carli...,1
0,Brilliant over-acting by Lesley Ann Warren. Be...,1
0,This is easily the most underrated film inn th...,1
0,This is not the typical Mel Brooks film. It wa...,1
0,"This isn't the comedic Robin Williams, nor is ...",1
0,Yes its an art... to successfully make a slow ...,1
0,"In this ""critically acclaimed psychological th...",1
0,THE NIGHT LISTENER (2006) **1/2 Robin Williams...,1
0,"You know, Robin Williams, God bless him, is co...",1


In [32]:
len(df)*.8

40000.0

In [9]:
df_shuffled = df.sample(frac=1, random_state=1).reset_index()
df_train = df_shuffled[:35_000]
df_val = df_shuffled[35_000:40_000]
df_test = df_shuffled[40_000:]

df_train.to_csv(IMDB_DIR / 'train.csv', index=False, encoding='utf-8')
df_val.to_csv(IMDB_DIR / 'val.csv' , index = False, encoding='utf-8')
df_test.to_csv(IMDB_DIR / 'test.csv', index=False, encoding='utf-8')
