In [4]:
import os
import requests
import zipfile
from tqdm import tqdm
from pathlib import Path

# Directory to store downloaded and extracted data
DATA_DIR = Path("./mimic_textbooks")

# Step 1: Download and extract the dataset zip file
def download_and_extract_zip(url, extract_to=DATA_DIR):
    # Ensure the directory exists
    extract_to.mkdir(parents=True, exist_ok=True)

    # Download the zip file
    zip_path = extract_to / "textbooks.zip"
    print("Downloading dataset...")
    response = requests.get(url, stream=True)
    with open(zip_path, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
            if chunk:
                file.write(chunk)

    # Extract the zip file
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset downloaded and extracted.")

# Step 2: Load and process text files
def load_text_files(directory):
    text_data = []
    file_paths = []

    # Use pathlib.Path to dynamically list all .txt files in directory
    for filepath in Path(directory).rglob("*.txt"):
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read()
            text_data.append(text)
            file_paths.append(filepath)

    return text_data, file_paths

# Execute the download, extraction, and loading setup
def setup_environment():
    # Zip file URL
    dataset_url = "https://www.dropbox.com/scl/fi/54p9kkx5n93bffyx08eba/textbooks.zip?rlkey=2y2c5x8y0uncnddichn9cmd7n&st=m290nmkk&dl=1"

    # Step 1: Download and extract data files
    download_and_extract_zip(dataset_url)

    # Step 2: Load text files from the extracted dataset
    text_files_directory = DATA_DIR / "textbooks/en"
    texts, paths = load_text_files(text_files_directory)

    print("Processing complete. Loaded {} documents.".format(len(paths)))
    return texts, paths

# Run the setup
texts, document_paths = setup_environment()


Downloading dataset...


88121KB [00:02, 41965.22KB/s]


Extracting dataset...
Dataset downloaded and extracted.
Processing complete. Loaded 20 documents.


In [5]:
document_paths

[PosixPath('mimic_textbooks/textbooks/en/First_Aid_Step2.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Pathology_Robbins.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Histology_Ross.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Cell_Biology_Alberts.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Pathoma_Husain.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Pharmacology_Katzung.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Physiology_Levy.txt'),
 PosixPath('mimic_textbooks/textbooks/en/First_Aid_Step1.txt'),
 PosixPath('mimic_textbooks/textbooks/en/textbook1.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Immunology_Janeway.txt'),
 PosixPath('mimic_textbooks/textbooks/en/textbook2.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Anatomy_Gray.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Neurology_Adams.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Surgery_Schwartz.txt'),
 PosixPath('mimic_textbooks/textbooks/en/Biochemistry_Lippincott.txt'),
 PosixPath('mimic_textbooks/textb