In [1]:
import pandas as pd

In [1]:
from pathlib import Path
import os

# Root of the dataset tree and the two quarterly folders to scan.
base_dir = Path("datasets")
folders = [base_dir / quarter for quarter in ("data_Q3_2024", "data_Q3_2025")]

# Gather every CSV, folder by folder, sorted by path within each folder.
# A missing folder aborts immediately instead of yielding a partial file list.
csv_files = []
for folder in folders:
    if folder.exists():
        csv_files.extend(sorted(folder.glob("*.csv")))
    else:
        raise FileNotFoundError(f"Missing folder: {folder.resolve()}")

print("Found CSV files:", len(csv_files))
print("Example:", "NONE" if not csv_files else csv_files[0])


Found CSV files: 184
Example: datasets/data_Q3_2024/2024-07-01.csv


In [2]:
# Sanity check: report how many CSV files each quarterly folder contributes.
for folder in folders:
    n_csv = len(list(folder.glob("*.csv")))
    print(f"{folder.name}: {n_csv} files")


data_Q3_2024: 92 files
data_Q3_2025: 92 files


In [None]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
import time

# Number of CPUs this process is allowed to run on. os.sched_getaffinity is not
# available on every platform, so fall back to the machine-wide CPU count.
try:
    cpu_workers = len(os.sched_getaffinity(0))
except Exception:
    cpu_workers = os.cpu_count() or 1

# Use at most half of the available cores (shared server: taking more gets you
# kicked off), and never more processes than there are files to read.
# The result is clamped to >= 1 because ProcessPoolExecutor raises ValueError
# for max_workers <= 0, which a plain `// 2` produces on a single-core
# allocation or when only one CSV file was found.
half_cores = max(1, cpu_workers // 2)
max_workers = min(half_cores, len(csv_files)) if csv_files else half_cores
print("Using processes:", max_workers)

def _read_one_csv(abs_path: str, base_dir_str: str):
    """
    Executed inside a worker process: reads one CSV file and returns (key, DataFrame).
    Note: the function must be defined at the top level so it can be pickled
    by ProcessPoolExecutor.
    """
    p = Path(abs_path)
    base = Path(base_dir_str)

    # key: relative path from base_dir
    key = str(p.relative_to(base))

    df = pd.read_csv(p, low_memory=False)
    return key, df

# Read every CSV in parallel and collect the frames into `all_data`,
# keyed by each file's path relative to base_dir.
t0 = time.time()
all_data = {}

# Resolve the base directory once; it is the same for every submitted task.
base_dir_abs = str(base_dir.resolve())

with ProcessPoolExecutor(max_workers=max_workers) as ex:
    futures = [
        ex.submit(_read_one_csv, str(p.resolve()), base_dir_abs)
        for p in csv_files
    ]
    # Collect results in submission order, not completion order, so the
    # insertion order of `all_data` follows `csv_files` and is the same on
    # every run. All tasks are already submitted, so the reads still run in
    # parallel; a failure in any worker is re-raised here by fut.result().
    for fut in futures:
        key, df = fut.result()
        all_data[key] = df

print("Loaded files:", len(all_data))
print("Seconds:", round(time.time() - t0, 3))


Using processes: 92
Loaded files: 184
Seconds: 223.188


In [5]:
# Count rows flagged with failure == 1 across all loaded files.
total_failure_1 = 0
total_rows = 0

for key, df in all_data.items():
    # Files without a 'failure' column are reported and left out of both totals.
    if "failure" not in df.columns:
        print(f"[WARN] {key} has no column 'failure'")
        continue

    cnt = (df["failure"] == 1).sum()
    total_failure_1 += cnt
    total_rows += len(df)

print("Total rows:", total_rows)
print("Total failure == 1:", total_failure_1)
# Guard the division: if nothing was loaded, or no file had a 'failure'
# column, total_rows is 0 and the rate is undefined.
if total_rows:
    print("Failure rate:", total_failure_1 / total_rows)
else:
    print("Failure rate: n/a (no rows with a 'failure' column)")


Total rows: 56667438
Total failure == 1: 2678
Failure rate: 4.7258180262181606e-05


In [None]:
# Tag each frame with the file it came from. DataFrame.assign returns a new
# frame, so the originals held in `all_data` are left unmodified.
dfs = [frame.assign(source=name) for name, frame in all_data.items()]

# Stack all files row-wise; join="outer" keeps the union of columns, and the
# index is renumbered from scratch.
combined_df = pd.concat(dfs, axis=0, join="outer", ignore_index=True)

print("Merged shape:", combined_df.shape)
print("Total columns:", len(combined_df.columns))

# Persist the merged table as a single parquet file, without the index.
out_file = "Q3_2024_2025_merged.parquet"
combined_df.to_parquet(out_file, index=False)

print("Saved to:", out_file)