In [1]:
import pandas as pd

In [25]:
from pathlib import Path
import zipfile

base_dir = Path("datasets").resolve()

zip_files = list(base_dir.glob("*.zip"))
print("Found zip files:", [z.name for z in zip_files])

for zf in zip_files:
    out_dir = base_dir / zf.stem   # use the zip file name without .zip
    out_dir.mkdir(exist_ok=True)

    with zipfile.ZipFile(zf, 'r') as z:
        z.extractall(out_dir)
        print(f"Extracted {zf.name} -> {out_dir}")

Found zip files: ['data_Q3_2019.zip', 'data_Q1_2018.zip', 'data_Q3_2020.zip', 'data_Q2_2018.zip', 'data_Q2_2019.zip', 'data_Q2_2017.zip', 'data_2013.zip', 'data_Q1_2016.zip', 'data_Q4_2021.zip', 'data_Q3_2016.zip', 'data_Q2_2020.zip', 'data_2014.zip', 'data_2015.zip', 'data_Q4_2020.zip', 'data_Q4_2017.zip', 'data_Q1_2019.zip', 'data_Q4_2019.zip', 'data_Q1_2024.zip', 'data_Q3_2017.zip', 'data_Q2_2021.zip', 'data_Q3_2018.zip', 'data_Q1_2017.zip', 'data_Q2_2016.zip', 'data_Q1_2021.zip', 'data_Q1_2020.zip', 'data_Q4_2018.zip', 'data_Q4_2016.zip', 'data_Q3_2021.zip']
Extracted data_Q3_2019.zip -> /work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q3_2019
Extracted data_Q1_2018.zip -> /work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q1_2018
Extracted data_Q3_2020.zip -> /work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/dat

In [67]:
from pathlib import Path
import os

# base_dir = Path("datasets")
base_dir = Path("/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets")

# 22 - 25
# folders = [base_dir / "data_Q4_2022", base_dir / "data_Q4_2023",base_dir / "data_Q4_2024"] # Q4
# folders = [base_dir / "data_Q3_2024", base_dir / "data_Q3_2025",base_dir / "data_Q3_2023", base_dir / "data_Q3_2022"] # Q3
# folders = [base_dir / "data_Q2_2022", base_dir / "data_Q2_2023",base_dir / "data_Q2_2024", base_dir / "data_Q2_2025"] # Q2
# folders = [base_dir / "data_Q1_2022", base_dir / "data_Q1_2023",base_dir / "data_Q1_2024", base_dir / "data_Q1_2025"] # Q1

# 16-21
# folders = [base_dir / "data_Q4_2016", base_dir / "data_Q4_2017",base_dir / "data_Q4_2018", base_dir / "data_Q4_2019", base_dir / "data_Q4_2020", base_dir / "data_Q4_2021"] # Q4
# folders = [base_dir / "data_Q3_2016", base_dir / "data_Q3_2017",base_dir / "data_Q3_2018", base_dir / "data_Q3_2019", base_dir / "data_Q3_2020", base_dir / "data_Q3_2021"] # Q3
# folders = [base_dir / "data_Q2_2016", base_dir / "data_Q2_2017",base_dir / "data_Q2_2018", base_dir / "data_Q2_2019", base_dir / "data_Q2_2020", base_dir / "data_Q2_2021"] # Q2
# folders = [base_dir / "data_Q1_2016", base_dir / "data_Q1_2017",base_dir / "data_Q1_2018", base_dir / "data_Q1_2019", base_dir / "data_Q1_2020", base_dir / "data_Q1_2021"] # Q1

# 13-15
folders = [base_dir / "data_2013"/"2013", base_dir / "data_2014"/"2014",base_dir / "data_2015"/"2015"] # Q1

csv_files = []
for f in folders:
    if not f.exists():
        raise FileNotFoundError(f"Missing folder: {f.resolve()}")
    csv_files.extend(sorted(f.glob("*.csv")))

print("Found CSV files:", len(csv_files))
print("Example:", csv_files[0] if csv_files else "NONE")


Found CSV files: 996
Example: /work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_2013/2013/2013-04-10.csv


In [68]:
for f in folders:
    files = sorted(f.glob("*.csv"))
    print(f"{f.name}: {len(files)} files")


2013: 266 files
2014: 365 files
2015: 365 files


In [69]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import time

try:
    cpu_workers = len(os.sched_getaffinity(0))
except Exception:
    cpu_workers = os.cpu_count() or 1

# Use half of the cores, or you will be kicked out of the server!!!!!
max_workers = min(cpu_workers, len(csv_files)) if csv_files else cpu_workers
max_workers = max_workers//2
print("Using processes:", max_workers)

def _read_one_csv(abs_path: str, base_dir_str: str):
    """
    Executed inside a worker process: reads one CSV file and returns (key, DataFrame).
    Note: the function must be defined at the top level so it can be pickled
    by ProcessPoolExecutor.
    """
    p = Path(abs_path)
    base = Path(base_dir_str)

    # key: relative path from base_dir
    key = str(p.relative_to(base))

    df = pd.read_csv(p, low_memory=False)

    if "failure" in df.columns:
        df = df[df["failure"] == 1]
    else:
        # If no 'failure' column, return empty DataFrame with same columns
        df = df.iloc[0:0]
    return key, df

t0 = time.time()
all_data = {}

with ProcessPoolExecutor(max_workers=max_workers) as ex:
    futures = [
        ex.submit(_read_one_csv, str(p.resolve()), str(base_dir.resolve()))
        for p in csv_files
    ]
    for fut in as_completed(futures):
        key, df = fut.result()
        all_data[key] = df

print("Loaded files:", len(all_data))
print("Seconds:", round(time.time() - t0, 3))


Using processes: 128
Loaded files: 996
Seconds: 17.655


In [70]:
total_rows = sum(len(df) for df in all_data.values())

print("Total rows with failure == 1:", total_rows)

Total rows with failure == 1: 4346


In [71]:
# Merge all DataFrames into one, how = 'outer' to include all columns
merged_df = pd.concat(all_data.values(), axis=0, ignore_index=True)

out_file = "merged_13-15_failure1.csv"

merged_df.to_csv(out_file, index=False)

print("Saved merged failure==1 rows to:", out_file)
print("Total rows written:", len(merged_df))


  merged_df = pd.concat(all_data.values(), axis=0, ignore_index=True)


Saved merged failure==1 rows to: merged_13-15_failure1.csv
Total rows written: 4346


In [23]:
df = pd.read_csv(out_file, low_memory=False)
df.head

<bound method NDFrame.head of             date serial_number                 model  capacity_bytes  failure  \
0     2022-04-02      ZGS01JWR            ST500LM030    500107862016        1   
1     2022-04-02      ZHZ62HSV         ST12000NM0008  12000138625024        1   
2     2022-04-02      ZHZ65PAT         ST12000NM0008  12000138625024        1   
3     2022-04-02  X880A0H9F97G   TOSHIBA MG07ACA14TA  14000519643136        1   
4     2022-04-02      ZL2CH3NJ         ST14000NM001G  14000519643136        1   
...          ...           ...                   ...             ...      ...   
4471  2025-06-17      2BJD0GDD   WDC WUH721816ALE6L0  16000900661248        1   
4472  2025-06-17  8190A0GUFV8G  TOSHIBA MG08ACA16TEY  16000900661248        1   
4473  2025-06-17  8190A1LGFV8G  TOSHIBA MG08ACA16TEY  16000900661248        1   
4474  2025-06-17  44M0B03YV4MJ   TOSHIBA MG10ACA20TE  20000588955648        1   
4475  2025-06-17      ZYD1CGB1         ST24000NM002H  24000277250048        1  