In [4]:
from pathlib import Path
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
from collections import Counter

# Root directory under which every dataset folder listed below lives.
base_dir = Path("/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets")

# Folders to scan, in scan order: first the quarterly folders that sit
# directly under base_dir, then the yearly folders, whose data is nested
# one level deeper in a sub-directory named after the year.
folders = [
    base_dir / quarter_dir
    for quarter_dir in (
        # 22 - 25
        "data_Q4_2022", "data_Q4_2023", "data_Q4_2024",
        "data_Q3_2024", "data_Q3_2025", "data_Q3_2023", "data_Q3_2022",
        "data_Q2_2022", "data_Q2_2023", "data_Q2_2024", "data_Q2_2025",
        "data_Q1_2022", "data_Q1_2023", "data_Q1_2024", "data_Q1_2025",

        # 16 - 21
        "data_Q4_2016", "data_Q4_2017", "data_Q4_2018",
        "data_Q4_2019", "data_Q4_2020", "data_Q4_2021",
        "data_Q3_2016", "data_Q3_2017", "data_Q3_2018",
        "data_Q3_2019", "data_Q3_2020", "data_Q3_2021",
        "data_Q2_2016", "data_Q2_2017", "data_Q2_2018",
        "data_Q2_2019", "data_Q2_2020", "data_Q2_2021",
        "data_Q1_2016", "data_Q1_2017", "data_Q1_2018",
        "data_Q1_2019", "data_Q1_2020", "data_Q1_2021",
    )
] + [
    base_dir / outer_dir / inner_dir
    for outer_dir, inner_dir in (
        # 13 - 15
        ("data_2013", "2013"),
        ("data_2014", "2014"),
        ("data_2015", "2015"),
    )
]

def count_models(csv_path):
    """Count occurrences of each value in the ``model`` column of one CSV.

    Only the ``model`` column is loaded, and the file is decoded as latin1.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``(counts, total)`` tuple. On success ``counts`` maps each model
        value to its number of occurrences and ``total`` is the sum of those
        counts (missing values are not counted, because ``value_counts``
        drops them by default). On any failure ``counts`` is
        ``{"__ERROR__": "<csv_path> -> <exception>"}`` and ``total`` is 0.
    """
    try:
        frame = pd.read_csv(
            csv_path,
            usecols=["model"],
            encoding="latin1",  # key point: do not use the default utf-8
            low_memory=False,
        )
        per_model = frame["model"].value_counts()
        outcome = (per_model.to_dict(), int(per_model.sum()))
    except Exception as exc:
        # Best effort per file: report the failure to the caller instead of
        # raising, so one unreadable file does not stop the whole run.
        outcome = ({"__ERROR__": f"{csv_path} -> {exc}"}, 0)
    return outcome


# Collect every CSV file found (recursively) under the configured folders.
csv_files = []
for folder in folders:
    files = list(folder.rglob("*.csv"))
    print(f"{folder}: {len(files)} files")
    csv_files.extend(files)

print("Total CSV files:", len(csv_files))

# Use half of the CPU cores. os.cpu_count() returns None when the number of
# CPUs cannot be determined; fall back to 1 so `total_cores // 2` cannot
# raise TypeError.
total_cores = os.cpu_count() or 1
half_cores = max(1, total_cores // 2)
print(f"CPU cores: {total_cores}, Using: {half_cores}")

global_counter = Counter()  # model value -> occurrences across all files
total_rows = 0              # sum of the per-file totals from count_models
errors = []                 # one message per file that failed

# One task per CSV file; results are merged in whatever order they finish.
with ProcessPoolExecutor(max_workers=half_cores) as exe:
    futures = [exe.submit(count_models, str(f)) for f in csv_files]
    for fut in as_completed(futures):
        d, n = fut.result()
        total_rows += n
        # count_models reports a failure as a dict holding only "__ERROR__"
        # (with n == 0), so such results are logged rather than counted.
        if "__ERROR__" in d:
            errors.append(d["__ERROR__"])
        else:
            global_counter.update(d)

# Build the statistics table: one row per model, most frequent first.
stats = (
    pd.Series(global_counter, name="count")
      .sort_values(ascending=False)
      .to_frame()
)
stats["ratio"] = stats["count"] / total_rows
stats["ratio_percent"] = stats["ratio"] * 100

print("Total samples:", total_rows)
print(stats.head(20))

# Relative filename, so the table is written to the current working directory.
out_file = "model_distribution_all_years_streaming.csv"
stats.to_csv(out_file)
print("Saved to:", out_file)

if errors:
    print(f"\n[WARN] {len(errors)} files failed. First 5 errors:")
    for e in errors[:5]:
        print(e)


/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q4_2022: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q4_2023: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q4_2024: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q3_2024: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q3_2025: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q3_2023: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.M.A.R.T/datasets/data_Q3_2022: 92 files
/work/shibberu/share/MA384_Data_Mining_Projects_Winter_2025-26/SMART_failure_prediction/S.