In [0]:
import glob
import os
import pandas as pd
from collections import defaultdict

In [0]:
def merge_csv_by_prefix(folder=".", output_prefix="merged"):
    """
    같은 접두사를 가진 CSV 파일들을 합쳐 저장합니다.
    예: hf_community_*.csv → merged_hf_community.csv
        hf_files_*.csv     → merged_hf_files.csv
    """
    # 현재 폴더의 csv 파일 모으기
    files = glob.glob(os.path.join(folder, "*.csv"))
    
    groups = defaultdict(list)
    for f in files:
        basename = os.path.basename(f)
        # 파일명에서 접두사 추출 (마지막 '_' 앞까지)
        prefix = "_".join(basename.split("_")[:-1])
        groups[prefix].append(f)
    
    for prefix, group_files in groups.items():
        # 숫자 정렬을 위해 key=int 처리
        group_files_sorted = sorted(group_files, key=lambda x: int(os.path.splitext(x)[0].split("_")[-1]))
        
        dfs = [pd.read_csv(f, encoding='utf-8-sig', low_memory=False) for f in group_files_sorted]
        merged = pd.concat(dfs, ignore_index=True)
        
        output_file = f"{prefix}.csv"
        merged.to_csv(output_file, index=False, encoding='utf-8-sig')
        print(f"Saved {output_file} with {len(merged)} rows")

In [0]:
# 사용 예시
merge_csv_by_prefix(folder="split", output_prefix="merged")

  dfs = [pd.read_csv(f, encoding='utf-8-sig') for f in group_files_sorted]


Saved hf_community.csv with 317436 rows


[0;31m---------------------------------------------------------------------------[0m
[0;31mOSError[0m                                   Traceback (most recent call last)
File [0;32m/databricks/python/lib/python3.10/site-packages/pandas/io/formats/csvs.py:261[0m, in [0;36mCSVFormatter.save[0;34m(self)[0m
[1;32m    251[0m [38;5;28mself[39m[38;5;241m.[39mwriter [38;5;241m=[39m csvlib[38;5;241m.[39mwriter(
[1;32m    252[0m     handles[38;5;241m.[39mhandle,
[1;32m    253[0m     lineterminator[38;5;241m=[39m[38;5;28mself[39m[38;5;241m.[39mlineterminator,
[0;32m   (...)[0m
[1;32m    258[0m     quotechar[38;5;241m=[39m[38;5;28mself[39m[38;5;241m.[39mquotechar,
[1;32m    259[0m )
[0;32m--> 261[0m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_save[49m[43m([49m[43m)[49m

File [0;32m/databricks/python/lib/python3.10/site-packages/pandas/io/formats/csvs.py:266[0m, in [0;36mCSVFormatter._save[0;34m(self)[0m
[1;32m    265[0m     [38;5;