In [2]:
import pandas as pd
import os
import re

def merge_seed_csvs(folder_path, output_dir=None):
    """
    合并指定文件夹中不同seed的B_511和B_521 csv文件
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录，如果为None则输出到原文件夹
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)
    
    # 用于存储不同前缀的文件数据
    file_groups = {}
    
    # 遍历文件夹中的所有文件
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # 只处理csv文件
        if not filename.endswith('.csv'):
            continue
            
        # 匹配文件名模式：B_511_seedXXXX.csv 或 B_521_seedXXXX.csv
        match = re.match(r'(B_5[12]1)_seed\d+\.csv', filename)
        
        if match:
            prefix = match.group(1)  # 获取前缀 (B_511 或 B_521)
            
            try:
                # 读取csv文件
                df = pd.read_csv(file_path)
                
                # 添加一个列来记录原始文件名（可选）
                df['source_file'] = filename
                
                # 将数据添加到对应的分组中
                if prefix not in file_groups:
                    file_groups[prefix] = []
                file_groups[prefix].append(df)
                
                print(f"已读取: {filename}")
                
            except Exception as e:
                print(f"读取文件 {filename} 时出错: {e}")
    
    # 合并并保存每个分组的数据
    for prefix, dfs in file_groups.items():
        if dfs:
            try:
                # 合并所有DataFrame
                combined_df = pd.concat(dfs, ignore_index=True)
                
                # 输出文件路径
                output_path = os.path.join(output_dir, f"{prefix}.csv")
                
                # 保存到csv文件
                combined_df.to_csv(output_path, index=False)
                print(f"\n已成功合并并保存: {output_path}")
                print(f"合并了 {len(dfs)} 个文件")
                print(f"总行数: {len(combined_df)}")
                
            except Exception as e:
                print(f"合并 {prefix} 文件时出错: {e}")
        else:
            print(f"没有找到 {prefix} 相关的文件")
    
    return file_groups

def merge_seed_csvs_with_options(folder_path, output_dir=None, add_source_column=True):
    """
    更灵活的合并函数，带有更多选项
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录
    add_source_column (bool): 是否添加源文件名列
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    # 使用字典存储不同模式的文件
    patterns = {
        'B_511': re.compile(r'B_511_seed\d+\.csv'),
        'B_521': re.compile(r'B_521_seed\d+\.csv')
    }
    
    results = {}
    
    for pattern_name, pattern in patterns.items():
        matched_dfs = []
        
        for filename in os.listdir(folder_path):
            if pattern.match(filename):
                file_path = os.path.join(folder_path, filename)
                
                try:
                    df = pd.read_csv(file_path)
                    
                    if add_source_column:
                        df['source_file'] = filename
                    
                    matched_dfs.append(df)
                    print(f"已读取: {filename} -> {pattern_name}")
                    
                except Exception as e:
                    print(f"读取文件 {filename} 时出错: {e}")
        
        if matched_dfs:
            # 合并数据
            combined_df = pd.concat(matched_dfs, ignore_index=True)
            
            # 保存文件
            output_path = os.path.join(output_dir, f"{pattern_name}.csv")
            combined_df.to_csv(output_path, index=False)
            
            results[pattern_name] = {
                'file_count': len(matched_dfs),
                'total_rows': len(combined_df),
                'output_path': output_path
            }
            
            print(f"\n✅ {pattern_name}: 合并了 {len(matched_dfs)} 个文件")
            print(f"   总行数: {len(combined_df)}")
            print(f"   保存到: {output_path}")
        else:
            print(f"\n⚠️  未找到 {pattern_name} 相关的文件")
    
    return results

# 使用示例
if __name__ == "__main__":
    # 示例用法1：基本用法
    folder = "/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B2025-12-03_2HDM_B_test/LLP_data"
    # result = merge_seed_csvs(folder)
    
    # 示例用法2：指定输出目录
    result = merge_seed_csvs_with_options(folder, output_dir="/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B2025-12-03_2HDM_B_test", add_source_column=False)
    
    # 示例用法3：使用更灵活的函数（不添加源文件名列）
    # result = merge_seed_csvs_with_options(
    #     folder, 
    #     output_dir="/path/to/output",
    #     add_source_column=False
    # )

已读取: B_511_seed443699.csv -> B_511
已读取: B_511_seed96495.csv -> B_511
已读取: B_511_seed438768.csv -> B_511
已读取: B_511_seed89831.csv -> B_511
已读取: B_511_seed627755.csv -> B_511
已读取: B_511_seed168638.csv -> B_511
已读取: B_511_seed673078.csv -> B_511
已读取: B_511_seed151292.csv -> B_511
已读取: B_511_seed917094.csv -> B_511
已读取: B_511_seed583821.csv -> B_511
已读取: B_511_seed617254.csv -> B_511
已读取: B_511_seed925593.csv -> B_511
已读取: B_511_seed439554.csv -> B_511
已读取: B_511_seed112079.csv -> B_511
已读取: B_511_seed728879.csv -> B_511
已读取: B_511_seed67965.csv -> B_511
已读取: B_511_seed689990.csv -> B_511
已读取: B_511_seed786331.csv -> B_511
已读取: B_511_seed555683.csv -> B_511
已读取: B_511_seed405183.csv -> B_511
已读取: B_511_seed934185.csv -> B_511
已读取: B_511_seed208857.csv -> B_511
已读取: B_511_seed573320.csv -> B_511
已读取: B_511_seed377445.csv -> B_511
已读取: B_511_seed781074.csv -> B_511
已读取: B_511_seed895020.csv -> B_511
已读取: B_511_seed624401.csv -> B_511
已读取: B_511_seed429264.csv -> B_511
已读取: B_511_seed79067.cs