In [8]:
import pandas as pd
import os
import re

def merge_seed_csvs(folder_path, output_dir=None):
    """
    合并指定文件夹中不同seed的B_511和B_521 csv文件
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录，如果为None则输出到原文件夹
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)
    
    # 用于存储不同前缀的文件数据
    file_groups = {}
    
    # 遍历文件夹中的所有文件
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # 只处理csv文件
        if not filename.endswith('.csv'):
            continue
            
        # 匹配文件名模式：B_511_seedXXXX.csv 或 B_521_seedXXXX.csv
        match = re.match(r'(B_5[12]1)_seed\d+\.csv', filename)
        
        if match:
            prefix = match.group(1)  # 获取前缀 (B_511 或 B_521)
            
            try:
                # 读取csv文件
                df = pd.read_csv(file_path)
                
                # 添加一个列来记录原始文件名（可选）
                df['source_file'] = filename
                
                # 将数据添加到对应的分组中
                if prefix not in file_groups:
                    file_groups[prefix] = []
                file_groups[prefix].append(df)
                
                print(f"已读取: {filename}")
                
            except Exception as e:
                print(f"读取文件 {filename} 时出错: {e}")
    
    # 合并并保存每个分组的数据
    for prefix, dfs in file_groups.items():
        if dfs:
            try:
                # 合并所有DataFrame
                combined_df = pd.concat(dfs, ignore_index=True)
                
                # 输出文件路径
                output_path = os.path.join(output_dir, f"{prefix}.csv")
                
                # 保存到csv文件
                combined_df.to_csv(output_path, index=False)
                print(f"\n已成功合并并保存: {output_path}")
                print(f"合并了 {len(dfs)} 个文件")
                print(f"总行数: {len(combined_df)}")
                
            except Exception as e:
                print(f"合并 {prefix} 文件时出错: {e}")
        else:
            print(f"没有找到 {prefix} 相关的文件")
    
    return file_groups

def merge_seed_csvs_with_options(folder_path, output_dir=None, add_source_column=True):
    """
    更灵活的合并函数，带有更多选项
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录
    add_source_column (bool): 是否添加源文件名列
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    # 使用字典存储不同模式的文件
    patterns = {
        'B_511': re.compile(r'B_511_seed\d+\.csv'),
        'B_521': re.compile(r'B_521_seed\d+\.csv')
    }
    
    results = {}
    
    for pattern_name, pattern in patterns.items():
        matched_dfs = []
        
        for filename in os.listdir(folder_path):
            print(filename)
            if pattern.match(filename):
                file_path = os.path.join(folder_path, filename)
                print(file_path)
                try:
                    df = pd.read_csv(file_path)
                    
                    if add_source_column:
                        df['source_file'] = filename
                    
                    matched_dfs.append(df)
                    print(f"已读取: {filename} -> {pattern_name}")
                    
                except Exception as e:
                    print(f"读取文件 {filename} 时出错: {e}")
        
        if matched_dfs:
            # 合并数据
            combined_df = pd.concat(matched_dfs, ignore_index=True)
            
            # 保存文件
            output_path = os.path.join(output_dir, f"{pattern_name}.csv")
            combined_df.to_csv(output_path, index=False)
            
            results[pattern_name] = {
                'file_count': len(matched_dfs),
                'total_rows': len(combined_df),
                'output_path': output_path
            }
            
            print(f"\n✅ {pattern_name}: 合并了 {len(matched_dfs)} 个文件")
            print(f"   总行数: {len(combined_df)}")
            print(f"   保存到: {output_path}")
        else:
            print(f"\n⚠️  未找到 {pattern_name} 相关的文件")
    
    return results

# 使用示例
if __name__ == "__main__":
    # 示例用法1：基本用法
    folder = "/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/14TeV_B/"
    # result = merge_seed_csvs(folder)
    
    # 示例用法2：指定输出目录
    result = merge_seed_csvs_with_options(folder, output_dir="/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP", add_source_column=False)
    
    # 示例用法3：使用更灵活的函数（不添加源文件名列）
    # result = merge_seed_csvs_with_options(
    #     folder, 
    #     output_dir="/path/to/output",
    #     add_source_column=False
    # )

B_511_seed000.csv
/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/14TeV_B/B_511_seed000.csv
已读取: B_511_seed000.csv -> B_511
B_521_seed000.csv

✅ B_511: 合并了 1 个文件
   总行数: 1730626
   保存到: /media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/B_511.csv
B_511_seed000.csv
B_521_seed000.csv
/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/14TeV_B/B_521_seed000.csv
已读取: B_521_seed000.csv -> B_521

✅ B_521: 合并了 1 个文件
   总行数: 1733661
   保存到: /media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/B_521.csv


In [9]:
# csv = '/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/B_blocks/test_scan_178/llp_simulation_results/incremental_results/llp_0130_result.csv'

import pandas as pd
import os
import re

def merge_csv(folder_path, out_path):
    all_df = pd.DataFrame()
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.csv'):
            df = pd.read_csv(file_path)
            all_df = pd.concat([all_df, df], ignore_index=True)
    all_df.to_csv(out_path+ '/merged.csv', index=False)
    print(f"Merged {len(os.listdir(folder_path))} files into {out_path}")
    return all_df

def merge_seed_csvs(folder_path, output_dir=None):
    """
    合并指定文件夹中不同seed的B_511和B_521 csv文件
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录，如果为None则输出到原文件夹
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)
    
    # 用于存储不同前缀的文件数据
    file_groups = {}
    
    # 遍历文件夹中的所有文件
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # 只处理csv文件
        if not filename.endswith('.csv'):
            continue

        # 匹配文件名模式：llp_0130_result.csv 或 llp_0131_result.csv 等
        # 规范化文件名以便后续的正则能匹配到，例如: 'llp_0130_result.csv' -> 'llp_0130.csv'
        match = re.search(r'(llp_\d+)', filename)
        if match:
            filename = f"{match.group(1)}.csv"
        # match = re.match(r'llp_\d+\.csv', filename)
        print(f"Processing file: {filename}, Match: {match}")
        
        if match:
            prefix = match.group(0)  # 获取前缀 (llp_0130.csv, llp_0131.csv, etc.)
            
            try:
                # 读取csv文件
                df = pd.read_csv(file_path)
                
                # 添加一个列来记录原始文件名（可选）
                # df['source_file'] = filename
                
                # 将数据添加到对应的分组中
                if prefix not in file_groups:
                    file_groups[prefix] = []
                file_groups[prefix].append(df)
                
                print(f"已读取: {filename}")
                
            except Exception as e:
                print(f"读取文件 {filename} 时出错: {e}")
    
    # 合并并保存每个分组的数据
    for prefix, dfs in file_groups.items():
        if dfs:
            try:
                # 合并所有DataFrame
                combined_df = pd.concat(dfs, ignore_index=True)
                
                # 输出文件路径
                output_path = os.path.join(output_dir, f"{prefix}.csv")
                
                # 保存到csv文件
                combined_df.to_csv(output_path, index=False)
                print(f"\n已成功合并并保存: {output_path}")
                print(f"合并了 {len(dfs)} 个文件")
                print(f"总行数: {len(combined_df)}")
                
            except Exception as e:
                print(f"合并 {prefix} 文件时出错: {e}")
        else:
            print(f"没有找到 {prefix} 相关的文件")
    
    return file_groups

def merge_seed_csvs_with_options(folder_path, output_dir=None, add_source_column=True):
    """
    更灵活的合并函数，带有更多选项
    
    参数:
    folder_path (str): 包含csv文件的文件夹路径
    output_dir (str): 输出文件的目录
    add_source_column (bool): 是否添加源文件名列
    """
    
    if output_dir is None:
        output_dir = folder_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    # 使用字典存储不同模式的文件
    patterns = {
        'B_511': re.compile(r'B_511_seed\d+\.csv'),
        'B_521': re.compile(r'B_521_seed\d+\.csv')
    }
    
    results = {}
    
    for pattern_name, pattern in patterns.items():
        matched_dfs = []
        
        for filename in os.listdir(folder_path):
            if pattern.match(filename):
                file_path = os.path.join(folder_path, filename)
                
                try:
                    df = pd.read_csv(file_path)
                    
                    if add_source_column:
                        df['source_file'] = filename
                    
                    matched_dfs.append(df)
                    print(f"已读取: {filename} -> {pattern_name}")
                    
                except Exception as e:
                    print(f"读取文件 {filename} 时出错: {e}")
        
        if matched_dfs:
            # 合并数据
            combined_df = pd.concat(matched_dfs, ignore_index=True)
            
            # 保存文件
            output_path = os.path.join(output_dir, f"{pattern_name}.csv")
            combined_df.to_csv(output_path, index=False)
            
            results[pattern_name] = {
                'file_count': len(matched_dfs),
                'total_rows': len(combined_df),
                'output_path': output_path
            }
            
            print(f"\n✅ {pattern_name}: 合并了 {len(matched_dfs)} 个文件")
            print(f"   总行数: {len(combined_df)}")
            print(f"   保存到: {output_path}")
        else:
            print(f"\n⚠️  未找到 {pattern_name} 相关的文件")
    
    return results

# 使用示例
if __name__ == "__main__":
    # 示例用法1：基本用法
    folder = "/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP/14TeV_B"
    result = merge_csv(folder, out_path='/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP')
    
    # 示例用法2：指定输出目录
    # result = merge_seed_csvs_with_options(folder, output_dir="/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B2025-12-03_2HDM_B_test", add_source_column=False)
    
    # 示例用法3：使用更灵活的函数（不添加源文件名列）
    # result = merge_seed_csvs_with_options(
    #     folder, 
    #     output_dir="/path/to/output",
    #     add_source_column=False
    # )

Merged 2 files into /media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Test/14TeV_LLP


In [3]:
def merge_B(folder, out):
    all_df = pd.DataFrame()
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if filename.endswith('.csv'):
            df = pd.read_csv(file_path)
            print(f"Reading file: {filename}")
            all_df = pd.concat([all_df, df], ignore_index=True)
    all_df.to_csv(out+ '/merged_B.csv', index=False)
    print(f"Merged {len(os.listdir(folder))} files into {out}")
    return all_df

merge_B('/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B/13TeV/2025-12-27_B_13TeV/', '/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B/13TeV/2025-12-27_B_13TeV')

Reading file: B_511.csv
Reading file: B_521.csv
Merged 4 files into /media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B/13TeV/2025-12-27_B_13TeV


Unnamed: 0,tau_input,decay_z,decay_x,e,pz,seed,decay_y,py,decay_t,px,total_events,m,id
0,3567550.0,-0.015358,-0.000125,44.1493,-43.82970,659230,0.000119,0.339456,0.015470,-0.355425,2,5.27958,511
1,3567550.0,1.913640,0.210519,96.8466,95.64330,659230,-0.193009,-9.646530,1.937720,10.521700,4,5.27958,511
2,3567550.0,0.035220,0.000008,123.8110,123.67000,659230,-0.000753,-2.644090,0.035260,0.027057,7,5.27958,511
3,3567550.0,-0.123037,-0.008053,25.6489,-25.02540,659230,0.005010,1.019020,0.126102,-1.637920,10,5.27958,511
4,3567550.0,0.108063,0.151274,12.7241,6.64133,659230,0.030382,1.867220,0.207037,9.297010,11,5.27958,511
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3633029,3567550.0,-79.005200,-0.695648,387.7580,-387.64200,801070,1.441670,7.073630,79.028700,-3.413230,39983,5.27925,521
3633030,3567550.0,-0.212545,0.064415,14.8988,-13.29090,801070,0.017745,1.109640,0.238258,4.028000,39991,5.27925,521
3633031,3567550.0,-0.184012,0.001747,45.1873,-44.56610,801070,-0.021732,-5.263270,0.186577,0.423139,39992,5.27925,521
3633032,3567550.0,-5.533700,0.109072,44.1473,-43.82200,801070,0.007055,0.055866,5.574790,0.863750,39998,5.27925,521


In [3]:
import pandas as pd
df = pd.read_csv('/media/ubuntu/6156e08b-fdb1-4cde-964e-431f74a6078e/Files/LLP_DATA/Decay_B/13TeV/2025-12-27_B_13TeV/merged_B.csv')
print(df[:-1].index)

RangeIndex(start=0, stop=3633033, step=1)
