In [2]:
import os
import pandas as pd
import glob
import re
# Lift the row cap so printed/displayed DataFrames are never truncated with "...".
pd.options.display.max_rows = None



In [3]:

# --- 1. Configuration ---
# Change only MODEL_NAME to analyse the experiment logs of a different model.
MODEL_NAME = 'DCNv2'  # e.g. 'DeepFM', 'AutoInt', 'DeepIM'

# Root directory; the logs of MODEL_NAME are searched under <project_root>/<MODEL_NAME>.
project_root = '/data2/wangzhongren/taolin_project/FuxiCTR/model_zoo/'
model_dir = os.path.join(project_root, MODEL_NAME)

# --- 2. Parsing rules ---
# Every experiment parameter is encoded in the log file name, for example:
#   beauty_me_scala1_seed20_lr0.01.log
#   beauty_base_scala0_seed20_lr0.01.log
# `\w` also matches '_' and the dataset group is greedy, so in a name such as
# `amazon_beauty_me_scala1_...` the dataset is `amazon_beauty` and the method is `me`.
log_pattern = re.compile(
    r'(?P<dataset>\w+)_'          # dataset (e.g. beauty)
    r'(?P<method>\w+)_'           # method (e.g. me, base)
    r'scala(?P<scale>\d+)_'       # scale (e.g. 1, 0)
    r'seed(?P<seed>\d+)_'         # random seed (e.g. 20)
    r'lr(?P<lr>[\d\.]+)'          # learning rate (e.g. 0.01)
    r'\.log'
)

# Locate the "AUC: <value>" / "logloss: <value>" tokens in a metrics line.
# `(?<!\S)` requires the label to start a whitespace-delimited token, so a label
# such as `val_AUC:` is not mistaken for `AUC:`.
auc_pattern = re.compile(r'(?<!\S)AUC:\s*(\S+)')
logloss_pattern = re.compile(r'(?<!\S)logloss:\s*(\S+)')


def parse_log_filename(filename):
    """Extract the experiment parameters encoded in a log file name.

    Args:
        filename: Base name of the log file (no directory part).

    Returns:
        A dict with the string-valued keys 'dataset', 'method', 'scale', 'seed'
        and 'lr', or None when the name does not follow the expected layout.
    """
    # fullmatch (not match) so that names carrying extra text after ".log",
    # e.g. "..._lr0.01.log_old.log", are not counted as a second run.
    match = log_pattern.fullmatch(filename)
    return match.groupdict() if match else None


def read_final_metrics(log_file_path):
    """Read the AUC and logloss reported on the last non-blank line of a log.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        A tuple (auc, logloss) of floats, or None when the file has no
        non-blank line or its last non-blank line mentions no AUC/logloss.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the line mentions both metrics but a value cannot be
            located or converted to float.
    """
    last_line = ''
    # Stream the file instead of loading it whole; undecodable bytes are replaced
    # rather than aborting the run, since only the metrics line matters.
    with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            stripped = line.strip()
            if stripped:
                # Trailing blank lines must not hide the metrics line.
                last_line = stripped

    if 'AUC' not in last_line or 'logloss' not in last_line:
        return None

    auc_match = auc_pattern.search(last_line)
    logloss_match = logloss_pattern.search(last_line)
    if auc_match is None or logloss_match is None:
        raise ValueError(f"cannot locate 'AUC:'/'logloss:' values in {last_line!r}")

    # Tolerate a trailing separator glued to the number (e.g. "0.6732,").
    auc = float(auc_match.group(1).rstrip(',;'))
    logloss = float(logloss_match.group(1).rstrip(',;'))
    return auc, logloss


def parse_log_file(log_file_path):
    """Build one result row from a log file.

    Args:
        log_file_path: Path of the log file; its base name carries the parameters.

    Returns:
        A dict with the keys 'Model', 'Dataset', 'Method', 'Scale', 'Seed',
        'LR', 'AUC' and 'Logloss', or None when the file name does not match
        `log_pattern` or the file holds no final metrics line.

    Raises:
        OSError, ValueError: Propagated from `read_final_metrics` or from
            converting the file-name parameters to numbers.
    """
    params = parse_log_filename(os.path.basename(log_file_path))
    if params is None:
        return None

    metrics = read_final_metrics(log_file_path)
    if metrics is None:
        return None

    auc, logloss = metrics
    return {
        'Model': MODEL_NAME,
        'Dataset': params['dataset'],
        'Method': params['method'],
        'Scale': int(params['scale']),
        'Seed': int(params['seed']),
        'LR': float(params['lr']),
        'AUC': auc,
        'Logloss': logloss
    }


# --- 3. Collect results ---
all_results_data = []
skipped_count = 0   # files that do not match the naming scheme or carry no metrics
failed_count = 0    # files that raised while being read or converted

# `**` together with recursive=True searches every sub-directory of model_dir.
search_path = os.path.join(model_dir, '**', '*.log')
log_files = glob.glob(search_path, recursive=True)

print(f"在 '{model_dir}' 目录下共找到 {len(log_files)} 个 .log 文件。开始解析...")

for log_file_path in log_files:
    filename = os.path.basename(log_file_path)
    try:
        result_entry = parse_log_file(log_file_path)
    except (IOError, IndexError, ValueError) as e:
        failed_count += 1
        print(f"  - 错误: 处理文件 '{filename}' 时出错: {e}")
        continue

    if result_entry is None:
        skipped_count += 1
    else:
        all_results_data.append(result_entry)

# Make silently skipped files visible, so a mismatch between "found" and "parsed" is noticed.
print(f"解析完成: 成功 {len(all_results_data)} 个，跳过 {skipped_count} 个，出错 {failed_count} 个。")

# --- 4. Raw results table ---
if not all_results_data:
    print("\n未能解析到任何有效数据，请检查路径和文件格式。")
else:
    # One row per parsed log file.
    df = pd.DataFrame(all_results_data)

    # Sort for readability; later cells read `df_sorted`.
    df_sorted = df.sort_values(by=['Dataset', 'Method', 'Scale', 'LR', 'Seed']).reset_index(drop=True)

    print("\n--- 原始数据汇总 ---")
    print(df_sorted)

    # --- 5. Aggregated report ---
    # Group by dataset, method, scale and learning rate; aggregate over seeds.
    summary = df_sorted.groupby(['Dataset', 'Method', 'Scale', 'LR']).agg(
        Mean_AUC=('AUC', 'mean'),
        Std_AUC=('AUC', 'std'),
        Mean_Logloss=('Logloss', 'mean'),
        Run_Count=('Seed', 'count')  # number of seeds (runs) in the group
    ).reset_index()

    print("\n--- 聚合分析报告 (按种子求平均) ---")
    print(summary)

    


在 '/data2/wangzhongren/taolin_project/FuxiCTR/model_zoo/DCNv2' 目录下共找到 290 个 .log 文件。开始解析...

--- 原始数据汇总 ---
     Model Dataset Method  Scale  Seed     LR     AUC  Logloss
0    DCNv2  beauty   base      0    20  0.001  0.6698   0.5068
1    DCNv2  beauty   base      0   201  0.001  0.6711   0.5061
2    DCNv2  beauty   base      0  1027  0.001  0.6687   0.5076
3    DCNv2  beauty   base      0    20  0.005  0.6701   0.5069
4    DCNv2  beauty   base      0   201  0.005  0.6711   0.5058
5    DCNv2  beauty   base      0  1027  0.005  0.6702   0.5077
6    DCNv2  beauty   base      0    20  0.010  0.6713   0.5064
7    DCNv2  beauty   base      0   201  0.010  0.6717   0.5058
8    DCNv2  beauty   base      0  1027  0.010  0.6708   0.5064
9    DCNv2  beauty     me      1    20  0.001  0.6732   0.5052
10   DCNv2  beauty     me      1   201  0.001  0.6724   0.5056
11   DCNv2  beauty     me      1  1027  0.001  0.6731   0.5053
12   DCNv2  beauty     me      1    20  0.005  0.6768   0.5032
13   DCNv2

In [4]:
# Show the per-(Dataset, Method, Scale, LR) aggregated report as the cell's output.
summary

Unnamed: 0,Dataset,Method,Scale,LR,Mean_AUC,Std_AUC,Mean_Logloss,Run_Count
0,beauty,base,0,0.001,0.669867,0.001201,0.506833,3
1,beauty,base,0,0.005,0.670467,0.000551,0.5068,3
2,beauty,base,0,0.01,0.671267,0.000451,0.5062,3
3,beauty,me,1,0.001,0.6729,0.000436,0.505367,3
4,beauty,me,1,0.005,0.676433,0.000907,0.503533,3
5,beauty,me,1,0.01,0.676667,0.000503,0.503633,3
6,beauty,me,3,0.001,0.667133,0.00265,0.508233,3
7,beauty,me,3,0.005,0.675167,0.000764,0.5043,3
8,beauty,me,3,0.01,0.673567,0.000929,0.505333,3
9,beauty,me,7,0.001,0.654567,0.006658,0.5625,3


In [8]:

# Learning rate whose runs are aggregated in this cell.
TARGET_LR = 0.005

print(f"\n--- 特定分析：仅针对 LR = {TARGET_LR} 的结果进行聚合 ---")
# Step 1: keep only the runs whose LR equals the target value.
df_lr_filtered = df_sorted[df_sorted['LR'] == TARGET_LR].copy()

# Start from an empty report with the expected columns so the final display below
# always has something to show. Without this, an LR with no matching runs raised
# NameError on a fresh kernel, or silently showed a stale table from an earlier run.
lr_specific_summary = pd.DataFrame(
    columns=['Dataset', 'Method', 'Scale', 'LR', 'Mean_AUC', 'Std_AUC', 'Mean_Logloss', 'Run_Count']
)

if df_lr_filtered.empty:
    print(f"未找到任何 LR = {TARGET_LR} 的实验数据。")
else:
    # Step 2: group the filtered runs and aggregate over seeds.
    # LR is constant after filtering, but it stays in the group keys so the
    # resulting table still shows which learning rate it describes.
    lr_specific_summary = df_lr_filtered.groupby(['Dataset', 'Method', 'Scale', 'LR']).agg(
        Mean_AUC=('AUC', 'mean'),
        Std_AUC=('AUC', 'std'),
        Mean_Logloss=('Logloss', 'mean'),
        Run_Count=('Seed', 'count')  # number of seeds (runs) aggregated per group
    ).reset_index()

    print(f"\n--- 聚合报告 (LR={TARGET_LR}, 按种子求平均) ---")

# Last top-level expression of the cell: rendered as a rich table.
lr_specific_summary


--- 特定分析：仅针对 LR = 0.005 的结果进行聚合 ---

--- 聚合报告 (LR=0.005, 按种子求平均) ---


Unnamed: 0,Dataset,Method,Scale,LR,Mean_AUC,Std_AUC,Mean_Logloss,Run_Count
0,beauty,base,0,0.005,0.670467,0.000551,0.5068,3
1,beauty,me,1,0.005,0.676433,0.000907,0.503533,3
2,beauty,me,3,0.005,0.675167,0.000764,0.5043,3
3,beauty,me,7,0.005,0.664933,0.007328,0.599233,3
4,beauty,moc,1,0.005,0.676633,0.000503,0.5039,3
5,beauty,moc,3,0.005,0.6743,0.000557,0.5047,3
6,beauty,moc,7,0.005,0.670067,0.001137,0.506933,3
7,beauty,rq,1,0.005,0.676767,0.000208,0.5038,3
8,beauty,rq,3,0.005,0.6733,0.000954,0.505833,3
9,beauty,rq,7,0.005,0.6675,0.000458,0.508833,3


Unnamed: 0,Dataset,Method,Scale,LR,Mean_AUC,Std_AUC,Mean_Logloss,Run_Count
0,beauty,base,0,0.001,0.669867,0.001201,0.506833,3
1,beauty,base,0,0.005,0.670467,0.000551,0.5068,3
2,beauty,base,0,0.01,0.671267,0.000451,0.5062,3
3,beauty,me,1,0.001,0.6729,0.000436,0.505367,3
4,beauty,me,1,0.005,0.676433,0.000907,0.503533,3
5,beauty,me,1,0.01,0.676667,0.000503,0.503633,3
6,beauty,me,3,0.001,0.667133,0.00265,0.508233,3
7,beauty,me,3,0.005,0.675167,0.000764,0.5043,3
8,beauty,me,3,0.01,0.673567,0.000929,0.505333,3
9,beauty,me,7,0.001,0.654567,0.006658,0.5625,3
