In [7]:
# Result tables to combine in the vote, one per model run (judging by the
# paths: ANN, naive Bayes and ridge baselines — confirm against the run log).
csvs = [
    "results/model/RR1_ann_baseline_20250618_040609/ann.csv",
    "results/model/RR1_nb_baseline_20250618_043432/naive_bayes.csv",
    "results/model/RR1_ridge_baseline_20250618_043002/ridge.csv",
]

# Aggregate the per-model feature lists and keep the 30 highest-weighted
# features; the call also writes them to a CSV and prints where it was saved.
top_features_df = vote_top_features(csvs, top_k=30)
print(top_features_df)

Top 30 features saved to: results/analysis/fv_RR1_rf_baseline_rf_20250618_120513.csv
                  Gene        Weight
0   ENSMUSG00000117874  6.115549e+06
1   ENSMUSG00000075318  4.941829e+06
2   ENSMUSG00000111709  4.358415e+06
3   ENSMUSG00000109564  4.084837e+06
4   ENSMUSG00000094686  4.009557e+06
5   ENSMUSG00000078901  3.937099e+06
6   ENSMUSG00000032323  3.879130e+06
7   ENSMUSG00000049436  3.792735e+06
8   ENSMUSG00000041794  3.772277e+06
9   ENSMUSG00000035373  3.749997e+06
10  ENSMUSG00000027379  3.636731e+06
11  ENSMUSG00000027500  3.608068e+06
12  ENSMUSG00000022613  3.576395e+06
13  ENSMUSG00000074183  3.537088e+06
14  ENSMUSG00000034918  3.516369e+06
15  ENSMUSG00000028587  3.510219e+06
16  ENSMUSG00000027871  3.499880e+06
17  ENSMUSG00000022878  3.490073e+06
18  ENSMUSG00000035352  3.485940e+06
19  ENSMUSG00000054083  3.472447e+06
20  ENSMUSG00000027070  3.436315e+06
21  ENSMUSG00000021697  3.420134e+06
22  ENSMUSG00000079494  3.408869e+06
23  ENSMUSG00000026255  3.3

In [3]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict

def _min_max_scale(series):
    """Min-max scale a numeric Series to the range [0, 1].

    A Series whose values are all identical has no range to scale by; it is
    mapped to all zeros instead of dividing by zero (which would yield NaN).
    """
    lo, hi = series.values.min(), series.values.max()
    span = hi - lo
    if span == 0:
        return pd.Series(0.0, index=series.index)
    return (series - lo) / span


def _parse_feature_scores(raw):
    """Parse a ``"name:score; name:score"`` string into a ``{name: float}`` dict.

    Returns an empty dict for a missing cell (pandas reads an empty CSV field
    back as a float NaN, not a string) or a string with no ``name:score`` pairs.
    """
    if not isinstance(raw, str):
        return {}
    scores = {}
    for pair in raw.strip("; ").split("; "):
        if ":" in pair:
            parts = pair.split(":")
            scores[parts[0]] = float(parts[1])
    return scores


def vote_top_features(csv_paths, config_path="config.json", top_k=20):
    """Combine per-run feature scores from several result CSVs into one ranking.

    Every row of every CSV is treated as one run. Its ``rmse`` and ``r2`` are
    min-max scaled across all rows, and its ``top_features`` string
    (``"name:score; name:score; ..."``) is min-max scaled within the row.
    With ``w = 1 + r2_scaled`` for a row, each feature accumulates

    * importance: sum of ``score * w`` over rows where its scaled score is > 0
    * frequency:  sum of ``w / (rmse_scaled + 1e-6)`` over those same rows

    and its final ``Weight`` is ``importance * frequency``.

    Args:
        csv_paths: Iterable of CSV file paths. Each file must provide the
            columns ``rmse``, ``r2`` and ``top_features``.
        config_path: Path to a JSON config. Its ``experiment_name`` and
            ``models`` entries are used only to build the output file name.
        top_k: Number of highest-weighted features to keep.

    Returns:
        DataFrame with columns ``Gene`` and ``Weight``, sorted by descending
        weight and truncated to ``top_k`` rows.

    Raises:
        FileNotFoundError: If the config or one of the CSV files is missing.
        KeyError: If a required column is absent from the combined table.
        ValueError: If ``csv_paths`` is empty, the CSVs contain no rows, or a
            feature score cannot be parsed as a float.

    Side effects:
        Creates ``results/analysis`` if needed, writes the returned table
        there as a timestamped CSV, and prints the output path.
    """
    # The config only contributes to the output file name.
    with open(config_path) as f:
        config = json.load(f)

    experiment_name = config.get("experiment_name", "experiment")
    models_used = "_".join(config.get("models", []))
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_dir = "results/analysis"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"fv_{experiment_name}_{models_used}_{timestamp}.csv")

    feature_importance = defaultdict(float)
    feature_frequency = defaultdict(float)

    all_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

    # Scale the metrics across all runs so rows from different models are comparable.
    rmse_scaled_col = _min_max_scale(all_df["rmse"])
    r2_scaled_col = _min_max_scale(all_df["r2"])

    for rmse_scaled, r2_scaled, raw_features in zip(
        rmse_scaled_col, r2_scaled_col, all_df["top_features"]
    ):
        feature_dict = _parse_feature_scores(raw_features)
        if not feature_dict:
            # Nothing to vote on: missing or unparsable feature list for this run.
            continue

        weight_adjustment = 1 + r2_scaled

        # Min-max scale the scores within this run. When all scores are equal
        # there is no range, so the raw scores are used unchanged.
        values = np.array(list(feature_dict.values()))
        lo, hi = values.min(), values.max()
        if hi > lo:
            span = hi - lo
            feature_dict = {k: (v - lo) / span for k, v in feature_dict.items()}

        for feature, score in feature_dict.items():
            # After scaling, the lowest-scored feature of a run is 0 and gets no vote.
            if score > 0:
                feature_importance[feature] += score * weight_adjustment
                # The epsilon keeps the best run (rmse_scaled == 0) finite.
                feature_frequency[feature] += (1 / (rmse_scaled + 1e-6)) * weight_adjustment

    for feature in feature_importance:
        feature_importance[feature] *= feature_frequency[feature]

    sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    top_df = pd.DataFrame(sorted_features, columns=["Gene", "Weight"]).head(top_k)

    top_df.to_csv(output_path, index=False)
    print(f"Top {top_k} features saved to: {output_path}")

    return top_df
