# Feature Voting
- Weighs the importance of features based on the frequency between models, Shapley values, and lowest RMSE

In [66]:
import pandas as pd
import numpy as np
import ast
from collections import defaultdict

### Dataset Cleaning

In [67]:
df = pd.read_csv("data/results/n1500_update_results.csv")
rf_df = pd.read_csv("data/results/rf_results_n15000.csv")
rf_results = rf_df.iloc[[1]]

In [68]:
repo_df = df[df["Target Variable"] == "repo_glial_cells"].copy()
repo_df.iloc[3]["Top 20 SHAP Values"]
repo_df = repo_df.drop_duplicates(subset=["Model"])
repo_df = pd.concat([repo_df, rf_results], ignore_index=True)
repo_df.rename(columns={"Top 20 SHAP Values": "SHAP Values"}, inplace=True)
repo_df.drop(columns=["Selected Features"], inplace=True)
repo_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,SHAP Values
0,SVM_Final_Test,repo_glial_cells,50452.370024,224.61605,103.585149,-0.267967,,"{'CG8476': 0.03912131504773353, 'CG5681': 0.03..."
1,ANN_Final_Test,repo_glial_cells,50509.780284,224.74381,103.817923,-0.26941,,"{'lncRNA:CR46084': 0.07439023639314478, 'alpha..."
2,Linear_Regression_Final_Test,repo_glial_cells,50646.990268,225.048862,103.923537,-0.272858,,"{'CG1387': 0.05145064461810321, 'CG34002': 0.0..."
3,Ridge_Final_Test,repo_glial_cells,50639.497925,225.032215,103.89946,-0.27267,,"{'CG1387': 0.04993883728795643, 'CG34002': 0.0..."
4,Lasso_Final_Test,repo_glial_cells,48992.298861,221.34204,101.950948,-0.231273,,"[('Abd-B', 1.7060925591282226), ('7SLRNA:CR328..."
5,Random_Forest_Final_Test,repo_glial_cells,50450.015514,224.610809,103.684633,-0.267908,,"{'pie': 0.03872576910124749, 'alpha-Est6': 0.0..."


In [69]:
tuple_val = repo_df.iloc[4]["SHAP Values"]
dict_str = str(dict(ast.literal_eval(tuple_val)))

In [70]:
repo_df.at[4, "SHAP Values"] = dict_str
repo_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,SHAP Values
0,SVM_Final_Test,repo_glial_cells,50452.370024,224.61605,103.585149,-0.267967,,"{'CG8476': 0.03912131504773353, 'CG5681': 0.03..."
1,ANN_Final_Test,repo_glial_cells,50509.780284,224.74381,103.817923,-0.26941,,"{'lncRNA:CR46084': 0.07439023639314478, 'alpha..."
2,Linear_Regression_Final_Test,repo_glial_cells,50646.990268,225.048862,103.923537,-0.272858,,"{'CG1387': 0.05145064461810321, 'CG34002': 0.0..."
3,Ridge_Final_Test,repo_glial_cells,50639.497925,225.032215,103.89946,-0.27267,,"{'CG1387': 0.04993883728795643, 'CG34002': 0.0..."
4,Lasso_Final_Test,repo_glial_cells,48992.298861,221.34204,101.950948,-0.231273,,"{'Abd-B': 1.7060925591282226, '7SLRNA:CR32864'..."
5,Random_Forest_Final_Test,repo_glial_cells,50450.015514,224.610809,103.684633,-0.267908,,"{'pie': 0.03872576910124749, 'alpha-Est6': 0.0..."


### RMSE & SHAP Summary Statistics

In [71]:
import pandas as pd
import ast

# Prepare lists for SHAP values and RMSE
shap_values_list = []
rmse_list = []

for index, row in repo_df.iterrows():
    # Convert SHAP values from string to actual dictionary
    shap_values = ast.literal_eval(row['SHAP Values'])
    
    # Extend the list with the SHAP values
    shap_values_list.extend(shap_values.values())

    # Append the RMSE value
    rmse_list.append(row['RMSE'])

# Convert lists to pandas Series for statistical summary
shap_values_series = pd.Series(shap_values_list)
rmse_series = pd.Series(rmse_list)

# Display summary statistics
shap_stats = shap_values_series.describe()
rmse_stats = rmse_series.describe()

print("Summary Statistics for SHAP Values:")
print(shap_stats)
print("\nSummary Statistics for RMSE:")
print(rmse_stats)

Summary Statistics for SHAP Values:
count    9000.000000
mean        0.002087
std         0.018546
min         0.000000
25%         0.000000
50%         0.000056
75%         0.001619
max         1.706093
dtype: float64

Summary Statistics for RMSE:
count      6.000000
mean     224.232298
std        1.429155
min      221.342040
25%      224.612119
50%      224.679930
75%      224.960114
max      225.048862
dtype: float64


### Ensemble Feature Voting Implementation

In [73]:
import pandas as pd
import ast
from collections import defaultdict

feature_importance = defaultdict(float)

for index, row in repo_df.iterrows():
    shap_values = row['SHAP Values']
    
    # Convert SHAP values from string to actual data type
    shap_values = ast.literal_eval(shap_values)
    
    rmse = row['RMSE']

    # Normalize RMSE (lower RMSE = higher importance weight)
    rmse_weight = 1 / rmse

    # Iterate over the SHAP values of selected features
    for feature, shap_value in shap_values.items():
        # Update the feature importance by considering RMSE and SHAP values
        feature_importance[feature] += shap_value * rmse_weight

# Sort features by their importance scores
sorted_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)

# Create a dataframe with the sorted features
df_sorted_features = pd.DataFrame(sorted_features, columns=['Gene', 'Weight'])

df_sorted_features[:20]

Unnamed: 0,Gene,Weight
0,Abd-B,0.007708
1,CG34002,0.000521
2,CG5681,0.000488
3,CG8476,0.000476
4,Acp53Ea,0.000476
5,alpha-Est6,0.000469
6,CG1387,0.000451
7,CG6839,0.000405
8,snk,0.000398
9,CG18128,0.000375
