In [14]:
%%capture
!pip install import_ipynb --no-cache
import import_ipynb

In [15]:
%%capture
m = __import__("Methods")

# Feature Voting
- Weighs the importance of each feature by combining its Shapley (SHAP) values with how often it appears across models, giving more weight to models with a lower RMSE and a higher R²

In [12]:
import pandas as pd
import numpy as np
import ast
from collections import defaultdict

### Dataset Cleaning

In [4]:
# Load the saved model-evaluation results. Later cells read the
# "Target Variable", "RMSE", "R2", "Selected Features" and "Sorted Shap Values"
# columns of this frame.
df = pd.read_csv("data/results/model_results_n3000.csv")

In [18]:
# Restrict the results to the models trained on the th_positive_cells target
# and leave out the "Selected Features" column. `drop` returns a new frame, so
# `df` itself is left untouched.
is_th_target = df["Target Variable"] == "th_positive_cells"
repo_df = df.loc[is_th_target].drop(columns=["Selected Features"])
repo_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Sorted Shap Values
0,ANN_Final_Test,th_positive_cells,2652.076971,51.49832,28.483265,-0.809668,,"{'Fbp1': 0.6175863081031662, 'gammaTry': 0.156..."
1,Linear_Regression_Final_Test,th_positive_cells,2042.892604,45.198369,25.071024,-0.393986,,"{'CG3819': 0.07599323966654625, 'Dfd': 0.05537..."
2,Ridge_Final_Test,th_positive_cells,2030.433041,45.060327,24.990324,-0.385484,,"{'Fbp1': 0.03646496059075438, 'CG3819': 0.0326..."
3,Lasso_Final_Test,th_positive_cells,1986.779792,44.573308,24.732177,-0.355697,,"{'snk': 0.5626258972541195, 'lncRNA:CR45188': ..."
4,SVM_Final_Test,th_positive_cells,2038.254701,45.147034,25.050882,-0.390821,,"{'Fbp1': 0.032755632660914955, 'CG3819': 0.029..."
5,Random_Forest_Final_Test,th_positive_cells,2051.461866,45.293066,25.037196,-0.399833,,"{'wbl': 0.00345565416948028, 'lncRNA:CR45809':..."


### RMSE & SHAP Summary Statistics

In [19]:
# Pool the SHAP values of every selected model into one flat list.
# "Sorted Shap Values" stores each model's {feature: value} dict as a string,
# so every entry is parsed with ast.literal_eval before its values are read.
shap_values_list = [
    value
    for shap_text in repo_df['Sorted Shap Values']
    for value in ast.literal_eval(shap_text).values()
]

# One RMSE per model, in row order
rmse_list = repo_df['RMSE'].tolist()

# Wrap both lists in Series so describe() can summarise them
shap_values_series = pd.Series(shap_values_list)
rmse_series = pd.Series(rmse_list)

shap_stats = shap_values_series.describe()
rmse_stats = rmse_series.describe()

# Report the two summaries
print("Summary Statistics for SHAP Values:")
print(shap_stats)
print("\nSummary Statistics for RMSE:")
print(rmse_stats)

Summary Statistics for SHAP Values:
count    18000.000000
mean         0.000733
std          0.006868
min          0.000000
25%          0.000000
50%          0.000055
75%          0.000364
max          0.617586
dtype: float64

Summary Statistics for RMSE:
count     6.000000
mean     46.128404
std       2.642762
min      44.573308
25%      45.082004
50%      45.172702
75%      45.269392
max      51.498320
dtype: float64


### Ensemble Feature Voting Implementation

In [20]:
from collections import defaultdict
import pandas as pd
import ast

# Ensemble feature voting.
#
# For every model row in repo_df, each feature with a non-zero SHAP value
# contributes to two running totals:
#   * feature_importance: SHAP value scaled by the model weight
#   * feature_frequency:  (1 / RMSE) scaled by the model weight, i.e. a
#     quality-weighted count of the models in which the feature appears
# The final score of a feature is the product of its two totals.
feature_importance = defaultdict(float)
feature_frequency = defaultdict(float)

for index, row in repo_df.iterrows():
    # The SHAP values are stored as the string form of a dict; parse them back
    shap_values = ast.literal_eval(row['Sorted Shap Values'])
    rmse = row['RMSE']
    r2 = row['R2']

    # Model weight: (1 + R²), so models with a higher R² count for more.
    # R² has no lower bound, so for R² < -1 this would be negative. Because the
    # two totals are multiplied together below, a negative weight on both
    # yields a positive score that grows as the model gets worse. Clamp at
    # zero so such a model simply contributes nothing.
    weight_adjustment = 1 + r2
    if weight_adjustment < 0:
        weight_adjustment = 0.0

    for feature, shap_value in shap_values.items():
        if shap_value != 0:  # Only consider non-zero SHAP values
            # SHAP contribution, scaled by the model weight
            feature_importance[feature] += shap_value * weight_adjustment

            # Quality-weighted vote: a lower RMSE gives a larger increment
            feature_frequency[feature] += (1 / rmse) * weight_adjustment

# Final score = weighted SHAP total x weighted vote total
for feature in feature_importance:
    feature_importance[feature] *= feature_frequency[feature]

# Rank all features by their final score, highest first
sorted_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)

# Full ranking (every feature, not only the top 20) as a DataFrame
df_sorted_features = pd.DataFrame(sorted_features, columns=['Gene', 'Weight'])

# Persist the full ranking; the top of the table is displayed in the next cell
df_sorted_features.to_csv("data/results/n3000_th_feature_importance.csv", index=False)

In [10]:
df_sorted_features.head(20)

Unnamed: 0,Gene,Weight
0,ddbt,0.003142
1,prd,0.001488
2,lncRNA:CR45937,0.001281
3,CG46388,0.000754
4,lncRNA:CR46084,0.000615
5,gammaTry,0.000529
6,CR43687,0.000484
7,Acp53Ea,0.000482
8,Fbp2,0.000473
9,deltaTry,0.000471


# Gene Set Analysis

In [21]:
# Names of the fly libraries available for PEA and GSEA
gene_sets = m.gp.get_library_name(organism="fly")

# Leave this one library out of the analysis (remove() raises ValueError if the
# name is not in the list)
excluded_library = "Allele_Phenotypes_from_FlyBase_2017"
gene_sets.remove(excluded_library)

# Gene symbols of the 20 highest-weighted features
top_features = df_sorted_features.head(20)
gene_list = top_features['Gene'].tolist()

In [22]:
# Enrichr enrichment analysis of the top-ranked genes against every selected
# fly library
enr = m.gp.enrichr(
    gene_list=gene_list,   # gene symbols taken from the feature ranking
    gene_sets=gene_sets,   # library names gathered in the previous cell
    organism='fly',        # Drosophila
    outdir=None,
)

# Display only the terms whose adjusted p-value is below 0.05
is_significant = enr.results['Adjusted P-value'] < 0.05
enr.results.loc[is_significant, ['Term', 'Genes', 'Adjusted P-value']]

Unnamed: 0,Term,Genes,Adjusted P-value
0,subesophageal ganglion,Dfd,0.038815
1,embryonic mandibular segment,Dfd,0.038815
2,intestinal stem cell of posterior adult midgut...,GATAe,0.038815
3,embryonic maxillary segment,Dfd,0.038815
4,segmental nerve branch SNa of A1-7,Dab,0.038815
...,...,...,...
1188,Hemocyanin_C,Fbp1,0.006980
1189,Trypsin,snk;gammaTry,0.001768
1190,Hemocyanin_N,Fbp1,0.006980
1191,PAX,prd,0.006980


In [23]:
# Terms with an adjusted p-value below 0.05, ordered from the largest to the
# smallest adjusted p-value (ascending=False puts the least significant of the
# retained terms first — NOTE(review): confirm this ordering is intended).
significant_terms = enr.results[enr.results['Adjusted P-value'] < 0.05]
results = (
    significant_terms[['Term', 'Genes', 'Adjusted P-value']]
    .sort_values(by=['Adjusted P-value'], ascending=False)
)

In [26]:
# Lift pandas' row-count and column-width display limits so the whole table
# and the full term names are shown. These settings stay in effect for every
# later cell in the session.
for option_name in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)
results

Unnamed: 0,Term,Genes,Adjusted P-value
406,"maternal specification of dorsal/ventral axis, oocyte, germ-line encoded (GO:0007311)",snk,0.048905
407,enteroendocrine cell differentiation (GO:0035883),GATAe,0.048905
415,anterior/posterior pattern specification (GO:0009952),prd;Dfd,0.048905
429,positive regulation of transcription from RNA polymerase II promoter (GO:0045944),prd;Dfd;GATAe,0.048905
434,"positive regulation of transcription, DNA-templated (GO:0045893)",prd;Dfd;GATAe,0.048905
403,endoderm development (GO:0007492),GATAe,0.048905
404,glandular epithelial cell differentiation (GO:0002067),GATAe,0.048905
409,positive regulation of apoptotic process involved in development (GO:1904747),Dfd,0.048905
408,digestive system development (GO:0055123),GATAe,0.048905
410,periodic partitioning by pair rule gene (GO:0007366),prd,0.048905
