In [16]:
%%capture
!pip install import_ipynb --no-cache
import import_ipynb

In [17]:
%%capture
m = __import__("Methods")

# Feature Voting
- Weights each feature's importance by how often it is used across models, by its Shapley (SHAP) values, and by each model's performance (lower RMSE and higher R² count for more)

In [32]:
import pandas as pd
import numpy as np
import ast
from collections import defaultdict
# Keep rendered DataFrames compact: at most 10 rows, 30 columns, and
# 60 characters per cell (longer cell contents are truncated on display).
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', 60)

### Dataset Cleaning

In [19]:
# Load the saved per-model results: error metrics plus each model's SHAP
# values, which are stored in the CSV as a dict-literal string.
df = pd.read_csv("data/results/model_results_n3000.csv")

In [33]:
# Preview the loaded results table.
df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Selected Features,Sorted Shap Values
0,ANN_Final_Test,th_positive_cells,2652.076971,51.498320,28.483265,-0.809668,,"7SLRNA:CR32864,abd-A,Abd-B,Abl,abo,Ace,acj6,nAChRalpha1,...","{'Fbp1': 0.6175863081031662, 'gammaTry': 0.1568093957583..."
1,Linear_Regression_Final_Test,th_positive_cells,2042.892604,45.198369,25.071024,-0.393986,,"Cp38,cv-2,Dab,Dfd,disco,Delta,bsh,ea,Edg78E,Est-P,Taf9,e...","{'CG3819': 0.07599323966654625, 'Dfd': 0.055372169286853..."
2,Ridge_Final_Test,th_positive_cells,2030.433041,45.060327,24.990324,-0.385484,,"Fbp1,CG3819,GATAe,CG34002,CG5897,asRNA:CR44106,lncRNA:CR...","{'Fbp1': 0.03646496059075438, 'CG3819': 0.03262355306834..."
3,Lasso_Final_Test,th_positive_cells,1986.779792,44.573308,24.732177,-0.355697,,"7SLRNA:CR32864,abd-A,Abd-B,Abl,abo,Ace,acj6,nAChRalpha1,...","{'snk': 0.5626258972541195, 'lncRNA:CR45188': 0.18243525..."
4,SVM_Final_Test,th_positive_cells,2038.254701,45.147034,25.050882,-0.390821,,"Abd-B,acj6,Adh,al,amd,Argk1,Arr1,Arr2,ase,btd,by,c(3)G,C...","{'Fbp1': 0.032755632660914955, 'CG3819': 0.0297635502707..."
...,...,...,...,...,...,...,...,...,...
7,Linear_Regression_Final_Test,repo_glial_cells,50699.446595,225.165376,104.042219,-0.274177,,"Cp19,Cp36,cv-2,CycA,CycB,Cyt-b5-r,Dab,dec,Pkg21D,Dip-B,D...","{'CG7126': 0.03550376790631025, 'Cp36': 0.03255355884668..."
8,Ridge_Final_Test,repo_glial_cells,50730.989891,225.235410,104.083323,-0.274969,,"CG7126,CG43996,CG1387,CG14260,CR30409,asRNA:CR45371,CR43...","{'CG7126': 0.031388015131363314, 'CG43996': 0.0213343584..."
9,Lasso_Final_Test,repo_glial_cells,50522.362003,224.771800,103.872901,-0.269726,,"7SLRNA:CR32864,abd-A,Abd-B,Abl,abo,Ace,acj6,nAChRalpha1,...","{'ddbt': 0.43145103942503793, 'prd': 0.2957561177023954,..."
10,SVM_Final_Test,repo_glial_cells,50717.985908,225.206541,104.019568,-0.274643,,"7SLRNA:CR32864,Abd-B,Act57B,Gart,Adh,Adhr,al,amd,Amy-p,a...","{'djl': 0.035060921352132, 'Tengl3': 0.03152701489366444..."


In [34]:
# Keep only the rows for the th_positive_cells target and drop the bulky
# "Selected Features" column, producing a new frame in a single expression.
# NOTE(review): despite the `repo_` prefix this frame holds th_positive_cells
# rows; the name is kept because later cells read `repo_df`.
is_th_target = df["Target Variable"] == "th_positive_cells"
repo_df = df.loc[is_th_target].drop(columns=["Selected Features"])
repo_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Sorted Shap Values
0,ANN_Final_Test,th_positive_cells,2652.076971,51.49832,28.483265,-0.809668,,"{'Fbp1': 0.6175863081031662, 'gammaTry': 0.1568093957583..."
1,Linear_Regression_Final_Test,th_positive_cells,2042.892604,45.198369,25.071024,-0.393986,,"{'CG3819': 0.07599323966654625, 'Dfd': 0.055372169286853..."
2,Ridge_Final_Test,th_positive_cells,2030.433041,45.060327,24.990324,-0.385484,,"{'Fbp1': 0.03646496059075438, 'CG3819': 0.03262355306834..."
3,Lasso_Final_Test,th_positive_cells,1986.779792,44.573308,24.732177,-0.355697,,"{'snk': 0.5626258972541195, 'lncRNA:CR45188': 0.18243525..."
4,SVM_Final_Test,th_positive_cells,2038.254701,45.147034,25.050882,-0.390821,,"{'Fbp1': 0.032755632660914955, 'CG3819': 0.0297635502707..."
5,Random_Forest_Final_Test,th_positive_cells,2051.461866,45.293066,25.037196,-0.399833,,"{'wbl': 0.00345565416948028, 'lncRNA:CR45809': 0.0034556..."


### RMSE & SHAP Summary Statistics

In [22]:
# Parse each model's SHAP dictionary from its dict-literal string
parsed_shap = [ast.literal_eval(text) for text in repo_df['Sorted Shap Values']]

# Pool every SHAP value from every model; keep one RMSE per model
shap_values_list = [value for shap_dict in parsed_shap for value in shap_dict.values()]
rmse_list = list(repo_df['RMSE'])

# Series give us describe() for the summary statistics
shap_values_series = pd.Series(shap_values_list)
rmse_series = pd.Series(rmse_list)

shap_stats = shap_values_series.describe()
rmse_stats = rmse_series.describe()

# Each print emits the heading and the statistics on separate lines
print("Summary Statistics for SHAP Values:", shap_stats, sep="\n")
print("\nSummary Statistics for RMSE:", rmse_stats, sep="\n")

Summary Statistics for SHAP Values:
count    18000....
mean      0.000733
std       0.006868
min       0.000000
25%       0.000000
50%       0.000055
75%       0.000364
max       0.617586
dtype: float64

Summary Statistics for RMSE:
count     6.000000
mean     46.128404
std       2.642762
min      44.573308
25%      45.082004
50%      45.172702
75%      45.269392
max      51.498320
dtype: float64


### Ensemble Feature Voting Implementation

In [23]:
def _model_weight(r2):
    """Return a model's vote weight given its R².

    The weight is ``1 + R²`` floored at zero. R² has no lower bound, so
    without the floor a model with R² < -1 would get a negative weight.
    Because the final score multiplies two weighted sums, two such negative
    terms could combine into a spuriously positive importance.
    """
    return max(0.0, 1 + r2)


# feature -> sum over models of (SHAP value * model weight)
feature_importance = defaultdict(float)
# feature -> sum over models of (model weight / RMSE): a frequency count in
# which better-performing models count for more
feature_frequency = defaultdict(float)

for shap_text, rmse, r2 in zip(
    repo_df['Sorted Shap Values'], repo_df['RMSE'], repo_df['R2']
):
    # SHAP values are stored as a dict-literal string
    shap_values = ast.literal_eval(shap_text)
    weight_adjustment = _model_weight(r2)

    for feature, shap_value in shap_values.items():
        if shap_value == 0:
            # Only non-zero SHAP values cast a vote
            continue
        feature_importance[feature] += shap_value * weight_adjustment
        # Lower RMSE -> larger contribution to the frequency term
        feature_frequency[feature] += (1 / rmse) * weight_adjustment

# Final score: weighted SHAP sum scaled by the performance-weighted frequency
for feature in feature_importance:
    feature_importance[feature] *= feature_frequency[feature]

# Rank every voted feature, highest score first
sorted_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)

# Full ranking (not just the top 20); later cells slice it as needed
df_sorted_features = pd.DataFrame(sorted_features, columns=['Gene', 'Weight'])

# NOTE(review): the file name says "repo", but repo_df is filtered to
# th_positive_cells in this notebook — confirm this does not overwrite the
# repo_glial_cells ranking before relying on the file.
df_sorted_features.to_csv("data/results/n3000_repo_feature_importance.csv", index=False)

In [24]:
# Show the 20 highest-weighted genes (the frame is sorted by descending weight).
df_sorted_features.head(20)

Unnamed: 0,Gene,Weight
0,Fbp1,0.007971
1,snk,0.006641
2,CG3819,0.003415
3,lncRNA...,0.002181
4,CG34002,0.001728
...,...,...
15,CG18480,0.000827
16,lncRNA...,0.000798
17,asRNA:...,0.000781
18,Ir7c,0.000761


# Gene Set Analysis

In [25]:
# Fetch the names of the gene-set libraries available for fly (for PEA and GSEA)
gene_sets = m.gp.get_library_name(organism="fly")

# Leave this library out of the enrichment run.
# (Assuming gene_sets is a list, .remove raises ValueError if the name is absent.)
excluded_library = "Allele_Phenotypes_from_FlyBase_2017"
gene_sets.remove(excluded_library)

# Query set: the 20 top-ranked genes from the voting step
gene_list = df_sorted_features['Gene'].head(20).tolist()

In [26]:
# Run Enrichr over-representation analysis for the selected fly gene symbols
# against every remaining library; outdir=None is passed so no output directory is given.
enr = m.gp.enrichr(gene_list=gene_list, gene_sets=gene_sets, organism='fly', outdir=None)

# Show only the terms whose adjusted p-value is below 0.05
is_significant = enr.results['Adjusted P-value'] < 0.05
enr.results.loc[is_significant, ['Term', 'Genes', 'Adjusted P-value']]

Unnamed: 0,Term,Genes,Adjusted P-value
0,subeso...,Dfd,0.038815
1,embryo...,Dfd,0.038815
2,intest...,GATAe,0.038815
3,embryo...,Dfd,0.038815
4,segmen...,Dab,0.038815
...,...,...,...
1188,Hemocy...,Fbp1,0.006980
1189,Trypsin,snk;ga...,0.001768
1190,Hemocy...,Fbp1,0.006980
1191,PAX,prd,0.006980


In [27]:
# Significant terms (adjusted p < 0.05), reduced to the three columns of interest.
# NOTE(review): descending order puts the largest adjusted p-values (least
# significant terms) first — confirm that is the intended ordering.
significant = enr.results[enr.results['Adjusted P-value'] < 0.05]
results = significant[['Term', 'Genes', 'Adjusted P-value']].sort_values(
    by=['Adjusted P-value'], ascending=False
)

In [28]:
# Lift the row and column-width display limits so the whole table and the
# complete term names are shown. These are global pandas options, so they stay
# in effect for any cell run afterwards.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
results

Unnamed: 0,Term,Genes,Adjusted P-value
406,"maternal specification of dorsal/ventral axis, oocyte, germ-line encoded (GO:0007311)",snk,0.048905
407,enteroendocrine cell differentiation (GO:0035883),GATAe,0.048905
415,anterior/posterior pattern specification (GO:0009952),prd;Dfd,0.048905
429,positive regulation of transcription from RNA polymerase II promoter (GO:0045944),prd;Dfd;GATAe,0.048905
434,"positive regulation of transcription, DNA-templated (GO:0045893)",prd;Dfd;GATAe,0.048905
403,endoderm development (GO:0007492),GATAe,0.048905
404,glandular epithelial cell differentiation (GO:0002067),GATAe,0.048905
409,positive regulation of apoptotic process involved in development (GO:1904747),Dfd,0.048905
408,digestive system development (GO:0055123),GATAe,0.048905
410,periodic partitioning by pair rule gene (GO:0007366),prd,0.048905


# Pathway Analysis Prep

In [29]:
# Every ranked gene symbol, in descending weight order
symbols = df_sorted_features["Gene"].to_list()

# Table that maps gene symbols to FlyBase IDs
conversion_df = pd.read_csv("conversion_df.csv")

# Lookup built from the 'Gene Symbol' and 'FB ID' columns
conversion_dict = {
    symbol: fb_id
    for symbol, fb_id in zip(conversion_df['Gene Symbol'], conversion_df['FB ID'])
}

# Translate each symbol; symbols missing from the table map to None
flybase_ids = list(map(conversion_dict.get, symbols))

# Preview the IDs of the first 20 ranked genes
flybase_ids[:20]

['FBgn0000639',
 'FBgn0003450',
 'FBgn0036833',
 'FBgn0266698',
 'FBgn0054002',
 'FBgn0286834',
 'FBgn0000439',
 'FBgn0036220',
 'FBgn0003145',
 'FBgn0264915',
 'FBgn0000414',
 'FBgn0266436',
 'FBgn0038391',
 'FBgn0265982',
 'FBgn0034563',
 'FBgn0028518',
 'FBgn0047092',
 'FBgn0266628',
 'FBgn0029966',
 'FBgn0010359']

In [30]:
# Destination for the background gene list
output_file = "background_genes.txt"

# One FlyBase ID per line, covering every row of the conversion table
with open(output_file, 'w') as file:
    file.writelines(f"{fb_id}\n" for fb_id in conversion_df["FB ID"])

print(f"FlyBase IDs have been written to {output_file}")


FlyBase IDs have been written to background_genes.txt


In [31]:
# Destination for the selected-gene list
output_file = "th_selected_genes.txt"

# Symbols with no entry in the conversion table come back as None (and a blank
# 'FB ID' would be NaN). Writing them would put literal "None"/"nan" lines in
# the gene list, so keep only real IDs.
mapped_ids = [fb_id for fb_id in flybase_ids if pd.notna(fb_id)]

# One FlyBase ID per line
with open(output_file, 'w') as file:
    file.writelines(f"{fb_id}\n" for fb_id in mapped_ids)

print(f"FlyBase IDs have been written to {output_file}")

# Report dropped symbols so the omission is visible rather than silent
skipped = len(flybase_ids) - len(mapped_ids)
if skipped:
    print(f"Skipped {skipped} gene symbol(s) with no FlyBase ID")


FlyBase IDs have been written to repo_selected_genes.txt
