<a href="https://colab.research.google.com/github/reza610/psea/blob/main/Copy_of_PSEA_metrics_on_simulated_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# Pull in the raw data

# TODO: change this so that `resultsdf` is used only once, to add the actual labels and build `final_df`; all later code should then use `final_df`.

In [None]:
# Load the simulated PSEA score table (adjusted p-values included) straight
# from the Dowell-Lab psea repository; the first CSV column becomes the index.
url="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/sim_psea_scores_20241015-122448.adjpval.csv"
resultsdf = pd.read_csv(url, index_col=0)


In [None]:
# Name of the adjusted p-value column used throughout the notebook. The
# labeling, confusion-matrix and ROC functions read this module-level name
# directly, so re-run them after changing it. Keep exactly one line active.
# NOTE(review): presumably all four columns exist in the CSV; confirm.
#which_pvalue_column = "p_value_BenjaminiHochberg"
#which_pvalue_column = "p_value_bonf"
#which_pvalue_column = "p_value_BenjaminiYekutieli"
which_pvalue_column = "p_value_holm"


# Label the simulated comorbidities as TRUE or FALSE

## functions

In [None]:
def parse_simulated_binary_att(row):
  """Split a simulated attribute name into its gene name and simulation parameters.

  The string in row["binary_attribute"] is cut on underscores. The first four
  pieces, joined back together, form the gene name. Every piece after the
  first is then checked against a list of markers; the first marker contained
  in the piece decides which parameter that piece encodes.

  Returns:
    (genename, extracted_values), where extracted_values is a dict holding
    only the parameters that were actually found in the name.
  """
  pieces = row["binary_attribute"].split('_')
  genename = "_".join(pieces[:4])

  # (marker, output key, converter) -- tried in this order, first hit wins.
  rules = (
      ('Truesamplesize', 'samples_true',
       lambda piece: int(piece.replace('Truesamplesize', ''))),
      ('biassamplesize', 'samples_true_bias',
       lambda piece: int(piece.replace('biassamplesize', ''))),
      ('Zscorevaluebais', 'Zscore_valuebais',
       lambda piece: float(piece.replace('Zscorevaluebais', ''))),
      ('sigma', 'Zscore_valuebais_sigma',
       lambda piece: float(piece.replace('sigma', ''))),
      ('top', 'top_or_bottom', lambda piece: "top"),
      ('bottom', 'top_or_bottom', lambda piece: "bottom"),
      ('pba', 'percent_binary_attributes_thatarevaluebias',
       lambda piece: float(piece.replace('pba', ''))),
  )

  extracted_values = {}
  for piece in pieces[1:]:
    for marker, key, convert in rules:
      if marker in piece:
        extracted_values[key] = convert(piece)
        break

  return genename, extracted_values


def mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=0):
  """Attach the parsed simulation parameters and the ground-truth label to df.

  Each row's "binary_attribute" name is parsed with parse_simulated_binary_att.
  A row is labelled True only when its "value" equals the gene the attribute
  was simulated from AND strictly more than min_people_with_bias_transcription
  samples carry the bias ("samples_true_bias").

  Args:
    df: results table with at least "binary_attribute", "value" and the
      column named by the module-level which_pvalue_column.
    min_people_with_bias_transcription: a link counts as TRUE only if
      samples_true_bias is greater than this number.

  Returns:
    A new DataFrame: the columns of df, plus "genename", one column per parsed
    parameter, boolean "Actual_Label", and "one_minus_which_pvalue_column"
    (1 - adjusted p-value, so a larger score means more significant).

  The input frame is left untouched. The previous version wrote helper
  columns into it and built the result from the global `resultsdf` rather
  than from `df`.
  """
  # Parse every attribute name once: a list of (genename, parameter dict).
  parsed = [parse_simulated_binary_att(row) for _, row in df.iterrows()]
  genenames = [genename for genename, _ in parsed]
  details = pd.DataFrame([values for _, values in parsed], index=df.index)

  # Drop columns this function is about to (re)create, so calling it on a
  # frame that was already processed does not yield duplicate column names.
  recreated = {"genename", "other_dict"} | set(details.columns)
  base = df.drop(columns=[column for column in df.columns if column in recreated])
  base = base.assign(genename=genenames)
  final_df = pd.concat([base, details], axis=1)

  # True link: tested value is the simulated gene and enough samples are biased.
  final_df['Actual_Label'] = (
      (final_df['value'] == final_df['genename'])
      & (final_df['samples_true_bias'] > min_people_with_bias_transcription)
  )
  final_df["one_minus_which_pvalue_column"] = 1 - final_df[which_pvalue_column]
  return final_df

# functions for creating ROC curve and confusion matrix

In [None]:
 def confusion_matrix(df, cutoff=0.05, printconfusion=False):
   """Build a confusion table for one p-value cutoff.

   A row is predicted TRUE when its adjusted p-value (the column named by the
   module-level which_pvalue_column) is <= cutoff.

   Args:
     df: frame with the p-value column and an "Actual_Label" column. Labels
       may be booleans or the strings 'TRUE'/'FALSE'.
     cutoff: significance threshold for calling a link TRUE.
     printconfusion: when True, print a 2x2 summary. It is printed only for
       cutoff == 0.05, so a sweep over many cutoffs prints a single table.

   Returns:
     (cm, TPrate, FPrate). cm has one row per observed
     (Actual_Label, Predicted_Label) pair with its "count" and
     "Confusion_Category". A rate is 0 when its denominator is 0.

   The earlier version compared the boolean Actual_Label with the strings
   'TRUE'/'FALSE', so nothing ever matched and both rates were always 0.
   """
   confusiondf = df[[which_pvalue_column, "Actual_Label"]].copy()
   # Normalise labels to real booleans so bool and string inputs both work.
   confusiondf["Actual_Label"] = [
       label.strip().upper() == "TRUE" if isinstance(label, str) else bool(label)
       for label in confusiondf["Actual_Label"]
   ]
   confusiondf["Predicted_Label"] = np.where(confusiondf[which_pvalue_column] <= cutoff, "TRUE", "FALSE")
   cm = confusiondf.groupby(["Actual_Label", "Predicted_Label"]).size().reset_index(name="count")

   category_names = {
       (True, "TRUE"): 'True Positive',
       (True, "FALSE"): 'False Negative',
       (False, "TRUE"): 'False Positive',
       (False, "FALSE"): 'True Negative',
   }
   cm['Confusion_Category'] = [
       category_names.get((bool(actual), predicted), 'Unknown')
       for actual, predicted in zip(cm["Actual_Label"], cm["Predicted_Label"])
   ]

   # A category that never occurs has no row in cm, so default its count to 0.
   counts = cm.set_index('Confusion_Category')['count']
   TP = int(counts.get('True Positive', 0))
   FN = int(counts.get('False Negative', 0))
   FP = int(counts.get('False Positive', 0))
   TN = int(counts.get('True Negative', 0))

   TPrate = TP / (TP + FN) if TP + FN else 0
   FPrate = FP / (FP + TN) if FP + TN else 0

   if printconfusion and cutoff == 0.05:
     cmsummary = pd.DataFrame(
         [[TP, FN], [FP, TN]],
         index=["Actual_TRUE", "Actual_FALSE"],
         columns=["Predicted_TRUE", "Predicted_FALSE"],
     )
     print("cutoff", cutoff)
     print(cmsummary)
   return cm, TPrate, FPrate

def set_pval_cutoffs_create_ROC_curve(df):
  """Sweep p-value cutoffs from 0 to 1 in steps of 0.01 and collect the rates.

  confusion_matrix is evaluated once per cutoff; its true-positive and
  false-positive rates are gathered into a frame with the columns
  "TPrate", "FPrate" and "cutoff".
  """
  cutoffs = list(np.arange(0, 1.01, 0.01))
  # confusion_matrix returns (cm, TPrate, FPrate); only the two rates are kept.
  rate_pairs = [confusion_matrix(df, cutoff=threshold)[1:] for threshold in cutoffs]
  ROCdf = pd.DataFrame({
      "TPrate": [tp_rate for tp_rate, _ in rate_pairs],
      "FPrate": [fp_rate for _, fp_rate in rate_pairs],
      "cutoff": cutoffs,
  })
  return ROCdf

def create_ROC_curve(df):
  """Compute ROC curve points for df using 1 - adjusted p-value as the score.

  Args:
    df: frame with a boolean "Actual_Label" column and the p-value column
      named by the module-level which_pvalue_column.

  Returns:
    DataFrame with "TPrate", "FPrate" and "thresholds" as returned by
    sklearn.metrics.roc_curve.

  The score is kept in a local variable instead of being written back into
  df. Callers usually pass a filtered slice, and assigning a column to a
  slice triggers pandas' SettingWithCopyWarning and alters the caller's data.
  """
  # roc_curve expects larger scores for the positive class, hence 1 - p.
  scores = 1 - df[which_pvalue_column]
  fpr, tpr, thresholds = metrics.roc_curve(df["Actual_Label"], scores)
  ROCdf = pd.DataFrame({"TPrate": tpr, "FPrate": fpr, "thresholds": thresholds})
  return ROCdf

def plot_ROC_curve(ROCdf, graphtitle="ROCcurve"):
  """Draw TPrate against FPrate from ROCdf as a plotly line chart and show it.

  Both axes are pinned to the range [0, 1] and given descriptive titles.
  """
  (
      px.line(ROCdf, x="FPrate", y="TPrate", title=graphtitle)
      .update_layout(xaxis_range=[0, 1], yaxis_range=[0, 1])
      .update_xaxes(title_text="False Positive Rate")
      .update_yaxes(title_text="True Positive Rate")
      .show()
  )


def plot_ROCdf_single_set_parmaters(df, min_people_with_bias_transcription=0):
    """Plot one ROC curve for the rows of df with enough biased samples.

    Only rows whose samples_true_bias is strictly greater than
    min_people_with_bias_transcription are used for the curve.
    """
    enough_biased = df['samples_true_bias'] > min_people_with_bias_transcription
    plot_ROC_curve(create_ROC_curve(df[enough_biased]))


def get_unique_sample_sets(df):
  """
  List every distinct [samples_true, samples_true_bias] pair present in df.

  The pairs come out in the order groupby produces them, i.e. sorted by
  samples_true and then by samples_true_bias.
  """
  key_columns = ['samples_true', 'samples_true_bias']
  per_pair_counts = df.groupby(key_columns)['genename'].count().reset_index()
  distinct_pairs = per_pair_counts[key_columns].drop_duplicates()
  return distinct_pairs.to_numpy().tolist()



In [None]:
# Parse every simulated attribute name and attach the ground-truth
# Actual_Label column (default threshold: more than 0 biased samples).
final_df = mark_actul_TRUE_FALSE_links(resultsdf)

In [None]:
# Quick look at the ground-truth labels that were just assigned.
final_df["Actual_Label"]

Unnamed: 0,Actual_Label
49999,True
39711,True
39700,True
39599,True
39533,True
...,...
30102,False
4175,False
20197,False
35,False


In [None]:
# Histogram of the "pval" column across all rows of final_df.
fig = px.histogram(final_df, x="pval")
fig.show()

In [None]:
# final_df_filtered is only created by a later cell, so on a fresh kernel
# (Restart & Run All) this cell raised NameError. Plot from final_df instead;
# the function itself keeps only rows with samples_true_bias > 10.
plot_ROCdf_single_set_parmaters(final_df, min_people_with_bias_transcription=10)


In [None]:
def paramater_sets_avalable(df,min_people_with_bias_transcription=0):
  """Print the simulation parameter values that occur in df.

  Prints the list of [n_comorbid, n_comorbid_and_bias] pairs, the sorted
  Z-score bias values and the sorted bias directions, so a valid combination
  can be chosen for draw_ROCcurve_set_of_parameters. Returns None.

  Args:
    df: raw results table accepted by mark_actul_TRUE_FALSE_links.
    min_people_with_bias_transcription: forwarded to
      mark_actul_TRUE_FALSE_links.
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)
  unique_sample_sets = get_unique_sample_sets(final_df)
  # .tolist() turns numpy scalars into plain Python values, so the printed
  # lists read "[1.0, 1.5]" whatever the numpy version's scalar repr is.
  Zscore_valuesbaises = sorted(final_df["Zscore_valuebais"].unique().tolist())
  toporbottom = sorted(final_df["top_or_bottom"].unique().tolist())
  print ("[[n_comorbid,n_comorbid_and_bias],[n_comorbid,n_comorbid_and_bias]]")
  print (unique_sample_sets)
  print("Zscore_valuesbaises", Zscore_valuesbaises)
  print("toporbottom", toporbottom)

def draw_ROCcurve_set_of_parameters(df, min_people_with_bias_transcription=0, n_comorbid=141, n_comorbid_bias=113, Zscore_valuebais=1.0, direction="top"):
  """Plot the ROC curve for one combination of simulation parameters.

  Args:
    df: raw results table accepted by mark_actul_TRUE_FALSE_links.
    min_people_with_bias_transcription: forwarded to
      mark_actul_TRUE_FALSE_links.
    n_comorbid: value of "samples_true" to select.
    n_comorbid_bias: value of "samples_true_bias" to select.
    Zscore_valuebais: value of "Zscore_valuebais" to select.
    direction: "top" or "bottom" ("top_or_bottom" column). "top" is shown as
      "low expression" in the title, anything else as "high expression".

  When no row matches the combination, a short message is printed and
  nothing is plotted (the function used to return without any output).
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)
  selected = (
      (final_df["samples_true"] == n_comorbid)
      & (final_df["samples_true_bias"] == n_comorbid_bias)
      & (final_df["Zscore_valuebais"] == Zscore_valuebais)
      & (final_df["top_or_bottom"] == direction)
  )
  # Copy so helpers can never write into a slice of final_df.
  final_df_subset = final_df[selected].copy()
  if final_df_subset.empty:
    print(f"No rows match n_comorbid={n_comorbid}, n_comorbid_bias={n_comorbid_bias}, Zscore_valuebais={Zscore_valuebais}, direction={direction}; nothing to plot")
    return
  direction_label = "low expression" if direction == "top" else "high expression"
  title = f"n_comorbid: {n_comorbid}, n_comorbid_bias: {n_comorbid_bias}, Zscore_valuebais: {Zscore_valuebais}, bais_direction: {direction_label}"
  print(title)
  ROCdf = create_ROC_curve(final_df_subset)
  plot_ROC_curve(ROCdf, graphtitle=title)

def draw_ROCcurve_to_evaluate_simulation_parameters_neccesary(df, min_people_with_bias_transcription=0):
  """Plot one ROC curve per simulated parameter combination present in df.

  A combination is (samples_true, samples_true_bias, Zscore_valuebais,
  top_or_bottom). Pairs with samples_true_bias == 0 are skipped, as are
  combinations with no rows.

  The (samples_true, samples_true_bias) pairs are iterated directly. The
  earlier version looped over the cross product of two lists that both
  contained repeated values, so each real pair was plotted several times and
  thousands of impossible pairs were filtered for nothing.

  Args:
    df: raw results table accepted by mark_actul_TRUE_FALSE_links.
    min_people_with_bias_transcription: forwarded to
      mark_actul_TRUE_FALSE_links.
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)
  sample_pairs = sorted({
      (n_comorbid, n_comorbid_bias)
      for n_comorbid, n_comorbid_bias in get_unique_sample_sets(final_df)
      if n_comorbid_bias != 0
  })
  Zscore_valuesbaises = sorted(final_df["Zscore_valuebais"].dropna().unique().tolist())
  toporbottom = sorted(final_df["top_or_bottom"].dropna().unique().tolist())
  for n_comorbid, n_comorbid_bias in sample_pairs:
    # Hoisted: this part of the filter is the same for every z-score/direction.
    pair_mask = (final_df["samples_true"] == n_comorbid) & (final_df["samples_true_bias"] == n_comorbid_bias)
    for Zscore_valuebais in Zscore_valuesbaises:
      for direction in toporbottom:
        final_df_subset = final_df[
            pair_mask
            & (final_df["Zscore_valuebais"] == Zscore_valuebais)
            & (final_df["top_or_bottom"] == direction)
        ].copy()
        if final_df_subset.empty:
          continue
        direction_label = "low expression" if direction == "top" else "high expression"
        title = f"n_comorbid: {n_comorbid}, n_comorbid_bias: {n_comorbid_bias}, Zscore_valuebais: {Zscore_valuebais}, bais_direction: {direction_label}"
        print(title)
        ROCdf = create_ROC_curve(final_df_subset)
        plot_ROC_curve(ROCdf, graphtitle=title)

In [None]:

def AUC_ROCcurve_to_evaluate_simulation_parameters_neccesary(df, min_people_with_bias_transcription=0):
  """Compute the ROC AUC for every simulated parameter combination in df.

  A combination is (samples_true, samples_true_bias, Zscore_valuebais,
  top_or_bottom). Pairs with samples_true_bias == 0 are skipped, as are
  combinations with no rows.

  Args:
    df: raw results table accepted by mark_actul_TRUE_FALSE_links.
    min_people_with_bias_transcription: forwarded to
      mark_actul_TRUE_FALSE_links.

  Returns:
    DataFrame with one row per combination and the columns "samples_true",
    "samples_true_bias", "Zscore_valuebais", "top_or_bottom" (stored as
    "low expression" for "top", otherwise "high expression") and "AUC".
    AUC is NaN when the subset holds a single class, because the score is
    undefined there and roc_auc_score would raise.

  The (samples_true, samples_true_bias) pairs are iterated directly. The
  earlier version looped over the cross product of two lists with repeated
  values, which produced duplicate rows in the result.
  """
  final_df = mark_actul_TRUE_FALSE_links(df, min_people_with_bias_transcription=min_people_with_bias_transcription)
  sample_pairs = sorted({
      (n_comorbid, n_comorbid_bias)
      for n_comorbid, n_comorbid_bias in get_unique_sample_sets(final_df)
      if n_comorbid_bias != 0
  })
  Zscore_valuesbaises = sorted(final_df["Zscore_valuebais"].dropna().unique().tolist())
  toporbottom = sorted(final_df["top_or_bottom"].dropna().unique().tolist())
  lines = []
  for n_comorbid, n_comorbid_bias in sample_pairs:
    # Hoisted: this part of the filter is the same for every z-score/direction.
    pair_mask = (final_df["samples_true"] == n_comorbid) & (final_df["samples_true_bias"] == n_comorbid_bias)
    for Zscore_valuebais in Zscore_valuesbaises:
      for direction in toporbottom:
        final_df_subset = final_df[
            pair_mask
            & (final_df["Zscore_valuebais"] == Zscore_valuebais)
            & (final_df["top_or_bottom"] == direction)
        ]
        if final_df_subset.empty:
          continue
        direction_label = "low expression" if direction == "top" else "high expression"
        if final_df_subset["Actual_Label"].nunique() < 2:
          auc_score = float("nan")
        else:
          auc_score = roc_auc_score(final_df_subset["Actual_Label"], final_df_subset["one_minus_which_pvalue_column"])
        lines.append([n_comorbid, n_comorbid_bias, Zscore_valuebais, direction_label, auc_score])
  aucdf = pd.DataFrame(lines, columns=["samples_true", "samples_true_bias", "Zscore_valuebais", "top_or_bottom", "AUC"])
  return aucdf

In [None]:
# List the simulated parameter values available in the data, so that a valid
# combination can be chosen for the single-curve plot.
paramater_sets_avalable(resultsdf)

[[n_comorbid,n_comorbid_and_bias],[n_comorbid,n_comorbid_and_bias]]
[[1, 0], [1, 1], [4, 0], [4, 1], [4, 2], [4, 3], [6, 0], [6, 1], [6, 2], [6, 4], [6, 5], [14, 0], [14, 3], [14, 6], [14, 8], [14, 11], [19, 0], [19, 4], [19, 8], [19, 11], [19, 15], [37, 0], [37, 7], [37, 15], [37, 22], [37, 30], [38, 0], [38, 8], [38, 15], [38, 23], [38, 30], [43, 0], [43, 9], [43, 17], [43, 26], [43, 34], [57, 0], [57, 11], [57, 23], [57, 34], [57, 46], [141, 0], [141, 28], [141, 56], [141, 85], [141, 113]]
Zscore_valuesbaises [1.0, 1.5, 2.0, 2.5, 3.0]
toporbottom ['bottom', 'top']


In [None]:
# Pick one parameter combination and draw its ROC curve. The values must be a
# combination that actually occurs in the data (see the listing printed by
# paramater_sets_avalable); otherwise the filter inside the function matches
# no rows and no curve is drawn.
n_comorbid=37
n_comorbid_bias=15
Zscore_valuebais=1.0
min_people_with_bias_transcription=0
direction="top" # Top means the people with the lowest expression of this gene are bias for the comorbidity
draw_ROCcurve_set_of_parameters(resultsdf, min_people_with_bias_transcription=min_people_with_bias_transcription, n_comorbid=n_comorbid, n_comorbid_bias=n_comorbid_bias, Zscore_valuebais=Zscore_valuebais, direction=direction)


In [None]:
# One AUC value per simulated parameter combination.
aucdf = AUC_ROCcurve_to_evaluate_simulation_parameters_neccesary(resultsdf)

In [None]:
#draw_ROCcurve_to_evaluate_simulation_parameters_neccesary(resultsdf)

In [None]:
# Show the AUC table; requires the cell that assigns aucdf to have been run.
aucdf

NameError: name 'aucdf' is not defined

In [None]:
# AUC for every (samples_true_bias, samples_true) pair, with one facet row per
# Z-score bias value and one facet column per bias direction.
fig = px.scatter(aucdf, x="samples_true_bias", y="samples_true", color = "AUC", facet_row="Zscore_valuebais", facet_col="top_or_bottom",color_continuous_scale="Bluered_r")
# Facet labels read "column=value"; keep only the text after the "=".
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))
fig.show()

This graph tells me that if fewer than ten people are biased for a gene's expression, we probably can't detect the gene-comorbidity link using PSEA. But even with only 11 comorbid people biased for the expression of a gene, if those 11 people make up more than 50% of the people with the comorbidity, we can find the link!

I think I will recommend to users that between 5% and 95% of the people should have a given comorbidity, to make sure PSEA can catch the link (assuming they have ~250 individuals, as I do in the simulation).

In [None]:
# Some adjusted p-values are exactly 0, and np.log(0) gives -inf together with
# a "divide by zero encountered in log" RuntimeWarning. Floor the p-values at
# the smallest positive normal float so the log stays finite and ordered.
final_df["log_"+which_pvalue_column]=np.log(final_df[which_pvalue_column].clip(lower=np.finfo(float).tiny))




divide by zero encountered in log



In [None]:
# Keep only rows where the number of biased samples lies strictly between 5%
# and 95% of the simulated cohort.
total_people_in_simulation=254
min_samples_true_bias = 0.05*total_people_in_simulation
max_samples_true_bias = 0.95*total_people_in_simulation

print ("min_samples_true_bias", min_samples_true_bias)
# Fixed: this line referenced the undefined name max_samples_true (NameError).
print ("max_samples_true_bias", max_samples_true_bias)
final_df_morethan5lessthan95 = final_df[(final_df["samples_true_bias"] > min_samples_true_bias) & (final_df["samples_true_bias"] < max_samples_true_bias)]
final_df_morethan5lessthan95

min_samples_true_bias 12.700000000000001


NameError: name 'max_samples_true' is not defined

In [None]:
# Bounds on the number of biased samples: 5% and 95% of the simulated cohort.
total_people_in_simulation = 254
min_samples_true_bias, max_samples_true_bias = (
    share * total_people_in_simulation for share in (0.05, 0.95)
)

print("min_samples_true_bias:", min_samples_true_bias)
print("max_samples_true_bias:", max_samples_true_bias)

# Without the parsed 'samples_true_bias' column there is nothing to filter on.
if 'samples_true_bias' not in final_df.columns:
    print("Error: Column 'samples_true_bias' not found in the DataFrame.")
else:
    # Keep rows with a known bias count strictly inside the two bounds.
    final_df_filtered = final_df.loc[
        lambda frame: frame["samples_true_bias"].notna()
        & frame["samples_true_bias"].gt(min_samples_true_bias)
        & frame["samples_true_bias"].lt(max_samples_true_bias)
    ]
    # Show the filtered rows.
    print(final_df_filtered)


min_samples_true_bias: 12.700000000000001
max_samples_true_bias: 241.29999999999998
                                        binary_attribute  \
49999  simulated_based_on_ENSG00000279648_Truesamples...   
39711  simulated_based_on_ENSG00000156273_Truesamples...   
39700  simulated_based_on_ENSG00000156265_Truesamples...   
39599  simulated_based_on_ENSG00000279648_Truesamples...   
39533  simulated_based_on_ENSG00000223692_Truesamples...   
...                                                  ...   
25163  simulated_based_on_ENSG00000264002_Truesamples...   
19823  simulated_based_on_ENSG00000170262_Truesamples...   
35220  simulated_based_on_ENSG00000170262_Truesamples...   
43147  simulated_based_on_ENSG00000237569_Truesamples...   
42891  simulated_based_on_ENSG00000279648_Truesamples...   

                                    value   runpsea           NES      pval  \
49999  simulated_based_on_ENSG00000279648  included  1.670112e+01  0.000000   
39711  simulated_based_on_ENSG0000015

In [None]:
# ROC curve for the 5%-95% filtered rows. A minimum of 0 makes the function
# keep rows with samples_true_bias > 0, which the filter already guarantees.
plot_ROCdf_single_set_parmaters(final_df_filtered, min_people_with_bias_transcription=0)

TypeError: plot_ROCdf_single_set_parmaters() got an unexpected keyword argument 'min_people_with_bias_transcription'