<a href="https://colab.research.google.com/github/Dowell-Lab/psea/blob/main/notebook_example/one_comorbid_many_gene.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages we will use

In [48]:
import pandas as pd
import plotly.express as px
from scipy.stats import zscore


# read in the files we will look at

In [2]:
# PSEA score table (the file name indicates it carries adjusted p-values).
# The functions below read its "value", "binary_attribute", "NES" and
# adjusted-p-value columns. index_col=0 makes the first CSV column the index.
url="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/psea_scores_20240923-152820.adjpval.csv"
df = pd.read_csv(url, index_col=0)
# Gene table; it provides the "gene_id" column that is copied into "value"
# right after loading so it can be merged with the score table.
url2="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/genes.csv"
genedf = pd.read_csv(url2, index_col=0)

In [3]:
# Expose the gene id under the column name "value": the helper functions
# merge the score table with genedf using on="value".
genedf["value"]=genedf["gene_id"]

In [4]:
# This brings in the medical disorders for all the individuals with Trisomy 21
# in the Human Trisome Project.
# The helpers below expect a "Patient" column plus one column per disorder
# (compared against 1 and 0 — presumably presence/absence flags; confirm).
# NOTE: `url` is reused here and overwrites the score-table URL assigned earlier.
url = "https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/comorbid_file.csv"
comorbid_df = pd.read_csv(url, index_col=0)



In [5]:
# This brings in the normalized counts for all the individuals with Trisomy 21
# in the Human Trisome Project. These are not the real count data but are
# similar to real count data.
# The helpers below select gene-id columns and a "Patient" column from this table.
#df=pd.read_csv('/content/drive/MyDrive/normcounts.csv')
url_counts="https://media.githubusercontent.com/media/Dowell-Lab/psea/refs/heads/main/testdata/value_expression_large.csv"
gene_exp_df=pd.read_csv(url_counts, index_col=0)



# Create a data frame of the genes linked to a comorbidity

In [98]:
def acomorbidanditsgenes(acomorbid, gene_exp_df, comorbid_df, df, genedf, cuttoff=0.1, adjpvalcol="p_value_BenjaminiHochberg"):
    """Join one comorbidity's status with the expression of its significant genes.

    Args:
        acomorbid: Name of the comorbidity; must be a value in
            ``df["binary_attribute"]`` and a column of ``comorbid_df``.
        gene_exp_df: Expression table with a "Patient" column and one column
            per gene id.
        comorbid_df: Table with a "Patient" column and one column per
            comorbidity.
        df: Score table with "value" (gene id), "binary_attribute", and the
            column named by ``adjpvalcol``.
        genedf: Gene table with a "value" column; only genes present in it
            are kept (inner merge).
        cuttoff: Keep rows whose adjusted p-value is strictly below this.
        adjpvalcol: Name of the adjusted p-value column in ``df``.

    Returns:
        DataFrame with columns ``[acomorbid, "Patient", <gene ids sorted>]``,
        one row per patient found in both ``comorbid_df`` and ``gene_exp_df``.

    Raises:
        KeyError: If a required column or a selected gene id is missing from
            the corresponding table.
    """
    # Rows with any missing value are discarded before thresholding.
    scored = df.dropna()
    significant = scored[scored[adjpvalcol] < cuttoff]
    annotated = significant.merge(genedf, how="inner", on="value")

    # Filter on the argument. This used to read a notebook-level variable
    # (`acondition`), which only worked when that global happened to exist
    # and to equal the requested comorbidity.
    for_comorbid = annotated[annotated["binary_attribute"] == acomorbid]

    # Column order of the result is the alphabetical order of the gene ids.
    gene_ids = sorted(for_comorbid["value"].to_list())

    patient_expression = gene_exp_df[gene_ids + ["Patient"]]
    patient_status = comorbid_df[[acomorbid, "Patient"]]
    return patient_status.merge(patient_expression, on="Patient")


def splitdf_high_low_genes(thisdf, acondition, df, cuttoff=0.1, adjpvalcol="p_value_BenjaminiHochberg", genedf=None):
    """Split an expression table into negative-NES and positive-NES gene columns.

    Args:
        thisdf: Table with an ``acondition`` column, a "Patient" column and
            one column per gene id.
        acondition: Value of ``df["binary_attribute"]`` to select; also the
            name of the status column in ``thisdf``.
        df: Score table with "value" (gene id), "binary_attribute", "NES" and
            the column named by ``adjpvalcol``.
        cuttoff: Keep rows whose adjusted p-value is strictly below this.
        adjpvalcol: Name of the adjusted p-value column in ``df``.
        genedf: Gene table with a "value" column; only genes present in it
            are kept. When omitted, the module-level ``genedf`` is used, which
            is what this function always did before the parameter existed.

    Returns:
        ``(high_merge_df, low_merge_df)``: both have columns
        ``[acondition, "Patient", ...]`` followed by the genes with NES < 0
        (first frame) or NES > 0 (second frame), ordered by ascending NES.
        Genes with NES exactly 0 appear in neither frame.

    Raises:
        NameError: If ``genedf`` is not passed and no module-level ``genedf``
            exists.
        KeyError: If a required column or selected gene id is missing.
    """
    if genedf is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            genedf = globals()["genedf"]
        except KeyError:
            raise NameError(
                "name 'genedf' is not defined; pass genedf=... explicitly"
            ) from None

    # Rows with any missing value are discarded before thresholding.
    scored = df.dropna()
    significant = scored[scored[adjpvalcol] < cuttoff]
    annotated = significant.merge(genedf, how="inner", on="value")
    for_condition = annotated[annotated["binary_attribute"] == acondition]
    for_condition = for_condition.sort_values(["NES"])

    # NOTE(review): "high" is NES < 0 and "low" is NES > 0 — confirm this
    # matches the sign convention of the score table.
    nes = for_condition["NES"]
    highgenes = for_condition.loc[nes < 0, "value"].to_list()
    lowgenes = for_condition.loc[nes > 0, "value"].to_list()

    id_columns = [acondition, "Patient"]
    high_merge_df = thisdf[id_columns + highgenes]
    low_merge_df = thisdf[id_columns + lowgenes]
    return high_merge_df, low_merge_df

def splitpeoplewithandwithout(acomorbid, adfofexpressionandcomorbid):
    """Separate patients with the comorbidity (== 1) from those without (== 0).

    Both returned frames are independent copies that keep the original row
    index and contain only the remaining columns, i.e. everything except
    "Patient" and the ``acomorbid`` status column. Rows whose status is
    neither 1 nor 0 end up in neither frame.

    Returns:
        ``(with_comorbidity, without_comorbidity)``
    """
    status = adfofexpressionandcomorbid[acomorbid]
    values_only = adfofexpressionandcomorbid.drop(columns=["Patient", acomorbid])
    has_it = values_only[status == 1].copy()
    lacks_it = values_only[status == 0].copy()
    return has_it, lacks_it

def zscoregenes(acomorbid, genexpconditiondf):
    """Z-score every gene column, leaving the identifier columns untouched.

    Each column other than "Patient" and ``acomorbid`` is passed through
    ``scipy.stats.zscore`` independently (column-wise, across patients).

    Returns:
        A new DataFrame with columns ``["Patient", acomorbid, <genes...>]``,
        genes in their original order; the input frame is not modified.
    """
    id_columns = ["Patient", acomorbid]
    expression = genexpconditiondf.drop(columns=id_columns)
    scaled = expression.apply(zscore)
    # Re-attach the identifier columns, then move them to the front.
    for column in id_columns:
        scaled[column] = genexpconditiondf[column]
    return scaled[id_columns + list(expression.columns)]


# Pick a binary_attribute to look at

In [185]:
# Attribute to inspect. Swap in one of the commented alternatives to look at
# a different one.
# acondition = "constipation_disorder"
# acondition = "strabismus"
acondition = "eustachian_tube_disorder"

# Expression of the attribute's significant genes -> per-gene z-scores ->
# split into the negative-NES ("high") and positive-NES ("low") gene sets.
genexpconditiondf = acomorbidanditsgenes(acondition, gene_exp_df, comorbid_df, df, genedf)
zscore_genexpconditiondf = zscoregenes(acondition, genexpconditiondf)
(zscore_genexpconditiondf_high,
 zscore_genexpconditiondf_low) = splitdf_high_low_genes(zscore_genexpconditiondf, acondition, df)

# Work with the "high" set, ordering patients by the first gene column.
dontuse_columns = ["Patient", acondition]
gene_names = [col for col in zscore_genexpconditiondf_high.columns if col not in dontuse_columns]
lookatdf = zscore_genexpconditiondf_high.sort_values(gene_names[0])



In [186]:
from sklearn import cluster


#I should really be scaling using scaled_df = StandardScaler().fit_transform(df)
#But that would take a while, so I'm using the z-scored one...
#The problem is that the genes with the highest variance then drive the order

#data = lookatdf.drop(columns = ["Patient", acondition])
#k_means = cluster.KMeans(n_clusters=5, max_iter=50, random_state=1)
#k_means.fit(data)
#labels = k_means.labels_
#whichcluster = pd.DataFrame(labels, index=lookatdf.Patient, columns=['Cluster_ID'])
#whichcluster.reset_index(inplace=True)
#lookatdf = lookatdf.merge(whichcluster, on="Patient")
#lookatdf = lookatdf.sort_values(["Cluster_ID"])
#lookatdf = lookatdf.drop(columns=["Cluster_ID"])
# Patients with (wco) and without (woco) the attribute, renumbered 0..n-1 so
# each frame has a clean positional index.
wco, woco = (
    part.reset_index(drop=True)
    for part in splitpeoplewithandwithout(acondition, lookatdf)
)

In [187]:

# Heatmap of z-scored expression for patients WITH the attribute.
# The colour range is pinned to [-6, 6] so the two heatmaps are comparable.
# The limits are passed to px.imshow itself: for a 2-D table px.imshow colours
# the heatmap through a shared layout color axis, and trace-level zmin/zmax set
# afterwards with update_traces are not applied to such a trace.
fig = px.imshow(wco, zmin=-6, zmax=6)
fig.show()

In [188]:
# Heatmap of z-scored expression for patients WITHOUT the attribute, ordered
# by the first gene column.
woco = woco.sort_values(gene_names[0])
# The colour range is pinned to [-6, 6] so the two heatmaps are comparable.
# The limits are passed to px.imshow itself: for a 2-D table px.imshow colours
# the heatmap through a shared layout color axis, and trace-level zmin/zmax set
# afterwards with update_traces are not applied to such a trace.
fig = px.imshow(woco, zmin=-6, zmax=6)
fig.show()