In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc


import os
import plotly.express as px
import seaborn as sns
import math
import scipy
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import nan_euclidean_distances 
import itertools
from itertools import tee
from itertools import permutations


import scipy as sp
from scipy import stats
import researchpy as rp
import scipy.stats as stat

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [45]:
parent_dir = r'C:\Users\brsha\Thesis\Dynamic13AllSeed1001 - Copy\Dynamic13AllSeed1001\Tissues'
os.chdir(parent_dir)
number_of_tissues = 13
tissues = ['Muscle - Skeletal',
 'Whole Blood',
 'Skin - Sun Exposed (Lower leg)',
 'Skin - Not Sun Exposed (Suprapubic)',
 'Adipose - Subcutaneous',
 'Thyroid',
 'Artery - Tibial',
 'Nerve - Tibial',
 'Lung',
 'Brain - Cerebellum',
 'Heart - Atrial Appendage',
 'Brain - Cortex',
 'Adipose - Visceral (Omentum)']

        
def read_matrix(current_tissue): 
    df = pd.read_csv(f'{current_tissue}/sk_residual_matrix_w_const.csv')
    df.rename(columns = {'Unnamed: 0': 'Name'},inplace=True)
    df.set_index("Name", inplace = True)
    df = df.T
    df.columns = df.columns.str.split('.', n=1).str[0]
    df.index = df.index.str.split("-", n=0).str[:2].str.join(',').str.replace(',', '-')
    
    phenos= pd.read_csv(f"{current_tissue}/sk_samples_as_dummies.csv")
    phenos.set_index("SAMPID", inplace = True)
    phenos.index = phenos.index.str.split("-", n=0).str[:2].str.join(',').str.replace(',', '-')

    sampels_old = phenos[phenos['AGE'] >= 60]
    sampels_young = phenos[phenos['AGE'] < 60]
    
    df_old = df[df.index.isin(sampels_old.index)]
    df_young = df[df.index.isin(sampels_young.index)]
    
    print("Shape before filtering:")
    print(df_old.shape)
    print(df_young.shape)

    # ---------- Get the 5000 most varying genes from each dataset ------------
    top_genes_old = df_old.var().nlargest(6000).index
    top_genes_young = df_young.var().nlargest(6000).index

    print(f"top_genes_old_len: {len(top_genes_old)}")
    print(f"top_genes_young_len: {len(top_genes_young)}")
    
    # Find the union of the top genes from both datasets
    all_top_genes = list(set(top_genes_old).union(set(top_genes_young)))
    print(f"all_top_genes_len_union: {len(all_top_genes)}")
    all_top_genes = sorted(all_top_genes, reverse=True)[:5000]

    # Filter the datasets to include only the genes in the union
    df_old = df_old.loc[:, all_top_genes]
    df_young = df_young.loc[:, all_top_genes]
    
    output_dir = r"cross tissue network"
    df.to_csv(f"{output_dir}\{current_tissue}.csv")
    df_old.to_csv(f"{output_dir}\{current_tissue}_old.csv")
    df_young.to_csv(f"{output_dir}\{current_tissue}_young.csv")
    
    print("Final shape:")
    print(df.shape)
    print(df_old.shape)
    print(df_young.shape)
 
    
    return df, df_old, df_young


         

In [46]:
df_adipose_s, df_adipose_s_old, df_adipose_s_young = read_matrix('Adipose - Subcutaneous')
df_adipose_s_old.head()

Shape before filtering:
(76, 14344)
(96, 14344)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6508
Final shape:
(172, 14344)
(76, 5000)
(96, 5000)


Name,ENSG00000284413,ENSG00000284308,ENSG00000283992,ENSG00000283787,ENSG00000283632,ENSG00000282608,ENSG00000281991,ENSG00000280789,ENSG00000280670,ENSG00000280165,...,ENSG00000112936,ENSG00000112902,ENSG00000112837,ENSG00000112799,ENSG00000112796,ENSG00000112782,ENSG00000112769,ENSG00000112759,ENSG00000112715,ENSG00000112667
GTEX-1128S,-0.110103,-0.757219,-0.067958,-1.274352,-0.999052,0.340311,0.026438,-0.800089,0.573947,-0.473774,...,-0.194255,-0.656525,-0.785076,0.158092,-0.237533,-0.201268,-0.013963,0.285471,1.016602,-0.272135
GTEX-11EMC,0.044991,-0.027819,-0.562634,-0.255529,-0.553213,0.437061,0.163,-0.383047,0.12178,-0.26305,...,0.321365,-1.116053,-0.30682,-0.353837,-0.573389,-1.257124,-0.172392,0.277888,0.608995,-0.188549
GTEX-11GS4,0.047742,-0.286519,-0.192671,1.51969,-1.16622,-0.198085,0.162604,-0.054714,-1.722425,-1.011377,...,-0.267505,0.565383,-0.114642,1.005449,0.155961,-0.133287,0.361736,0.139483,-1.327722,0.149416
GTEX-11GSO,-0.524808,-0.059677,-0.922296,0.198466,-0.387238,1.315391,0.12943,-0.251479,-0.441783,-0.615459,...,0.762338,-0.061897,-0.589915,1.702784,-0.308173,-1.242456,0.334339,-0.142582,-0.438857,0.472824
GTEX-11OF3,-0.123673,0.772489,-0.225746,0.657082,0.864744,-0.7464,-0.133567,1.214273,0.942595,-0.339141,...,-0.987368,0.186162,0.558836,-0.148415,0.208471,1.719889,-0.21493,-0.05979,-0.38434,0.657523


In [47]:
df_muscle, df_muscle_old, df_muscle_young = read_matrix('Muscle - Skeletal')
df_muscle_old.head()

Shape before filtering:
(102, 12856)
(124, 12856)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6515
Final shape:
(226, 12856)
(102, 5000)
(124, 5000)


Name,ENSG00000283992,ENSG00000283787,ENSG00000283632,ENSG00000281991,ENSG00000280789,ENSG00000280670,ENSG00000278619,ENSG00000278615,ENSG00000278540,ENSG00000278535,...,ENSG00000111450,ENSG00000111432,ENSG00000111424,ENSG00000111404,ENSG00000111371,ENSG00000111348,ENSG00000111341,ENSG00000111335,ENSG00000111331,ENSG00000111328
GTEX-11EMC,0.270358,-1.300175,-0.10788,-0.239985,-0.56819,-0.264617,0.634262,0.665703,0.796564,0.016455,...,-0.32031,0.09752,0.897087,0.638693,2.789752,-0.062796,0.134244,-0.33631,-0.622577,-0.287855
GTEX-11GS4,0.042915,0.806677,-0.029612,0.020755,0.350695,-0.697681,0.464386,0.375897,-0.174915,0.401715,...,-0.403587,0.14315,-0.353073,-1.110382,-1.212798,0.084542,-1.703158,-0.059055,-0.083984,0.406307
GTEX-11GSO,-0.247363,-1.606877,-0.21552,-0.03647,-0.653207,-0.459352,-0.633183,0.319299,0.414465,0.068593,...,0.31269,-0.097579,0.022754,-0.315186,0.517609,-0.235479,-0.09817,-0.304415,-0.347427,0.431203
GTEX-11GSP,0.205789,0.903131,0.246273,0.163455,0.028226,0.440027,-0.720481,-0.302913,-0.279383,0.173491,...,0.201669,-0.464461,-0.107184,0.486947,-1.136001,0.194635,0.228225,-0.063139,0.011113,-0.305501
GTEX-11NV4,0.035356,1.514096,-0.599393,0.28479,0.349976,0.870116,-0.506867,0.011566,-0.579278,-0.17588,...,-0.26969,-0.576757,-0.208687,-0.724102,-2.10047,0.259556,-0.552251,0.115028,-0.049458,0.023966


In [48]:
df_brain_cortex, df_brain_cortex_old, df_brain_cortex_young= read_matrix('Brain - Cortex')
df_brain_cortex_old.head()

Shape before filtering:
(55, 14642)
(71, 14642)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6619
Final shape:
(126, 14642)
(55, 5000)
(71, 5000)


Name,ENSG00000284413,ENSG00000284308,ENSG00000283992,ENSG00000282608,ENSG00000281991,ENSG00000281406,ENSG00000280789,ENSG00000280670,ENSG00000280165,ENSG00000278615,...,ENSG00000115541,ENSG00000115523,ENSG00000115507,ENSG00000115468,ENSG00000115461,ENSG00000115457,ENSG00000115423,ENSG00000115419,ENSG00000115415,ENSG00000115414
GTEX-11DXY,-0.186512,-0.059531,0.953474,0.505226,-0.124623,0.0595,-0.040397,0.123433,0.160834,-0.040267,...,0.079499,-0.265127,0.072823,-0.284288,0.41636,0.836275,0.037315,0.627128,0.296404,0.278403
GTEX-11EMC,0.005669,-0.051326,-0.496006,-0.735277,0.340602,0.120122,0.592477,0.032036,-0.201954,-0.027215,...,0.053068,0.75369,0.369708,0.371022,-0.22101,-0.081935,-0.403708,-0.585227,0.053492,0.225035
GTEX-11GS4,-0.008583,0.228338,-0.534465,-0.014536,0.480919,-0.022812,0.072393,-0.055716,-0.203613,0.069106,...,-0.221284,0.605018,0.073307,0.88638,0.544255,-0.03421,0.123709,-0.457319,-0.413379,-0.573352
GTEX-11GSO,0.350592,-0.39328,0.172912,-0.102417,0.007811,0.014242,0.39966,0.247241,0.004596,0.437172,...,-0.136623,-0.601352,0.323738,0.442682,0.719036,-0.566151,-0.614322,-0.171383,-0.490201,-0.725102
GTEX-11GSP,-0.118514,-0.209684,-0.101046,-0.190859,0.169982,-0.255067,0.381074,0.843769,-0.361491,0.028378,...,0.676249,0.528579,0.550516,0.532704,0.311667,-1.184307,-0.2887,-0.619553,0.251181,0.3728


In [44]:

df_artery, df_artery_old, df_artery_young = read_matrix('Artery - Tibial')
df_blood, df_blood_old, df_blood_young = read_matrix('Whole Blood')
df_heart, df_heart_old, df_heart_young = read_matrix('Heart - Atrial Appendage')
df_nerve, df_nerve_old, df_nerve_young = read_matrix('Nerve - Tibial')
df_adipose_v, df_adipose_v_old, df_adipose_v_young= read_matrix('Adipose - Visceral (Omentum)')


df_brain_cortex_old.head()

Shape before filtering:
(75, 13925)
(96, 13925)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6554
Final shape:
(171, 13925)
(75, 6000)
(96, 6000)
Shape before filtering:
(97, 12048)
(112, 12048)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6471
Final shape:
(209, 12048)
(97, 6000)
(112, 6000)
Shape before filtering:
(62, 13839)
(73, 13839)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6565
Final shape:
(135, 13839)
(62, 6000)
(73, 6000)
Shape before filtering:
(71, 14904)
(89, 14904)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6565
Final shape:
(160, 14904)
(71, 6000)
(89, 6000)
Shape before filtering:
(60, 14474)
(69, 14474)
top_genes_old_len: 6000
top_genes_young_len: 6000
all_top_genes_len_union: 6556
Final shape:
(129, 14474)
(60, 6000)
(69, 6000)


Name,ENSG00000284413,ENSG00000284308,ENSG00000283992,ENSG00000282608,ENSG00000281991,ENSG00000281406,ENSG00000280789,ENSG00000280670,ENSG00000280165,ENSG00000278615,...,ENSG00000085382,ENSG00000085377,ENSG00000085365,ENSG00000085276,ENSG00000085117,ENSG00000085063,ENSG00000084754,ENSG00000084731,ENSG00000084636,ENSG00000084628
GTEX-11DXY,-0.186512,-0.059531,0.953474,0.505226,-0.124623,0.0595,-0.040397,0.123433,0.160834,-0.040267,...,0.0641,0.22067,0.693242,-0.181095,-0.092334,-0.125606,-0.519893,0.27869,-0.593743,0.57066
GTEX-11EMC,0.005669,-0.051326,-0.496006,-0.735277,0.340602,0.120122,0.592477,0.032036,-0.201954,-0.027215,...,-0.209616,-0.388861,-0.063911,0.362973,-0.133702,0.355706,0.438159,-0.60586,0.261539,-0.295942
GTEX-11GS4,-0.008583,0.228338,-0.534465,-0.014536,0.480919,-0.022812,0.072393,-0.055716,-0.203613,0.069106,...,-0.015775,-0.233664,-0.447943,-0.414394,0.887917,-0.323216,0.188394,-0.23229,0.733256,0.955137
GTEX-11GSO,0.350592,-0.39328,0.172912,-0.102417,0.007811,0.014242,0.39966,0.247241,0.004596,0.437172,...,0.417466,0.579926,-0.186151,0.528908,0.329236,0.232494,0.094575,-0.025761,-0.113372,0.208944
GTEX-11GSP,-0.118514,-0.209684,-0.101046,-0.190859,0.169982,-0.255067,0.381074,0.843769,-0.361491,0.028378,...,-0.471504,-0.479411,-0.177796,0.44202,0.362556,0.350014,0.312091,-0.878442,0.422407,-0.307805


In [33]:
# Define the datasets with sets of sample IDs
datasets = {
    'df_adipose_s': set(df_adipose_s.index),
    'df_muscle': set(df_muscle.index),
    'df_brain_cortex': set(df_brain_cortex.index),
    'df_artery': set(df_artery.index),
    'df_blood': set(df_blood.index),
    'df_heart': set(df_heart.index),
    'df_nerve': set(df_nerve.index),
    'df_adipose_v': set(df_adipose_v.index)
    
}

# Create an empty DataFrame to hold the table data
table = pd.DataFrame(columns=datasets.keys(), index=datasets.keys())

# Fill in the diagonal with the sample sizes
for dataset in datasets:
    table.loc[dataset, dataset] = len(datasets[dataset])

# Fill in the off-diagonal with the common sample sizes
for i, dataset1 in enumerate(datasets):
    for j, dataset2 in enumerate(datasets):
        if j > i:
            common_samples = len(datasets[dataset1].intersection(datasets[dataset2]))
            table.iloc[i, j] = common_samples
            table.iloc[j, i] = " "
            #table.iloc[j, i] = np.nan if common_samples == 0 else common_samples


table


Unnamed: 0,df_adipose_s,df_muscle,df_brain_cortex,df_artery,df_blood,df_heart,df_nerve,df_adipose_v
df_adipose_s,172.0,156.0,89.0,123.0,138.0,97.0,117.0,95
df_muscle,,226.0,113.0,155.0,184.0,125.0,149.0,111
df_brain_cortex,,,126.0,82.0,93.0,72.0,82.0,65
df_artery,,,,171.0,143.0,97.0,114.0,92
df_blood,,,,,209.0,109.0,133.0,109
df_heart,,,,,,135.0,93.0,78
df_nerve,,,,,,,160.0,86
df_adipose_v,,,,,,,,129


In [12]:
# Define the datasets with sets of sample IDs
datasets = {
    'adipose_s_old': set(df_adipose_s_old.index),
    'muscle_old': set(df_muscle_old.index),
    'brain_cortex_old': set(df_brain_cortex_old.index),
    'artery_old': set(df_artery_old.index),
    'blood_old': set(df_blood_old.index),
    'heart_old': set(df_heart_old.index),
    'nerve_old': set(df_nerve_old.index),
    'adipose_v_old': set(df_adipose_v_old.index)

    
}

# Create an empty DataFrame to hold the table data
table = pd.DataFrame(columns=datasets.keys(), index=datasets.keys())

# Fill in the diagonal with the sample sizes
for dataset in datasets:
    table.loc[dataset, dataset] = len(datasets[dataset])

# Fill in the off-diagonal with the common sample sizes
for i, dataset1 in enumerate(datasets):
    for j, dataset2 in enumerate(datasets):
        if j > i:
            common_samples = len(datasets[dataset1].intersection(datasets[dataset2]))
            table.iloc[i, j] = common_samples
            table.iloc[j, i] = " "
            #table.iloc[j, i] = np.nan if common_samples == 0 else common_samples

table


Unnamed: 0,adipose_s_old,muscle_old,brain_cortex_old,artery_old,blood_old,heart_old,nerve_old,adipose_v_old
adipose_s_old,76.0,71.0,36.0,50.0,64.0,43.0,51.0,44
muscle_old,,102.0,49.0,67.0,86.0,56.0,67.0,50
brain_cortex_old,,,55.0,30.0,43.0,31.0,35.0,31
artery_old,,,,75.0,64.0,41.0,47.0,42
blood_old,,,,,97.0,51.0,62.0,52
heart_old,,,,,,62.0,45.0,33
nerve_old,,,,,,,71.0,39
adipose_v_old,,,,,,,,60


In [13]:
# Define the datasets with sets of sample IDs
datasets = {
    'adipose_s_young': set(df_adipose_s_young.index),
    'muscle_young': set(df_muscle_young.index),
    'brain_cortex_young': set(df_brain_cortex_young.index),
    'artery_young': set(df_artery_young.index),
    'blood_young': set(df_blood_young.index),
    'heart_young': set(df_heart_young.index),
    'nerve_young': set(df_nerve_young.index),
    'adipose_v_young': set(df_adipose_v_young.index)

    
}

# Create an empty DataFrame to hold the table data
table = pd.DataFrame(columns=datasets.keys(), index=datasets.keys())

# Fill in the diagonal with the sample sizes
for dataset in datasets:
    table.loc[dataset, dataset] = len(datasets[dataset])

# Fill in the off-diagonal with the common sample sizes
for i, dataset1 in enumerate(datasets):
    for j, dataset2 in enumerate(datasets):
        if j > i:
            common_samples = len(datasets[dataset1].intersection(datasets[dataset2]))
            table.iloc[i, j] = common_samples
            table.iloc[j, i] = " "
            #table.iloc[j, i] = np.nan if common_samples == 0 else common_samples

table


Unnamed: 0,adipose_s_young,muscle_young,brain_cortex_young,artery_young,blood_young,heart_young,nerve_young,adipose_v_young
adipose_s_young,96.0,85.0,53.0,73.0,74.0,54.0,66.0,51
muscle_young,,124.0,64.0,88.0,98.0,69.0,82.0,61
brain_cortex_young,,,71.0,52.0,50.0,41.0,47.0,34
artery_young,,,,96.0,79.0,56.0,67.0,50
blood_young,,,,,112.0,58.0,71.0,57
heart_young,,,,,,73.0,48.0,45
nerve_young,,,,,,,89.0,47
adipose_v_young,,,,,,,,69
