In [26]:
import sklearn
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import pandas as pd
import os, sys
import mlflow

from tqdm import tqdm

import ipywidgets as widgets
from ipywidgets import interact
from functools import partial

In [28]:
MLFLOW_TRACKING_URI = f"{os.environ['HOME']}/01_repos/CardiacMotion/mlruns/"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
z_filename = lambda exp_id, run_id: f"{MLFLOW_TRACKING_URI}/{exp_id}/{run_id}/artifacts/latent_vector.csv"

In [29]:
df = mlflow.search_runs(experiment_ids=[str(i) for i in range(3, 9)])
df = df[(df["metrics.val_rec_ratio_to_time_mean"] < 0.8) & (df["params.dataset_n_timeframes"] == '10')]

In [None]:
for i, row in tqdm(df.sort_values(["experiment_id", "metrics.val_rec_ratio_to_time_mean"]).iterrows()):        
    
    # print(row.experiment_id, row.run_id, row["metrics.val_rec_ratio_to_time_mean"])
    
    if row.experiment_id != "4":
        continue 
        
    zfn = z_filename(row.experiment_id, row.run_id)
    
    if not os.path.exists(zfn):
        continue
        # print(zfn)
        
    z_df = pd.read_csv(zfn) # .head(1000)
    z_df = z_df.set_index("ID")
    
    z_static = z_df.iloc[:,:8]
    z_dynamic = z_df.iloc[:,8:]
    
    for dim in [2,3]:
        for suffix, z in {"static": z_static, "dynamic": z_dynamic, "all": z_df}.items():
            
            tsne = TSNE(n_components=dim, learning_rate='auto', init='pca', )
            t = tsne.fit_transform(z)
            tvalues_df = pd.DataFrame(t)
            
            if dim == 2:
                tvalues_df.columns = ["tsne-2d-one", "tsne-2d-two"]
            elif dim == 3:
                tvalues_df.columns = ["tsne-3d-one", "tsne-3d-two", "tsne-3d-three"]
       
            tvalues_df = tvalues_df.set_index(z.index)
        
            t_filename = f"{MLFLOW_TRACKING_URI}/{exp_id}/{run_id}/artifacts/tsne_{dim}d_z_{suffix}.csv"
            print(t_filename)
            tvalues_df.to_csv(t_filename)



/home/rodrigo/01_repos/CardiacMotion/mlruns//8/4708d50a9ea14960b2b21d21e3313e34/artifacts/tsne_2d_z_static.csv




/home/rodrigo/01_repos/CardiacMotion/mlruns//8/4708d50a9ea14960b2b21d21e3313e34/artifacts/tsne_2d_z_dynamic.csv




/home/rodrigo/01_repos/CardiacMotion/mlruns//8/4708d50a9ea14960b2b21d21e3313e34/artifacts/tsne_2d_z_all.csv




#### Compute t-SNE

In [55]:
covariates_df = pd.read_csv("/home/rodrigo/01_repos/GWAS_pipeline/data/gwas_covariates_63k.csv")
covariates_df = covariates_df.set_index("ID")
covariates_df = covariates_df[~covariates_df.duplicated()]

In [56]:
cardiac_indices_df = pd.read_csv("/home/rodrigo/01_repos/CardiacSegmentation/data/transforms/LVED_cardiac_indices_all.csv")
cardiac_indices_df = cardiac_indices_df.set_index("ID")
# cardiac_indices_df = cardiac_indices_df.loc[]

In [57]:
common_ids = list(set(tvalues_df.index).intersection(covariates_df.index))
common_ids = set(common_ids).intersection(cardiac_indices_df.index)
covariates_df = covariates_df.loc[common_ids]
covariates_df.shape

  covariates_df = covariates_df.loc[common_ids]


(57804, 12)

In [58]:
tvalues_df = tvalues_df.loc[common_ids]
tvalues_df.shape

  tvalues_df = tvalues_df.loc[common_ids]


(57804, 2)

In [59]:
cardiac_indices_df = cardiac_indices_df.loc[common_ids]
cardiac_indices_df.shape

  cardiac_indices_df = cardiac_indices_df.loc[common_ids]


(57804, 23)

In [60]:
import os
import fileinput

# Define the root directory you want to start from
root_directory = "/home/rodrigo/01_repos/UKBB_helpers/data/subject_ids/icd10"

# Use list comprehension to count lines for each file in each directory
line_counts = {
    os.path.join(root, file): sum(1 for _ in fileinput.input(os.path.join(root, file)))
    for root, _, files in os.walk(root_directory)
    for file in files
}

line_counts = { k:v for k,v in line_counts.items() if v > 10000 }

In [61]:
ids_for_disease = {k.split("/")[-1][:-4]: set([ id.strip() for id in open(k).readlines()]) for k in line_counts}

In [62]:
icd10_codes = {
    'I269': 'Pulmonary embolism without acute cor pulmonale',
    'I519': 'Other heart diseases in diseases classified elsewhere',
    'I351': 'Nonrheumatic aortic valve disorders',
    'I848': 'Other specified disorders of arteries and arterioles',
    'I481': 'Atrial fibrillation and flutter',
    'I846': 'Spontaneous dissection of other specified arteries',
    'I495': 'Sudden cardiac death, so described',
    'I441': 'Aneurysm of artery of lower extremity',
    'I083': 'Hypertensive heart and chronic kidney disease with heart failure and stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease',
    'I639': 'Cerebral infarction, unspecified',
    'I802': 'Acute embolism and thrombosis of superior mesenteric artery',
    'I730': 'Aneurysm of aorta',
    'I249': 'Other forms of chronic ischemic heart disease',
    'I500': 'Congestive heart failure',
    'I743': 'Embolism and thrombosis of renal artery',
    'I831': 'Varicose veins of lower extremities',
    'I081': 'Hypertensive heart and chronic kidney disease with heart failure and stage 5 chronic kidney disease, or end stage renal disease',
    'I714': 'Arteritis, unspecified',
    'I460': 'Acute heart failure',
    'I771': 'Other secondary and unspecified cardiomyopathies',
    'I517': 'Cardiomegaly',
    'I251': 'Atherosclerotic heart disease of native coronary artery',
    'I859': 'Other and unspecified atherosclerosis',
    'I493': 'Chronic obstructive pulmonary disease',
    'I252': 'Old myocardial infarction',
    'I071': 'Hypertensive heart disease with heart failure',
    'I211': 'Benign neoplasm of colon',
    'I442': 'Aneurysm of artery of upper extremity',
    'I841': 'Abdominal aortic aneurysm, without rupture',
    'I313': 'Dermatitis',
    'I842': 'Abdominal aortic aneurysm, ruptured',
    'I518': 'Other and unspecified heart diseases originating in the perinatal period',
    'I451': 'Esophageal varices with bleeding',
    'I849': 'Other specified disorders of circulatory system',
    'I499': 'Unspecified disorder of circulatory system',
    'I64': 'Stroke, not specified as hemorrhage or infarction',
    'I259': 'Chronic ischemic heart disease, unspecified',
    'I350': 'Atherosclerosis of aorta',
    'I272': 'Other disorders of pulmonary circulation',
    'I471': 'Atherosclerosis of renal artery',
    'I951': 'Intraoperative hemorrhage and hematoma of a circulatory system organ or structure complicating a procedure',
    'I480': 'Paroxysmal atrial fibrillation',
    'I200': 'Angina pectoris',
    'I694': 'Granulomatosis with polyangiitis (Wegener)',
    'I447': 'Aneurysm of other specified artery',
    'I208': 'Angina pectoris, unspecified',
    'I120': 'Hypertensive heart disease with renal disease',
    'I420': 'Acute myocarditis',
    'I501': 'Left ventricular failure',
    'I959': 'Hypothyroidism, unspecified',
    'I652': 'Other transient cerebral ischemic attacks and related syndromes',
    'I48': 'Atrial fibrillation and flutter',
    'I219': 'Subacute myocardial infarction, unspecified',
    'I739': 'Peripheral vascular disease, unspecified',
    'I509': 'Heart failure, unspecified',
    'I489': 'Atrial fibrillation and flutter',
    'I10': 'Essential (primary) hypertension',
    'I209': 'Angina pectoris, unspecified',
    'I258': 'Chronic ischemic heart disease, unspecified',
    'I679': 'Cerebrovascular disease, unspecified',
    'I678': 'Cerebrovascular disease, unspecified',
    'I341': 'Mitral valve insufficiency',
    'I635': 'Cerebral infarction due to thrombosis of cerebral arteries',
    'I429': 'Cardiomyopathy, unspecified',
    'I7020': 'Acute and subacute ICD-10 code range I7020',
    'I214': 'Other acute ischemic heart diseases',
    'I080': 'Hypertensive heart and chronic kidney disease with heart failure and stage 5 chronic kidney disease, or end stage renal disease',
    'I890': 'Other specified noninfective gastroenteritis and colitis',
    'I472': 'Atherosclerosis of autologous vein coronary artery bypass graft',
    'I440': 'Aneurysm of thoracic aorta',
    'I210': 'Atherosclerotic heart disease',
    'I839': 'Other specified diseases of arteries and arterioles',
    'I340': 'Mitral valve stenosis',
}


In [64]:
all_diseased_ids = {element for value in ids_for_disease.values() for element in value}

In [65]:
healthy = pd.Series(cardiac_indices_df.index).apply(lambda id: int(str(id) not in all_diseased_ids))
healthy.index = cardiac_indices_df.index

In [74]:
@interact
def plot_tsne(
    icd10=widgets.Select(options={v:k for k,v in icd10_codes.items() if k in ids_for_disease}), 
    plot_type=widgets.Select(options=["Scatter", "KDE"], value="Scatter")):

    plt.figure(figsize=(16,7))
    ax1 = plt.subplot(1, 2, 1)
    
    ICD10 = icd10
    diagnoses = pd.Series(cardiac_indices_df.index).apply(lambda id: int(str(id) in ids_for_disease[ICD10]))
    diagnoses.index = cardiac_indices_df.index
    
    if plot_type == "Scatter": plot = partial(sns.scatterplot, x="tsne-2d-one", y="tsne-2d-two",alpha=0.5)
    elif plot_type == "KDE":   plot = partial(sns.kdeplot, x="tsne-2d-one", y="tsne-2d-two", alpha=0.5, fill=True)
    
    plot(data=tvalues_df[diagnoses.astype(bool)].head(5000), ax=ax1
        # hue = diagnoses,        
        # legend="full",
    );    
    plt.title(f"{icd10_codes[icd10]} ({diagnoses.sum()} subjects)")
    
    ax2 = plt.subplot(1, 2, 2)
    plot(data=tvalues_df[healthy.astype(bool)].head(5000), ax=ax2,
        # hue = diagnoses,
    );    
    plt.title("No cardiovascular disease")

interactive(children=(Select(description='icd10', options={'Atherosclerotic heart disease of native coronary a…