In [None]:
import os

import ipywidgets as widgets
from ipywidgets import interact

import numpy as np
import pandas as pd
from IPython import embed

import matplotlib.pyplot as plt
import seaborn as sns

import cardiac_motion

from cardiac_motion.utils.mlflow_read_helpers import *
import cardiac_motion_upe

from cardiac_motion_upe import (
    EnsembleGWASResults, 
    BASE_DIR,
    GWAS_RESULTS_DIR
)

___

Collect the region-wise summaries

In [None]:
# Load the ensemble of GWAS results for the unsupervised spatiotemporal runs.
gwas_dir = f"{GWAS_RESULTS_DIR}/Unsupervised_spatiotemporal"
results = EnsembleGWASResults(gwas_dir)

# Restrict to non-variational runs that use the temporal mean as static representative.
# NOTE(review): the return value is discarded, so this presumably filters in place — confirm.
run_filter = "static_representative == 'temporal_mean' and is_variational == False"
results.filter_results(run_filter)

# Keep the top 5 entries per chamber, modifying `results` in place.
results.keep_top_n_per_chamber(5, inplace=True)

___

In [None]:
# display( results.region_assocs_df['params.dataset_static_representative'].value_counts().to_frame() )
# display( results.region_assocs_df.chamber.value_counts().to_frame() )
# display( results.loci_count(attributes=['chamber']) )

In [None]:
# Show one row per SNP: rows flagged as repeats (after ordering by SNP) are dropped.
snp_assocs_df = results.region_assocs_df
is_repeated_snp = snp_assocs_df.sort_values("SNP").duplicated("SNP")
snp_assocs_df.loc[~is_repeated_snp]

In [None]:
import ipywidgets as widgets

In [None]:
@interact
def show_results_chamber(
    chamber=widgets.Select(options=EnsembleGWASResults.possible_chambers),
    variable_type=widgets.Select(options=['static', 'dynamic']),
):
    """Interactively display the loci hits for one chamber and one variable type.

    Parameters
    ----------
    chamber
        Chamber selected in the widget (one of `EnsembleGWASResults.possible_chambers`).
    variable_type
        Either 'static' or 'dynamic'.

    The table is ordered by `min_P` and keeps loci that were hit at least twice
    or whose `min_P` is below 5e-8/80 (presumably a multiple-testing corrected
    threshold — TODO confirm what the 80 stands for).
    """
    loci_hits_df = results.summarize_loci_hits(extra_columns=["chamber"], collapse_attributes=["run"])
    # `@chamber` / `@variable_type` refer to this function's arguments
    selected_df = loci_hits_df.query("chamber == @chamber and variable_type == @variable_type")
    ranked_df = selected_df.sort_values("min_P")
    reported_df = ranked_df.query("(count >= 2) or (min_P < 5e-8/80)")

    print(reported_df.shape)
    display(reported_df)

In [None]:
# Export one LaTeX count table per (chamber, variable type) combination.
for chamber in EnsembleGWASResults.possible_chambers:
    for variable_type in ['static', 'dynamic']:
        # Same selection as the interactive view: loci hit at least twice,
        # or with min_P below 5e-8/80, ordered by min_P.
        counts_df = (
            results
            .summarize_loci_hits(extra_columns=["chamber"], collapse_attributes=["run"])
            .query("chamber == @chamber and variable_type == @variable_type")
            .sort_values("min_P")
            .query("(count >= 2) or (min_P < 5e-8/80)")
        )
        tex_file = f"{chamber}_{variable_type}.tex"
        try:
            table_tex_code = results.create_count_table_tex(counts_df, tex_file=tex_file)
        except Exception as exc:
            # Best-effort export: carry on with the remaining tables, but report
            # the failure instead of hiding it (a bare `except` would also
            # swallow KeyboardInterrupt).
            print(f"Could not create {tex_file}: {exc!r}")

In [None]:
counts_per_chamber = results.get_counts_per_chamber(p_threshold=5e-8)

# A locus is "mostly" of one kind when its total count across chambers exceeds
# FACTOR times the total of the other kind.
FACTOR = 2
dynamic_total = counts_per_chamber["dynamic"].sum(axis=1)
static_total = counts_per_chamber["static"].sum(axis=1)

MOSTLY_DYNAMIC = dynamic_total > static_total * FACTOR
MOSTLY_STATIC = static_total > dynamic_total * FACTOR
# Everything that is neither mostly static nor mostly dynamic
BOTH_DYN_AND_STAT = ~(MOSTLY_STATIC | MOSTLY_DYNAMIC)

mostly_dynamic_df = counts_per_chamber.loc[MOSTLY_DYNAMIC]
mostly_static_df = counts_per_chamber.loc[MOSTLY_STATIC]
both_kinds_df = counts_per_chamber.loc[BOTH_DYN_AND_STAT]

mostly_dynamic_df.to_csv(f"{BASE_DIR}/results/counts_mostly_dynamic.csv")
mostly_static_df.to_csv(f"{BASE_DIR}/results/counts_mostly_static.csv")
both_kinds_df.to_csv(f"{BASE_DIR}/results/counts_both_static_and_dynamic.csv")

In [None]:
# Loci where neither the static nor the dynamic total exceeds FACTOR times the other
counts_per_chamber.loc[BOTH_DYN_AND_STAT]

In [None]:
# Loci whose static total exceeds FACTOR times the dynamic total
counts_per_chamber.loc[MOSTLY_STATIC]

In [None]:
# Loci whose dynamic total exceeds FACTOR times the static total
counts_per_chamber.loc[MOSTLY_DYNAMIC]

___

In [None]:
# Build the count table for the static variables of the 'BV' chamber.
# NOTE(review): `results` is created as an EnsembleGWASResults in this notebook, yet
# `.reset_index()` is a DataFrame-style call — confirm the object really exposes it
# (a summary frame such as `results.summarize_loci_hits(...)` may have been intended).
# NOTE(review): the method is called on the class here but on the instance in other
# cells — confirm it can be used without an instance.
EnsembleGWASResults.create_count_table_tex(
    counts_df = results.reset_index().query("chamber == 'BV' and variable_type == 'static'")
)

In [None]:
# Summarise the region-level associations using a 5e-8 p-value threshold.
# The second argument is a CSV path under BASE_DIR/results — presumably the
# output file written by the helper (TODO confirm).
results.create_gwas_summary_table(
    results.region_assocs_df,
    f"{BASE_DIR}/results/gwas_summary_table.csv",
    p_threshold=5e-8
)

In [None]:
def _neg_log10_min_p_per_region(loci_summary_df):
    """Return -log10 of the smallest `min_P` found for each region.

    Parameters
    ----------
    loci_summary_df : pandas.DataFrame
        Loci summary exposing `region` and `min_P` once its index is reset.

    Returns
    -------
    pandas.DataFrame
        One row per region and a single `min_P` column holding
        -log10(min p-value), ordered from the smallest p-value upwards.
    """
    min_p_df = (
        loci_summary_df
        .reset_index()
        # Select the column explicitly: the first positional argument of
        # GroupBy.min() is `numeric_only`, not a column name.
        .groupby("region")[["min_P"]]
        .min()
        .sort_values("min_P")
    )
    return -np.log10(min_p_df)


log10p_dynamic_df = _neg_log10_min_p_per_region(results.loci_summary(only_dynamic=True))
log10p_static_df = _neg_log10_min_p_per_region(results.loci_summary(only_static=True))

# Only the last bare expression of a cell is rendered, so show both frames explicitly
display(log10p_dynamic_df, log10p_static_df)
# log10p_static_df.to_csv("results/log10p_static.csv")
# log10p_dynamic_df.to_csv("results/log10p_dynamic.csv")

In [None]:
# Build the count table with the method's default arguments and keep the returned
# value (presumably the LaTeX source of the table — TODO confirm) for inspection.
table_latex_code = results.create_count_table_tex()

In [None]:
# Cell output: the repr of the value returned by create_count_table_tex()
table_latex_code

In [None]:
# Printed form of the same value (escape sequences rendered), handy for copy-pasting
print(table_latex_code)

In [None]:
gwas_dir = f"{GWAS_RESULTS_DIR}/All_partitions_spatiotemporal_afterNov2023/"

results = EnsembleGWASResults(gwas_dir)

# Per (run, variable_type, region): number of associations with P < 5e-10 and the
# smallest p-value among them.
# The summary is bound to `region_count_df` because later cells read that name;
# left as a bare expression it was displayed but never stored.
region_count_df = (
    results.region_assocs_df
    .query("P < 5e-10")
    .groupby(by=["run", "variable_type", "region"])
    .aggregate({"CHR": "count", "P": "min"})
    .rename({"CHR": "count", "P": "min_P"}, axis=1)
    # One combined sort: smallest p-value first, ties broken by the larger count.
    # Two chained sort_values calls let the second one discard the first ordering.
    .sort_values(["min_P", "count"], ascending=[True, False])
)
region_count_df

In [None]:
# region_count_df.sort_values("min_P").reset_index().to_csv("../00_CardiacMotionRL/analysis/loci_pvals_static_vs_dynamic.csv")

In [None]:
# Attach the candidate gene of each region to the per-region counts.
# NOTE(review): `loci_mapping` is not defined in the visible cells; it is assumed to be
# a frame indexed by region with a `candidate_gene` column — confirm where it is loaded.
# NOTE(review): this cell rebinds `region_count_df`, so it is not safe to run twice.
region_count_df = region_count_df.reset_index()
region_count_df = pd.merge(loci_mapping[["candidate_gene"]], region_count_df, left_index=True, right_on='region', how='right')
# The whole comparison must live inside the query string; written as
# `"variable_type" == "dynamic"` it is evaluated by Python first and passes False to query().
region_count_df.query("variable_type == 'dynamic'").sort_values("min_P").head(30)

In [None]:
COUNT_THR = 5
PVALUE_GW_THR = 5e-8
PVALUE_SW_THR = 5e-10

# A region is kept when it has at least COUNT_THR hits and a min_P below the
# GW threshold, or when its min_P alone is below the stricter SW threshold.
has_enough_gw_hits = (region_count_df["count"] >= COUNT_THR) & (region_count_df["min_P"] < PVALUE_GW_THR)
passes_sw_threshold = region_count_df["min_P"] < PVALUE_SW_THR

region_count_df[has_enough_gw_hits | passes_sw_threshold].sort_values("min_P")

___

# Newer runs ($n_z^c=16$ and $n_z^s=16$)

In [None]:
# Load the results of the newer runs from an absolute, machine-specific path.
GWAS_RESULTS_DIR_NEWER = "/mnt/data/workshop/workshop-user1/output/GWAS/All_partitions_spatiotemporal_afterNov2023/"
results_newer = EnsembleGWASResults(GWAS_RESULTS_DIR_NEWER)
# NOTE(review): another cell calls `get_counts_per_chamber(p_threshold=...)` on the same
# class — confirm that `counts_per_chamber()` also exists and which threshold it applies.
results_newer.counts_per_chamber()

In [None]:
# Column layout of the wide table: all dynamic columns first, then all static ones,
# each in the same chamber order.
COL_ORDER = [
    (variable_type, chamber)
    for variable_type in ("dynamic", "static")
    for chamber in ("BV", "LV", "RV", "LA", "RA")
]

# Long format -> one row per region and one column per (variable_type, chamber);
# combinations without any count become 0.
loci_counts_long_df = results.loci_count(per_chamber=True).reset_index()
loci_counts_wide_df = loci_counts_long_df.pivot(
    index="region", values="count", columns=["variable_type", "chamber"]
)
counts_per_chamber = loci_counts_wide_df.fillna(0).astype(int)[COL_ORDER]

___

In [None]:
# Use a dedicated name rather than rebinding GWAS_RESULTS_DIR: that constant is imported
# from cardiac_motion_upe and is read by other cells to build their paths, so overwriting
# it would silently redirect them if they are re-run after this cell.
WALL_THICKENING_GWAS_DIR = "/mnt/data/workshop/workshop-user1/output/GWAS/relative_wall_thicknening/"
results = EnsembleGWASResults(WALL_THICKENING_GWAS_DIR)

In [None]:
# Associations with a p-value below 5e-8
results.region_assocs_df[(results.region_assocs_df.P < 5e-8)]

___

Export all SNPs

In [None]:
import itertools

# Regions of interest: more than 5 hits with min_P < 5e-8, or any region with min_P < 1.5e-10.
significant_regions_df = region_count_df[
    ((region_count_df.min_P < 5e-8) & (region_count_df["count"] > 5)) | 
    (region_count_df.min_P < 1.5e-10)
]

# NOTE(review): `region_assocs_df` is not defined in the visible cells; it is presumably
# `results.region_assocs_df` of one of the result sets — confirm which one.
# Both arrays are independent of the locus, so they are computed once.
region_of_row = region_assocs_df.index.get_level_values('region')
is_gw_significant = (region_assocs_df.P < 5e-8).values

# One frame per selected locus holding its associations with P < 5e-8.
snps_list = [
    region_assocs_df.iloc[(region_of_row == locus) & is_gw_significant]
    for locus in significant_regions_df.index.to_list()
]

# Sort the unique SNP ids so that the exported file is identical from run to run
# (iteration order of a set of strings is not stable across interpreter sessions).
unique_snps = sorted(
    set(itertools.chain.from_iterable(locus_df.SNP.values for locus_df in snps_list))
)

with open("/home/rodrigo/SNPs.txt", "wt") as snps_file:
    snps_file.write("\n".join(unique_snps))

In [None]:
# Stack the per-locus frames once, drop rows flagged as repeated SNPs (after ordering
# by SNP) and export the position columns.
all_significant_snps_df = pd.concat(snps_list)
is_repeated_snp = all_significant_snps_df.sort_values("SNP").duplicated("SNP")

(
    all_significant_snps_df[~is_repeated_snp]
    .reset_index()
    .loc[:, ["CHR", "BP", "region", "SNP"]]
    .to_csv("/home/rodrigo/01_repos/CardiacGWAS/significant_SNPs.csv")
)

This will filter the rows for the best associations per region:

In [None]:
# Export every association with P < 1e-7.
# The call to query() is closed with ")" — the original "]" made the cell a SyntaxError.
region_assocs_df.query("P < 1e-7").to_csv("snp_associations_lt_1e-7.csv")

In [None]:
# Rows of the association table for the lead SNP of every mapped locus, keyed by SNP id.
kk = (
    region_assocs_df
    .reset_index()
    .set_index("SNP")
    .loc[loci_mapping_df.lead_SNP.to_list()]
)

# Keep the first row for each distinct AF value and export that column.
first_row_per_af = kk[~kk.duplicated("AF")]
first_row_per_af["AF"].to_csv("/home/rodrigo/Downloads/MAFs.txt")

In [None]:
# Index label of the association with the smallest p-value in every region,
# restricted to the significant regions.
# NOTE(review): `significant_regions` is not defined in the visible cells — confirm its origin.
idx_min = region_assocs_df.groupby("region").P.idxmin()
idx_min = idx_min[significant_regions]

LEAD_SNPS_FILE = "/home/rodrigo/01_repos/CardiacMotionGWAS/results/snps_for_biomart__one_per_region.txt"

lead_snps_df = (
    region_assocs_df
    # idxmin() yields index *labels*, so rows are selected with .loc; feeding the labels
    # to .iloc is only correct when the index happens to be a default RangeIndex.
    .loc[idx_min.to_list()]
    # Columns are still picked by position, as before.
    .iloc[:, [3, 4, 5, 6]]
    .reset_index(drop=True)
    .sort_values(["CHR", "BP"])
)
lead_snps_df.to_csv(LEAD_SNPS_FILE, index=False)

___

In [None]:
# Selection widget listing the significant regions in sorted order
regions_w = widgets.Select(options=sorted(results.get_significant_regions()))

@interact
def show_results_for_region(region=regions_w):
    """Display the results of `results` for the region picked in the widget."""
    region_results = results.get_results_for_region(region)
    display(region_results)

In [None]:
# Map each region to its associations with P < 5e-7 (ordered by P); regions with
# fewer than 5 such associations are left out.
by_region_and_snp_df = {}

region_of_row = region_assocs_df.index.get_level_values("region")
passes_p_cutoff = (region_assocs_df.P < 5e-7).values

for region in tqdm(region_of_row.unique()):
    region_hits_df = region_assocs_df.iloc[(region_of_row == region) & passes_p_cutoff].sort_values("P")

    if len(region_hits_df) >= 5:
        by_region_and_snp_df[region] = region_hits_df

In [None]:
@interact
def get_lead_snp_count(locus=regions_w):
    """Return the 20 strongest associations (P < 5e-8) of the selected locus.

    Parameters
    ----------
    locus
        Region picked in the `regions_w` widget.

    Returns
    -------
    pandas.DataFrame
        Up to 20 rows of `region_assocs_df` for that region, ordered by P.
    """
    in_locus = region_assocs_df.index.get_level_values('region') == locus
    is_gw_significant = (region_assocs_df.P < 5e-8).values
    locus_hits_df = region_assocs_df.iloc[in_locus & is_gw_significant]
    return locus_hits_df.sort_values("P").head(20)

In [None]:
### Replication results generated using this script: CardiacGWAS/replication.R

In [None]:
def _pvalue_to_latex(p_value):
    """Format a p-value as LaTeX scientific notation with a one-decimal mantissa.

    Parameters
    ----------
    p_value : float
        The p-value to format.

    Returns
    -------
    str
        Inline-math LaTeX of the form `$m \\times 10^{e}$` (single backslash in the output).
    """
    # Formatting with one decimal directly keeps the mantissa normalised: rounding a
    # two-decimal mantissa afterwards could turn 9.96 into 10.0.
    mantissa, exponent = f"{p_value:.1e}".split("e")
    # Raw f-string: in a plain string "\t" would be a TAB and corrupt "\times".
    return rf"${mantissa} \times 10^{{{int(exponent)}}}$"


# Build the whole table before opening the output, so that a failure while reading or
# formatting does not leave a truncated .tex file behind.
dd = pd.read_csv("/home/rodrigo/01_repos/CardiacGWAS/results/replication_results.csv")
dd['replication p-value'] = dd['replication p-value'].map(_pvalue_to_latex)
table_code = dd.to_latex(escape=False, index=False)
# Escape underscores for LaTeX ("\\_" is an explicit backslash + underscore).
table_code = table_code.replace("_", "\\_")

with open("/home/rodrigo/01_repos/CardiacGWAS/manuscript/tables/replication_table.tex", "wt") as repl_f:
    repl_f.write(table_code)

___

Count loci

In [None]:
# Boolean mask over `loci_mapping`: rows not flagged "YES" in PCA, exclude or
# suggestive_significance, and with no value in `handcrafted`.
condition = (
    (loci_mapping.PCA != "YES")
    & (loci_mapping.exclude != "YES")
    & (loci_mapping.suggestive_significance != "YES")
    & loci_mapping.handcrafted.isnull()
)

In [None]:
# Rows of `loci_mapping` not flagged "YES" in PCA, exclude or suggestive_significance
# and with a null `handcrafted` value, expressed as a query string.
loci_mapping.query("PCA != 'YES' and exclude != 'YES' and suggestive_significance != 'YES' and handcrafted.isnull()")