In [1]:
# Absolute paths to local repository checkouts used throughout the notebook.
# GWAS_REPO locates the regions BED file that is loaded into `regions_df`.
GWAS_REPO = "/home/rodrigo/01_repos/GWAS_pipeline/"
# CARDIAC_COMA_REPO becomes the working directory in the import cell (os.chdir),
# so relative output paths written later resolve against it.
CARDIAC_COMA_REPO = "/home/rodrigo/01_repos/CardiacCOMA/"
# NOTE(review): in the cells visible here CARDIAC_GWAS_REPO is only referenced
# from commented-out code; other cells hardcode the same directory instead.
CARDIAC_GWAS_REPO = "/home/rodrigo/01_repos/CardiacGWAS/"

In [2]:
import mlflow
import os, sys

import os; os.chdir(CARDIAC_COMA_REPO)
from config.load_config import load_yaml_config, to_dict

import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import Image
from mlflow.tracking import MlflowClient

import pickle as pkl
import pytorch_lightning as pl

from argparse import Namespace
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from IPython import embed
sys.path.insert(0, '..')

import model.Model3D
from copy import deepcopy
from pprint import pprint

from copy import deepcopy
from typing import List
from tqdm import tqdm
from IPython import embed

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from functools import partial

In [5]:
# Load the loci mapping table. Later cells read its `candidate_gene` column and
# join on its index against region names.
# NOTE(review): `fetch_loci_mapping` is neither defined nor imported in the
# cells visible here — confirm where it comes from, otherwise this fails on a
# fresh kernel.
loci_mapping = fetch_loci_mapping()

In [6]:
# Regions table, indexed by its `id` column.
# NOTE(review): the file has a .bed extension but is parsed with read_csv's
# default comma separator — confirm the file really is comma-separated.
regions_bed_path = f"{GWAS_REPO}/data/ld_indep_regions/fourier_ls-all_EUR_hg19_named.bed"
regions_df = pd.read_csv(regions_bed_path).set_index('id')
# loci_mapping_df = pd.read_csv(f"{CARDIAC_GWAS_REPO}/data/Genes - Loci mapping.csv").set_index("region")

___

In [7]:
def get_significant_loci(
    runs_df,
    experiment_id, run_id,
    p_threshold=5e-8,
    client=None
) -> pd.DataFrame:

    '''
    Return the region-wise GWAS associations of one run whose p-value is below a threshold.

    Parameters:
        runs_df: DataFrame indexed by (experiment_id, run_id) with an "artifact_uri" column.
        experiment_id, run_id: index key of the run inside `runs_df`.
        p_threshold: only rows with P strictly below this value are kept.
        client: unused; kept so existing callers that pass it keep working. The default
            is None so that no MLflow client is instantiated when the function is defined.

    Returns:
        DataFrame indexed by ("pheno", "region"), sorted by ascending P, with a
        "locus_name" column looked up in REGION_TO_LOCUS ("Unnamed" when the region
        is not in the mapping). When the run has no readable "GWAS/summaries" folder,
        or the folder is empty, an empty DataFrame with the plain columns
        ["run", "pheno", "region"] is returned instead.
    '''

    def get_phenoname(path):
        # The phenotype name is whatever precedes the first "__" in the file name.
        filename = os.path.basename(path)
        return filename.split("__")[0]

    run_info = runs_df.loc[(experiment_id, run_id)].to_dict()
    artifact_uri = run_info["artifact_uri"].replace("file://", "")

    gwas_dir_summaries = os.path.join(artifact_uri, "GWAS/summaries")

    try:
        summary_paths = [
            os.path.join(gwas_dir_summaries, fname)
            for fname in os.listdir(gwas_dir_summaries)
        ]
    except OSError:
        # Missing/unreadable folder: treat the run as having no GWAS results.
        summary_paths = []

    if not summary_paths:
        return pd.DataFrame(columns=["run", "pheno", "region"])

    # The paths already include the artifact directory; joining it a second time
    # would duplicate the prefix whenever artifact_uri is a relative path.
    region_summaries = {get_phenoname(path): path for path in summary_paths}
    dfs = [pd.read_csv(path).assign(pheno=pheno) for pheno, path in region_summaries.items()]

    # The per-file row index is discarded by set_index below, so drop it right away.
    df = pd.concat(dfs, ignore_index=True)
    df["locus_name"] = df["region"].map(lambda region: REGION_TO_LOCUS.get(region, "Unnamed"))
    df = df.set_index(["pheno", "region"])

    df_filtered = df[df.P < p_threshold]

    return df_filtered.sort_values(by="P")

In [8]:
def summarize_loci_across_runs(runs_df: pd.DataFrame, experiment_id: int = 1) -> pd.DataFrame:

    '''
    Collect the significant loci of every run listed in `runs_df`.

    Parameters:
        runs_df: DataFrame whose index entries are (experiment_id, run_id) pairs; it is
            passed unchanged to `get_significant_loci` for each run.
        experiment_id: experiment used to look up every run (defaults to 1, the value
            that used to be hard-coded). Only the run-id part of the index is iterated,
            so every run in `runs_df` must exist under this experiment.

    Return: pd.DataFrame indexed by ("run", "pheno", "region") with the concatenated
        output of `get_significant_loci` for all runs. If `runs_df` has no rows, an
        empty DataFrame with that index is returned.
    '''

    # run_ids = sorted([x[1] for x in runs_df[runs_df["metrics.test_recon_loss"] < RECON_LOSS_THRES].index])
    run_ids = sorted(x[1] for x in runs_df.index)

    if not run_ids:
        # pd.concat raises on an empty list; return a well-formed empty result instead.
        empty_index = pd.MultiIndex.from_arrays([[], [], []], names=["run", "pheno", "region"])
        return pd.DataFrame(index=empty_index)

    all_signif_loci = [
        get_significant_loci(runs_df, experiment_id=experiment_id, run_id=run_id)
        .assign(run=run_id)
        .reset_index()
        .set_index(["run", "pheno", "region"])
        for run_id in tqdm(run_ids)
    ]

    return pd.concat(all_signif_loci)

    # df = all_signif_loci.\
    #   groupby(["region", "locus_name"]).\
    #   aggregate({"CHR":"count", "P": "min"}).\
    #   rename({"CHR":"count", "P":"min_P"}, axis=1).\
    #   sort_values("count", ascending=False)    
    # 
    # return df

Collect the region-wise summaries

In [10]:
# region_assocs_df.to_csv("/home/rodrigo/01_repos/CardiacMotionGWAS/results/all_associations_best_per_region.csv")

In [221]:
# Build the `results` object that the following cells query.
# NOTE(review): `EnsembleResults` is not defined or imported in the cells visible
# here. The f-string has no placeholders, and the path repeats the directory
# already stored in GWAS_REPO.
results = EnsembleResults(f"/home/rodrigo/01_repos/GWAS_pipeline/output/All_partitions_spatiotemporal/summaries")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 271.00it/s]


In [196]:
# "/home/rodrigo/01_repos/CardiacGWAS/manuscript/tables/gwas_counts_suggestive.tex"
# Writes to the scratch file "kk.tex" (relative to the current working directory)
# instead of the manuscript path in the comment above.
results.create_count_table_tex("kk.tex")

Creating output file in kk.tex


  table_code = counts_df.to_latex(escape=False, index=False)


In [None]:
# `show_counts` is defined outside the cells visible here; presumably it displays
# the per-region counts held by `results` — TODO confirm.
results.show_counts()

In [207]:
# Per (run, variable_type, region): number of associations with P < 5e-10 and the
# smallest P among them.
(
    region_assocs_df[region_assocs_df.P < 5e-10]
    .groupby(by=["run", "variable_type", "region"])
    .aggregate({"CHR": "count", "P": "min"})
    .rename(columns={"CHR": "count", "P": "min_P"})
    # Two consecutive sorts: the displayed order is decided by the last one (min_P).
    .sort_values("count", ascending=False)
    .sort_values("min_P", ascending=True)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,min_P
run,variable_type,region,Unnamed: 3_level_1,Unnamed: 4_level_1
76f747276a9045cca6b9052dc7c590e1,static,chr6_79,1,1.258925e-23
778f608d82e046e88016c260d5a78e8f,static,chr6_78,1,1.674943e-21
778f608d82e046e88016c260d5a78e8f,static,chr6_79,1,1.909853e-20
d0213bccc36140759247dd7372a9570e,static,chr2_108,1,3.411929e-17
076776d3b17a4a83b9f3201e1e68e765,static,chr6_79,1,1.169499e-16
...,...,...,...,...
ecf5189409b34a48b80365c8a8c5f6b7,dynamic,chr2_108,1,3.213661e-10
076776d3b17a4a83b9f3201e1e68e765,dynamic,chr17_27,1,3.712787e-10
d0213bccc36140759247dd7372a9570e,static,chr2_23,1,3.923738e-10
076776d3b17a4a83b9f3201e1e68e765,static,chr2_23,1,4.251087e-10


In [17]:
# region_count_df.sort_values("min_P").reset_index().to_csv("../00_CardiacMotionRL/analysis/loci_pvals_static_vs_dynamic.csv")

In [19]:
# Attach the candidate gene of each region (right join keeps every row of
# region_count_df, matched or not).
# NOTE: this cell rebinds `region_count_df` from itself, so it is not idempotent —
# re-running it resets the index and merges `candidate_gene` a second time.
region_count_df = pd.merge(
    loci_mapping[["candidate_gene"]],
    region_count_df.reset_index(),
    left_index=True,
    right_on='region',
    how='right',
)

is_dynamic = region_count_df.variable_type == "dynamic"
region_count_df[is_dynamic].sort_values("min_P").head(30)

Unnamed: 0,candidate_gene,region,variable_type,count,min_P
8,MYH6,chr14_3,dynamic,8,1.399587e-23
18,BAG3,chr10_74,dynamic,7,8.689604e-19
0,BCAT1,chr12_17,dynamic,8,1.276439e-18
4,TTN,chr2_108,dynamic,8,5.0003450000000006e-17
3,GJA1,chr6_82,dynamic,8,4.72063e-16
6,SYT10,chr12_23,dynamic,8,9.840111e-16
1,KIAA1755,chr20_22,dynamic,8,7.227698e-15
10,NKX2-5,chr5_103,dynamic,8,4.852885e-14
33,,chr22_6,dynamic,4,5.662393e-14
7,HCN4,chr15_34,dynamic,8,1.244515e-12


In [30]:
# Selection thresholds for the table below.
COUNT_THR = 5          # minimum value of the "count" column
PVALUE_GW_THR = 5e-8   # presumably the genome-wide significance level
PVALUE_SW_THR = 5e-10  # presumably the stricter study-wide level

# Keep a row if its best p-value clears the strict threshold on its own, or if it
# clears the looser threshold and also reaches the minimum count.
region_count_df[
    region_count_df.min_P.lt(PVALUE_SW_THR)
    | (region_count_df["count"].ge(COUNT_THR) & region_count_df.min_P.lt(PVALUE_GW_THR))
].sort_values("min_P")

Unnamed: 0,candidate_gene,region,variable_type,count,min_P
9,PLN,chr6_78,static,8,2.2594359999999998e-26
2,PLN,chr6_79,static,8,1.2589250000000001e-23
8,MYH6,chr14_3,dynamic,8,1.399587e-23
5,TTN,chr2_108,static,8,2.4603679999999997e-19
18,BAG3,chr10_74,dynamic,7,8.689604e-19
0,BCAT1,chr12_17,dynamic,8,1.276439e-18
4,TTN,chr2_108,dynamic,8,5.0003450000000006e-17
3,GJA1,chr6_82,dynamic,8,4.72063e-16
6,SYT10,chr12_23,dynamic,8,9.840111e-16
12,TMEM43,chr3_10,static,7,4.265795e-15


___

Export all SNPs

In [None]:
import itertools

snps_list = []

# NOTE(review): these cut-offs (count > 5, P < 1.5e-10) differ from the
# COUNT_THR / PVALUE_SW_THR selection used in the table cell — confirm which is intended.
significant_regions_df = region_count_df[
    ((region_count_df.min_P < 5e-8) & (region_count_df["count"] > 5)) |
    (region_count_df.min_P < 1.5e-10)
]

# Take the region names from wherever they live. After the earlier reset_index/merge
# cell, `region` is a column and the index holds integer labels, so iterating the
# index (as before) never matched any region.
if "region" in significant_regions_df.columns:
    significant_region_names = significant_regions_df["region"]
else:
    significant_region_names = significant_regions_df.index.get_level_values("region")

# Loop-invariant masks over the association table.
assoc_region_level = region_assocs_df.index.get_level_values('region')
is_genome_wide = (region_assocs_df.P < 5e-8).values

# pd.unique: a region can appear on several rows; visit it once.
for locus in pd.unique(significant_region_names):
    snps_list.append(region_assocs_df.iloc[(assoc_region_level == locus) & is_genome_wide])
    # print(snps_list[-1])

# Sorted so that the exported file is identical across runs.
unique_snps = sorted(set(itertools.chain.from_iterable(x.SNP.values for x in snps_list)))

with open("/home/rodrigo/SNPs.txt", "wt") as snps_file:
    snps_file.write("\n".join(unique_snps))

In [None]:
# One row per SNP. drop_duplicates replaces the previous boolean mask, which was
# computed on a second, re-sorted concat and therefore depended on index alignment
# (pandas warns about reindexing such a mask, and it cannot be aligned when index
# labels repeat).
# NOTE(review): assumes CHR, BP and region are the same on every row of a given
# SNP, so it does not matter which duplicate is kept — confirm.
(
    pd.concat(snps_list)
    .drop_duplicates("SNP")
    .reset_index()
    .loc[:, ["CHR", "BP", "region", "SNP"]]
    .to_csv("/home/rodrigo/01_repos/CardiacGWAS/significant_SNPs.csv")
)

This will filter the rows for the best associations per region:

In [None]:
# Export every association with P < 1e-7. The path is relative, so the file is
# written to the current working directory.
region_assocs_df[region_assocs_df.P < 1e-7].to_csv("snp_associations_lt_1e-7.csv")

In [None]:
# Look up the association rows of each lead SNP and export one "AF" value per
# distinct AF (rows are de-duplicated on the AF column, not on the SNP).
# NOTE(review): the only assignment to `loci_mapping_df` in the cells visible here
# is commented out, so this raises NameError on a fresh kernel unless it is defined
# elsewhere — confirm.
kk = region_assocs_df.reset_index().set_index("SNP").loc[loci_mapping_df.lead_SNP.to_list()]
kk[~kk.duplicated("AF")]["AF"].to_csv("/home/rodrigo/Downloads/MAFs.txt")

In [None]:
# Columns to export, chosen by position in the original frame. They are resolved
# to names here because flattening the index below shifts column positions.
lead_snp_columns = region_assocs_df.columns[[3, 4, 5, 6]]

# idxmin() returns index *labels*. Flattening the index first gives a default
# integer index, so those labels are valid row selectors whatever index
# `region_assocs_df` carries (previously the labels were passed to .iloc as positions).
assocs_flat = region_assocs_df.reset_index()

idx_min = assocs_flat.groupby("region").P.idxmin()
# NOTE(review): `significant_regions` is not defined in the cells visible here — confirm.
idx_min = idx_min[significant_regions]

LEAD_SNPS_FILE = "/home/rodrigo/01_repos/CardiacMotionGWAS/results/snps_for_biomart__one_per_region.txt"
assocs_flat.loc[idx_min, lead_snp_columns].reset_index(drop=True).sort_values(["CHR", "BP"]).to_csv(
    LEAD_SNPS_FILE, index=False
)

___

In [None]:
# Top 20 associations for region chr1_118. `query` resolves "region" whether it is
# a column or an index level; other cells read it with index.get_level_values, in
# which case attribute access (`region_assocs_df.region`) would fail.
region_assocs_df.query('region == "chr1_118"').sort_values("P").head(20)

In [204]:
# Selection widget listing the significant regions in sorted order; it is reused as
# the default argument of the interactive functions.
regions_w = widgets.Select(
    options=sorted(results.get_significant_regions()),
)


@interact
def show_results_for_region(region=regions_w):
    """Display what `results.get_results_for_region` returns for the selected region."""
    region_results = results.get_results_for_region(region)
    display(region_results)

interactive(children=(Select(description='region', options=('chr10_74', 'chr12_17', 'chr12_2', 'chr12_23', 'ch…

In [None]:
# Map each region to its associations with P < 5e-7 (sorted by P), keeping only
# regions that have at least five such rows.
by_region_and_snp_df = {}

assoc_region_level = region_assocs_df.index.get_level_values("region")
is_below_5e7 = (region_assocs_df.P < 5e-7).values

for region in tqdm(assoc_region_level.unique()):

    df = region_assocs_df.iloc[(assoc_region_level == region) & is_below_5e7].sort_values("P")

    # print(region)
    if df.shape[0] >= 5:
        by_region_and_snp_df[region] = df

In [None]:
@interact
def get_lead_snp_count(locus=regions_w):
    """Return the 20 strongest associations (P < 5e-8) of the selected region.

    Despite the name, this returns the rows themselves, sorted by ascending P,
    rather than a count.
    """
    in_locus = region_assocs_df.index.get_level_values('region') == locus
    is_genome_wide = (region_assocs_df.P < 5e-8).values

    locus_hits = region_assocs_df.iloc[in_locus & is_genome_wide]
    return locus_hits.sort_values("P").head(20)

In [None]:
# loci_summary_df[loci_summary_df["count"] > 10].sort_index()

In [None]:
# NOTE(review): summarize_loci_across_runs reads run ids from the second element of
# each index entry and get_significant_loci reads an "artifact_uri" column, i.e. both
# expect a runs table. Passing `region_assocs_df` looks inconsistent with that —
# confirm the intended argument.
summarize_loci_across_runs(region_assocs_df)

In [None]:
### Replication results generated using this script: CardiacGWAS/replication.R

In [None]:
def _pvalue_to_latex(p_value):
    r"""Render a p-value as LaTeX scientific notation, e.g. ``$1.2 \times 10^{-5}$``.

    The value is formatted with one decimal directly, so the exponent is adjusted
    when the mantissa rounds up (9.96e-05 gives 1.0 x 10^-4, not 10.0 x 10^-5).
    """
    mantissa, exponent = f"{p_value:.1e}".split("e")
    # "\\times": in a non-raw string "\t" is a TAB, which used to corrupt the macro.
    return f"${float(mantissa)} \\times 10^{{{int(exponent)}}}$"


# Build the whole table before opening the output file, so a failure while reading
# or formatting does not leave a truncated .tex file behind.
dd = pd.read_csv("/home/rodrigo/01_repos/CardiacGWAS/results/replication_results.csv")
dd['replication p-value'] = dd['replication p-value'].apply(_pvalue_to_latex)
table_code = dd.to_latex(escape=False, index=False)
# Escape underscores for LaTeX ("\\_" yields a literal backslash + underscore).
table_code = table_code.replace("_", "\\_")

with open("/home/rodrigo/01_repos/CardiacGWAS/manuscript/tables/replication_table.tex", "wt") as repl_f:
    repl_f.write(table_code)

___

Count loci

In [None]:
# Re-fetch the loci mapping. The next cell filters it on its PCA, exclude,
# suggestive_significance and handcrafted columns.
# NOTE(review): `fetch_loci_mapping` is neither defined nor imported in the cells
# visible here — confirm where it comes from.
loci_mapping = fetch_loci_mapping()

In [None]:
# Keep loci that are not flagged "YES" in any of the three flag columns and have
# no value in `handcrafted`.
condition = (
    loci_mapping.PCA.ne("YES")
    & loci_mapping.exclude.ne("YES")
    & loci_mapping.suggestive_significance.ne("YES")
    & loci_mapping.handcrafted.isnull()
)

In [None]:
# Show the loci that satisfy `condition`.
loci_mapping[condition]