In [92]:
from pathlib import Path
import pandas as pd, json, ast

SRC = Path("annotated_coloc_results")
DST = Path("annotated_gpu-coloc_results")

def parse_nearest_genes(x):
    if pd.isna(x) or x == "": return "", ""
    if isinstance(x, str):
        for loader in (json.loads, ast.literal_eval):
            try:
                x = loader(x); break
            except Exception:
                pass
    if not isinstance(x, list): return "", ""
    symbols, ids, seen_s, seen_i = [], [], set(), set()
    for d in x:
        if not isinstance(d, dict): continue
        s, g = d.get("symbol"), d.get("gene_id")
        if s and s not in seen_s: seen_s.add(s); symbols.append(s)
        if g and g not in seen_i: seen_i.add(g); ids.append(g)
    return ",".join(symbols)

for f in SRC.rglob("*.tsv"):
    rel = f.relative_to(SRC)
    out = DST / rel
    out.parent.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(f, sep="\t")
    if "nearest_genes" in df.columns:
        pairs = df["nearest_genes"].map(parse_nearest_genes)
        df["gene_symbols"] = [a for a in pairs]
    else:
        df["gene_symbols"] = ""

    df.to_csv(out, sep="\t", index=False)


  df = pd.read_csv(f, sep="\t")
  df = pd.read_csv(f, sep="\t")
  df = pd.read_csv(f, sep="\t")


In [None]:

import pandas as pd
import os

pop_dir_tag = {
    "EstBB":"Est","UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID", "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/FinnGen_lbf_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/{key}_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    df = pd.read_csv(f"meta_colocid/FinnGen_lbf_coloc/{key}_FinnGen_lbfs_results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    df["diagnosis"] = df["signal1"].str.replace(r"_chr.*", "", regex=True)

    df.drop_duplicates()

    df.to_csv(f"annotated_coloc_results/FinnGen_lbf_coloc/{key}_FinnGen_lbf_coloc_results.tsv", sep="\t", index=False)

In [76]:
import pandas as pd
import os

pop_dir_tag = {
    "EstBB":"Est","UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID", "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/FinnGen_abf_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/{key}_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    df = pd.read_csv(f"meta_colocid/FinnGen_abf_coloc/{key}_FinnGen_abfs_results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    df["diagnosis"] = (
        df["signal1"]
        .str.replace(r"_chr.*", "", regex=True)
        .str.removeprefix("finngen_R12_")
    )
    df.drop_duplicates()

    df.to_csv(f"annotated_coloc_results/FinnGen_abf_coloc/{value}_FinnGen_abf_coloc_results.tsv", sep="\t", index=False)

In [77]:
import pandas as pd
import os

pop_dir_tag = {
    "EstBB":"Est","UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID", "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/FinnGen+MVP+UKBB_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/{key}_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    df = pd.read_csv(f"meta_colocid/FinnGen+MVP+UKBB_coloc/{key}_FinnGen+MVP+UKBB_META_results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    df["diagnosis"] = (
        df["signal1"]
        .str.replace(r"_chr.*", "", regex=True)
        .str.removeprefix("FinnGen+MVP+UKBB_")
    )

    df.drop_duplicates()

    df.to_csv(f"annotated_coloc_results/FinnGen+MVP+UKBB_coloc/{value}_FinnGen+MVP+UKBB_coloc_results.tsv", sep="\t", index=False)

In [None]:
import pandas as pd
import os

pop_dir_tag = {
    "EstBB":"EUR","UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID"#, "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/PANUKBB_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/{key}_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv")

    df = pd.read_csv(f"meta_colocid/PANUKBB_coloc/PANUKBB_{value}_{key}__results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv", usecols=["trait_type", "phenocode", "pheno_sex", "description",'description_more', "coding","modifier"])
    def _make_trait(row):
        base = f"{row['trait_type']}-{row['phenocode']}-{row['pheno_sex']}"
        if pd.notna(row.get('coding')):
            base += f"-{row['coding']}"
        if pd.notna(row.get('modifier')):
            base += f"-{row['modifier']}"
        return base
    panukbb_metadata["trait"] = panukbb_metadata.apply(_make_trait, axis=1)

    df = df[df["PP.H4"]>=0.8]
    prefix = value
    df["trait"] = df["signal1"].str.replace(r"_chr.*", "", regex=True).str.removeprefix(f"{prefix}_")
    df = df.merge(panukbb_metadata, on="trait", how="left")
    #pole optimaalne, natuke läheb kaotsi
    df.dropna(subset=["trait_type"], inplace=True)

    df.drop_duplicates(inplace=True)

    df.to_csv(f"annotated_coloc_results/PANUKBB_coloc/PANUKBB_{value}_{key}_results.tsv", sep="\t", index=False)

In [None]:
import pandas as pd
import os

pop_dir_tag = {
    "EstBB":"EstBB","UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID"#, "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/PANUKBB_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/{key}_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv")

    df = pd.read_csv(f"meta_colocid/PANUKBB_coloc/PANUKBB_META_HQ_{value}_results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv", usecols=["trait_type", "phenocode", "pheno_sex", "description",'description_more', "coding","modifier"])
    def _make_trait(row):
        base = f"{row['trait_type']}-{row['phenocode']}-{row['pheno_sex']}"
        if pd.notna(row.get('coding')):
            base += f"-{row['coding']}"
        if pd.notna(row.get('modifier')):
            base += f"-{row['modifier']}"
        return base
    panukbb_metadata["trait"] = panukbb_metadata.apply(_make_trait, axis=1)

    df = df[df["PP.H4"]>=0.8]
    prefix = "META_HQ"
    df["trait"] = df["signal1"].str.replace(r"_chr.*", "", regex=True).str.removeprefix(f"{prefix}_")
    df = df.merge(panukbb_metadata, on="trait", how="left")
    #pole optimaalne, natuke läheb kaotsi
    df.dropna(subset=["trait_type"], inplace=True)

    df.drop_duplicates(inplace=True)

    df.to_csv(f"annotated_coloc_results/PANUKBB_coloc/PANUKBB_META_HQ_{key}_results.tsv", sep="\t", index=False)

In [None]:
import pandas as pd
import os

pop_dir_tag = {
   "UKBB_AFR":"AFR","UKBB_AMR":"AMR","UKBB_CSA":"CSA",
    "UKBB_EAS":"EAS","UKBB_EUR":"EUR","UKBB_MID":"MID"#, "meta_EUR":"meta_EUR"
}

os.makedirs("annotated_coloc_results/PANUKBB_coloc", exist_ok=True)

for key, value in pop_dir_tag.items():
    metadata = pd.read_csv(f"annotated/meta_EUR_summary_annotated.tsv", sep="\t")

    metadata = metadata[["signal", "metabolite", "maf", "log10p", "nearest_genes"]].drop_duplicates()

    metadata = metadata.rename(columns={"signal":"signal2"})

    #MID_continuous-102540-both_sexes_chr7:92732610-94732002
    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv")

    df = pd.read_csv(f"meta_colocid/PANUKBB_coloc/PANUKBB_{value}_meta_EUR_results.tsv", sep="\t")

    df = df[df["PP.H4"]>=0.8]

    df = df.merge(metadata, on=("signal2"), how="left")

    print(df.shape)

    panukbb_metadata = pd.read_csv(f"PAN-UK_BB.manifest.csv", usecols=["trait_type", "phenocode", "pheno_sex", "description",'description_more', "coding","modifier"])
    def _make_trait(row):
        base = f"{row['trait_type']}-{row['phenocode']}-{row['pheno_sex']}"
        if pd.notna(row.get('coding')):
            base += f"-{row['coding']}"
        if pd.notna(row.get('modifier')):
            base += f"-{row['modifier']}"
        return base
    panukbb_metadata["trait"] = panukbb_metadata.apply(_make_trait, axis=1)

    df = df[df["PP.H4"]>=0.8]
    print(df.shape)

    prefix = value
    df["trait"] = df["signal1"].str.replace(r"_chr.*", "", regex=True).str.removeprefix(f"{prefix}_")
    df = df.merge(panukbb_metadata, on="trait", how="left")
    print(df.shape)

    df.dropna(subset=["trait_type"], inplace=True)

    df.drop_duplicates(inplace=True)

    df.to_csv(f"annotated_coloc_results/PANUKBB_coloc/PANUKBB_{value}_meta_EUR_results.tsv", sep="\t", index=False)

(3898, 10)
(3898, 10)
(3898, 18)
(832, 10)
(832, 10)
(832, 18)
(4614, 10)
(4614, 10)
(4614, 18)
(1682, 10)
(1682, 10)
(1682, 18)
(544330, 10)
(544330, 10)
(544330, 18)
(726, 10)
(726, 10)
(726, 18)
