# Analysis of multi-SNP model for young and old cohorts

In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
from statannot import add_stat_annotation
from matplotlib import pyplot as plt
from glob import glob
import os.path
from os.path import exists
from os import path
import mygene
import scipy
import pydove as dv
import sqlite3
# Notebook-wide matplotlib defaults: 5x4 inch figures rendered at 200 dpi.
import matplotlib
matplotlib.rcParams.update({"figure.figsize": (5, 4), "figure.dpi": 200})

# Root directories used by the cells below: data_dir holds the per-tissue
# r2_*.npy outputs, gtex_dir holds the GTEx v8 expression matrices.
data_dir = "/clusterfs/nilah/rkchung/data/expred/"
gtex_dir = "/clusterfs/genomicdata/GTEx/eQTL_files/"

In [None]:
# Generate mapping of ensembl id -> gene name
#
# Leaves at module level:
#   genes   -- unique versioned GTEx gene ids, as read from the expression matrices
#   ens     -- the same ids with everything after the first "." removed (aligned with genes)
#   ginfo   -- raw mygene hits (the "out" entry of the querymany result)
#   mapping -- {version-less ensembl id: gene symbol} for every hit carrying a symbol
tissues = glob(gtex_dir+"GTEx_Analysis_v8_eQTL_expression_matrices/*.bed.gz")

# Accumulate straight into a set so the same gene id seen in many tissues is
# stored once instead of building one long list with duplicates first.
gene_ids = set()
for tissue_path in tissues:
    gene_ids.update(pd.read_csv(tissue_path, usecols=['gene_id'], sep="\t")["gene_id"])
genes = list(gene_ids) # list of gtex genes - ensembl id format
print("Found %s genes" % len(genes))

mg = mygene.MyGeneInfo()
ens = [g.split(".")[0] for g in genes]
ginfo = mg.querymany(ens, scopes='ensembl.gene', returnall=True)["out"]
print(len(ginfo))

# ensg id to common name; hits without a "symbol" field are left out
mapping = {hit["query"]: hit["symbol"] for hit in ginfo if "symbol" in hit}

# mapping is keyed by the version-less id, so compare against the number of
# DISTINCT version-less ids: several versioned ids can share one base id and
# would otherwise be counted as unmapped.
print("%s Unmapped genes" % (len(set(ens))-len(mapping)))

In [None]:
# Helper function for plotting old and young heritability box/violin plots
def plot_violin(old_r2, young_r2, ax=None, show=True, zoom=False, title=None):
    """Draw Old vs. Young box plots of R^2 with a Mann-Whitney annotation.

    Despite its name, this function draws a seaborn box plot.

    Args:
        old_r2: sequence of R^2 values for the old cohort.
        young_r2: sequence of R^2 values for the young cohort.
        ax: axes handed to ``sns.boxplot``; ``None`` is passed through as is.
        show: when true, call ``plt.show()`` and then clear the current figure.
        zoom: when true, limit the y-axis to the 25th-75th percentile range
            (midpoint method) of ``young_r2``.
        title: optional axes title; skipped when ``None``.

    Returns:
        None. The plot is drawn as a side effect.
    """
    old_r2 = list(old_r2)
    young_r2 = list(young_r2)
    labels = ["Old", "Young"]

    # Build the frame column by column so the R^2 values stay numeric instead
    # of being converted to strings inside a mixed array and parsed back.
    devdf = pd.DataFrame({
        "Age": ["Old"]*len(old_r2) + ["Young"]*len(young_r2),
        "R^2": np.asarray(old_r2 + young_r2, dtype=float),
    })

    ax = sns.boxplot(x="Age", y="R^2", data=devdf, order=labels, ax=ax)
    add_stat_annotation(ax, data=devdf, x="Age", y="R^2",
                        order=labels,
                        box_pairs=[labels],
                        test='Mann-Whitney', text_format='full',
                        loc='inside', verbose=2)
    if zoom:
        # np.percentile does not need sorted input. NumPy 1.22 renamed the
        # `interpolation` keyword to `method`; try the new name first and
        # fall back for older releases.
        try:
            q1, q3 = np.percentile(young_r2, [25, 75], method='midpoint')
        except TypeError:
            q1, q3 = np.percentile(young_r2, [25, 75], interpolation='midpoint')
        ax.set_ylim([q1, q3])
    if title is not None:
        ax.set_title(title)
    if show:
        plt.show()
        plt.clf()

In [None]:
# For each tissue, plot box-plot distribution of old and young heritability
# Annotate if significant difference
#
# Leaves at module level:
#   tissue_diff      -- {tissue: [mean(young R^2 - old R^2), paired t-test p-value]}
#   old_r2, young_r2 -- the filtered values of the LAST tissue that was plotted
import scipy.stats  # older SciPy releases do not load the stats submodule on a bare `import scipy`

tissue_diff = {}
j = 0  # number of tissues plotted so far; selects the subplot
tissues = glob(gtex_dir+"GTEx_Analysis_v8_eQTL_expression_matrices/*.bed.gz")
plt.rcParams["figure.figsize"] = (25,25)
cols = rows = 5
size = rows*cols  # panels per figure
for tissue_path in tissues:
    # Read only the header row to count columns, in place of the former
    # `zcat | awk '{print NF; exit}'` shell pipeline; pandas infers gzip from
    # the .gz suffix.
    # NOTE(review): the count presumably includes the leading non-sample
    # columns (gene_id, ...), so >200 columns is a proxy for "enough
    # individuals" -- confirm the intended threshold.
    n_columns = pd.read_csv(tissue_path, sep="\t", nrows=0).shape[1]
    if n_columns <= 200:
        continue

    short = tissue_path.split("expression_matrices/")[1].split(".v8.normalized_expression")[0]
    print("Starting Tissue %s" % short)
    old_all = np.load(data_dir+"output/r2_%s_old.npy" % short)
    young_all = np.load(data_dir+"output/r2_%s_young.npy" % short)

    # Keep only entries whose R^2 is positive in BOTH cohorts, pairwise.
    pairs = [(o, y) for o, y in zip(old_all, young_all) if (o>0 and y>0)]
    if not pairs:
        # Nothing to plot or test; skip without consuming a subplot slot.
        print("Skipping Tissue %s: no entries with positive R^2 in both cohorts" % short)
        continue
    old_r2 = [o for o, _ in pairs]
    young_r2 = [y for _, y in pairs]

    if j%size == 0:
        if j > 0:
            # Flush the completed grid before starting the next one.
            plt.show()
            plt.clf()
        fig, ax = plt.subplots(rows, cols, sharex='col', sharey='row')
    a, b = divmod(j%size, cols)  # (row, column) inside the current grid

    title = "%s; $O_{R^2}$:%.3f $Y_{R^2}$:%.3f" % (short[:13], np.mean(old_r2), np.mean(young_r2))
    plot_violin(old_r2, young_r2, ax=ax[a,b], show=False, title=title)

    # Paired test: old_r2[k] and young_r2[k] come from the same position in
    # the two arrays.
    pval = scipy.stats.ttest_rel(old_r2, young_r2)[1]
    print(pval)
    tissue_diff[short] = [np.mean(np.array(young_r2)-np.array(old_r2)), pval]
    j+=1
plt.show()
plt.clf()

In [None]:
# Plot tissue-specific average difference in heritability between young and old cohorts
# Sup figure
#
# Reads tissue_diff ({tissue: [mean difference, p-value]}) and draws one
# horizontal bar per tissue, largest difference first. Tissues whose p-value
# is below 0.05 get a '**' marker.
plt.rcParams["figure.figsize"] = (5,5)
tissue_diff_list = sorted(
    ([tissue, diff, pval] for tissue, (diff, pval) in tissue_diff.items()),
    key=lambda row: row[1],
    reverse=True,
)
print(tissue_diff_list)
df = pd.DataFrame(tissue_diff_list, columns=["Tissue", "Heritability Young - Old", "P-value"])
df["Heritability Young - Old"] = df["Heritability Young - Old"].astype(float)
ax=sns.barplot(data=df, x="Heritability Young - Old", y="Tissue")

# All markers share one x position (the smallest difference); compute it once
# from the numeric column rather than rebuilding a mixed array per iteration.
marker_x = df["Heritability Young - Old"].min()
for row_idx, (_, _, pval) in enumerate(tissue_diff_list):
    if float(pval)<5e-2:
        ax.annotate('**',xy=(marker_x, row_idx), verticalalignment="center")
plt.show()
plt.clf()

In [None]:
# Mean and standard deviation of R^2 per cohort. old_r2 / young_r2 are
# whatever the per-tissue loop assigned last, i.e. a single tissue's values.
old_mean, old_std = np.mean(old_r2), np.std(old_r2)
young_mean, young_std = np.mean(young_r2), np.std(young_r2)
print("Heritability of older indivs: %s+/-%s" % (old_mean, old_std))
print("Heritability of younger indivs: %s+/-%s" % (young_mean, young_std))