# Random Allelic Expression in the Human Body
### Stephanie N. Kravitz, Aaron R. Quinlan, Christopher Gregg

### Prior to recreating figures, first import necessary libraries and set global plot aesthetics.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from scipy.stats import fisher_exact

# Write text in saved PDFs with Type 42 (TrueType) fonts.
plt.rcParams['pdf.fonttype'] = 42

# NOTE(review): two backends are requested back to back; presumably only the
# later `nbagg` call takes effect -- confirm and keep just one of them.
%matplotlib inline
%matplotlib nbagg



## Figure 3A: hc-RAE Gene Ontology Enrichments

In [2]:
## Horizontal bar chart of Gene Ontology fold enrichment for the hc-RAE term subset.

df = pd.read_csv('../data/gene_ontology/RME_annotations_long-form.subset.csv', sep=',')

# Bar positions, bar lengths and the significance marker written next to each bar.
bar_positions = df['Index']
fold_enrichment = df['FoldEnrichment']
significance_marks = df['significance']

fig, axes = plt.subplots(figsize=(5,6), tight_layout=True)

# Draw the bars and title the panel.
axes.barh(bar_positions, fold_enrichment, align='center', edgecolor='black', linewidth=0.25)
axes.set_title('Fold Enrichment', fontsize=10, fontweight='light', pad=6)

# One y tick per bar; the x-axis is drawn along the top edge.
axes.set_yticks(bar_positions)
axes.xaxis.tick_top()
axes.tick_params(width=0.5, pad=0)

# Label each bar with its term description and shrink all tick labels.
axes.set_yticklabels(df['Description'])
for tick_label in axes.get_xticklabels() + axes.get_yticklabels():
    tick_label.set_fontsize(8)

# Write the significance marker just past the end of each bar.
for bar_y, bar_length, mark in zip(bar_positions, fold_enrichment, significance_marks):
    axes.text(bar_length + 0.15, bar_y - 0.5, mark)

# Adjust spacing
plt.subplots_adjust(wspace=0.05, top=0.9, bottom=0.1, left=0.5, right=0.99)


<IPython.core.display.Javascript object>

## Figure 3A: hc-Biallelic Gene Ontology Enrichments

In [3]:
## Horizontal bar chart of Gene Ontology fold enrichment for the hc-Biallelic term subset.

df = pd.read_csv('../data/gene_ontology/Biallelic_annotations_long-form.subset.csv', sep=',')

# Columns used by the plot: bar position, bar length, significance marker.
term_rows = df['Index']
enrichment_values = df['FoldEnrichment']
term_significance = df['significance']

fig, axes = plt.subplots(figsize=(5,6), tight_layout=True)

# Bars plus panel title.
axes.barh(term_rows, enrichment_values, align='center', edgecolor='black', linewidth=0.25)
axes.set_title('Fold Enrichment', fontsize=10, fontweight='light', pad=6)

# Place a y tick on every bar and move the x-axis to the top.
axes.set_yticks(term_rows)
axes.xaxis.tick_top()
axes.tick_params(width=0.5, pad=0)

# Term descriptions become the y tick labels; all tick labels use 8 pt text.
axes.set_yticklabels(df['Description'])
for text_obj in axes.get_xticklabels() + axes.get_yticklabels():
    text_obj.set_fontsize(8)

# Annotate every bar with its significance marker, slightly right of the bar end.
for row_pos, enrichment, marker in zip(term_rows, enrichment_values, term_significance):
    axes.text(enrichment + 0.15, row_pos - 0.5, marker)

# Adjust spacing
plt.subplots_adjust(wspace=0.05, top=0.9, bottom=0.1, left=0.5, right=0.99)


<IPython.core.display.Javascript object>

## Figure 3B: hc-RAE Disease Ontology Enrichment

In [165]:
## The Disease Ontology Enrichment plots were made with the R package clusterProfiler.
## R code is run from this notebook through rpy2; install it once if needed:

#%pip install rpy2

## Load the rpy2 IPython extension so that cells starting with %%R are run as R code:
%load_ext rpy2.ipython

In [169]:
%%R

## One-time installation of the R packages loaded below.
## ggplot2, cowplot and stringr are installed from CRAN:

#install.packages(c('ggplot2', 'cowplot', 'stringr'))

## DOSE, clusterProfiler (note the lowercase "c"), enrichplot and org.Hs.eg.db
## are Bioconductor packages and are installed through BiocManager:

#if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')
#BiocManager::install(c('DOSE', 'clusterProfiler', 'enrichplot', 'org.Hs.eg.db'))

## Load the following R packages:

library(DOSE)
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(cowplot)
# Set before loading org.Hs.eg.db, as in the original workflow.
# NOTE(review): presumably a workaround for a database-connection issue in some R front ends -- confirm it is still needed.
options(connectionObserver = NULL)
library("org.Hs.eg.db", character.only = TRUE)
library(stringr)


  There is a binary version available but the source version is later:
        binary source needs_compilation
ggplot2  3.3.5  3.3.6             FALSE



R[write to console]: installing the source package ‘ggplot2’


R[write to console]: trying URL 'https://cloud.r-project.org/src/contrib/ggplot2_3.3.6.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 3061989 bytes (2.9 MB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]:


The downloaded binary packages are in
	/var/folders/1t/rgnnsnln1v98rl8dvwy4q2kh0000gn/T//RtmpBKVTVH/downloaded_packages


R[write to console]: trying URL 'https://cloud.r-project.org/bin/macosx/contrib/4.0/stringr_1.4.0.tgz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 210650 bytes (205 KB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write


The downloaded binary packages are in
	/var/folders/1t/rgnnsnln1v98rl8dvwy4q2kh0000gn/T//RtmpBKVTVH/downloaded_packages


In [170]:
%%R


R[write to console]: 

R[write to console]: DOSE v3.16.0  For help: https://guangchuangyu.github.io/software/DOSE

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609


R[write to console]: clusterProfiler v3.18.1  For help: https://guangchuangyu.github.io/software/clusterProfiler

If you use clusterProfiler in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.

R[write to console]: 
Attaching package: ‘clusterProfiler’


R[write to console]: The following object is masked from ‘package:stats’:

    filter


R[write to console]: Loading required package: AnnotationDbi

R[write to console]: Loading required package: stats4

R[write to

## Figure 3C: RAE and Biallelic gene enrichments for essential and non-essential genes (Hart et al., 2017)

In [4]:
## Load the GTEx z-score gene tables (genes shared between males and females).

# Every gene in the dataset:
all_genes = pd.read_csv('../data/SharedGenes_MALES_and_FEMALES.v8.all-tissues.all-genes.0.74.no-HLA-Imprinted.txt', sep='\t')
print(all_genes.shape)

# RAE genes, tagged with their allele status:
rae_genes = pd.read_csv(
    '../data/SharedGenes_MALES_and_FEMALES.v8.all-tissues.RAE.0.74.no-HLA-Imprinted.txt', sep='\t'
).assign(allele_status='RAE')
print(rae_genes.shape)

# Biallelic genes, tagged with their allele status:
biallelic_genes = pd.read_csv(
    '../data/SharedGenes_MALES_and_FEMALES.v8.all-tissues.biallelic.0.74.no-HLA-Imprinted.txt', sep='\t'
).assign(allele_status='Biallelic')
print(biallelic_genes.shape)

all_genes.head()

(23852, 9)
(2762, 10)
(14221, 10)


Unnamed: 0,CHR,GENE_ID,GENE_NAME,gene_start,gene_stop,total_sample_count_male,z_score_male,total_sample_count_female,z_score_female
0,1,ENSG00000000938.12,FGR,27612063,27635277,167,-0.748123,98,-0.731996
1,1,ENSG00000000971.15,CFH,196651877,196747504,426,2.510607,208,2.426362
2,6,ENSG00000001036.13,FUCA2,143494810,143511690,429,-0.091348,216,-0.226136
3,6,ENSG00000001167.14,NFYA,41072944,41099976,275,-0.604902,122,-0.596496
4,1,ENSG00000001460.17,STPG1,24356998,24416934,425,-0.117766,203,-0.314898


In [5]:
## Names of every gene classified as RAE or Biallelic (unclassified genes are left out).

all_genes_list = rae_genes['GENE_NAME'].tolist() + biallelic_genes['GENE_NAME'].tolist()
print(len(all_genes_list))


16983


In [6]:
## Load the gene lists of interest (each file is read into a single GENE_NAME column).

def _read_gene_list(path):
    """Read a headerless gene-list file into a DataFrame with one 'GENE_NAME' column."""
    return pd.read_csv(path, names=['GENE_NAME'])


all_ad = _read_gene_list('../data/fig3_gene_lists/all_ad.txt')
all_ar = _read_gene_list('../data/fig3_gene_lists/all_ar.txt')
lof_tolerant = _read_gene_list('../data/fig3_gene_lists/homozygous_lof_tolerant_twohit.txt')
essential_CRISPR = _read_gene_list('../data/fig3_gene_lists/CEGv2_subset_universe.txt')
nonessential_CRISPR = _read_gene_list('../data/fig3_gene_lists/NEGv1_subset_universe.txt')
gwas_genes = _read_gene_list('../data/fig3_gene_lists/gwascatalog.txt')
essential_culture = _read_gene_list('../data/fig3_gene_lists/core_essentials_hart.txt')

# Order matters: later cells pair this list positionally with the gene-list names.
all_dfs = [all_ad, all_ar, lof_tolerant, essential_CRISPR, nonessential_CRISPR, gwas_genes, essential_culture]

print(all_ad.shape)
all_ad.head()

(709, 1)


Unnamed: 0,GENE_NAME
0,AARS
1,ABCA1
2,ABCC9
3,ACTA1
4,ACTA2


In [7]:
## Get Odds Ratios (RAE vs. Biallelic) and Fisher's exact p-values for each gene list.
## One entry is appended to every *_list below per gene list, in the order of all_dfs.

gene_list_names = ['all_ad', 'all_ar', 'lof_tolerant', 'essential_CRISPR', 'nonessential_CRISPR', 'gwas_genes', 'essential_culture']
rae_hits_list = []
rae_total_list = []
biallelic_hits_list = []
biallelic_total_list = []
OddsRatio_list = []
SE_list = []
pvals_list = []

# The denominators do not depend on the gene list, so they are computed once.
rae_total = rae_genes.shape[0]
biallelic_total = biallelic_genes.shape[0]


def _count_rae_and_biallelic(gene_list_df):
    """Count the genes of one gene list that are classified RAE and Biallelic.

    Parameters
    ----------
    gene_list_df : pandas.DataFrame
        Gene list with a 'GENE_NAME' column.

    Returns
    -------
    tuple of (int, int)
        Number of rows labelled 'RAE' and number of rows labelled 'Biallelic'
        after matching the list against the GTEx gene tables.
    """
    # Keep only list genes that are present in all_genes; this also attaches GENE_ID.
    genes_tested = gene_list_df.merge(all_genes[['GENE_ID', 'GENE_NAME']], how='inner', on='GENE_NAME')

    # Restrict to genes that were called either RAE or Biallelic.
    genes_tested = genes_tested[genes_tested['GENE_NAME'].isin(all_genes_list)]

    # Attach the RAE label, then the Biallelic label (lands in 'allele_status_x').
    genes_tested = genes_tested.merge(rae_genes[['GENE_ID', 'GENE_NAME', 'allele_status']], how='left', on=['GENE_ID', 'GENE_NAME'])
    genes_tested = genes_tested.merge(biallelic_genes[['GENE_ID', 'allele_status']], how='left', on='GENE_ID', suffixes=('', '_x'))

    # Collapse both label columns into one: RAE where present, otherwise Biallelic.
    allele_status = genes_tested['allele_status'].fillna(genes_tested['allele_status_x'])

    return int((allele_status == 'RAE').sum()), int((allele_status == 'Biallelic').sum())


# A dedicated loop variable is used so the notebook-level `df` is not overwritten.
for gene_list_df in all_dfs:
    rae_count, biallelic_count = _count_rae_and_biallelic(gene_list_df)

    # 2x2 table: rows = RAE / Biallelic, columns = in gene list / not in gene list.
    table = np.array([[rae_count, (rae_total - rae_count)], [biallelic_count, (biallelic_total - biallelic_count)]])
    n1, n2 = table.sum(axis=1)
    prop1 = table[0,0] / n1
    prop2 = table[1,0] / n2

    ## Calculate Odds Ratio and p-value
    oddsr, p = fisher_exact(table)
    # NOTE: this is the unpooled standard error of the difference between the two
    # proportions (prop1 - prop2); it is not the standard error of the log odds ratio.
    se = np.sqrt(prop1*(1 - prop1)/n1 + prop2*(1 - prop2)/n2)
    print("OddsRatio:", oddsr, "log(OddsRatio):", np.log(oddsr), "SE:", se, "Fisher's p-val:", p)

    rae_hits_list.append(rae_count)
    rae_total_list.append(rae_total)
    biallelic_hits_list.append(biallelic_count)
    biallelic_total_list.append(biallelic_total)
    OddsRatio_list.append(oddsr)
    SE_list.append(se)
    pvals_list.append(p)


OddsRatio: 2.1034055889742005 log(OddsRatio): 0.7435577402145522 SE: 0.0042670258065385415 Fisher's p-val: 2.0618397809949288e-11
OddsRatio: 2.044768713990412 log(OddsRatio): 0.7152846848054738 SE: 0.005388470606860344 Fisher's p-val: 1.7255659718197603e-16
OddsRatio: 2.8662051526886283 log(OddsRatio): 1.0529889083823953 SE: 0.003071874314616458 Fisher's p-val: 5.985855913976909e-11
OddsRatio: 0.6161094828385862 log(OddsRatio): -0.484330599360235 SE: 0.0030396262350489076 Fisher's p-val: 0.0004902357096751242
OddsRatio: 1.162357093257164 log(OddsRatio): 0.1504499203913255 SE: 0.0018683114869534126 Fisher's p-val: 0.5423289933670651
OddsRatio: 1.8871341911273212 log(OddsRatio): 0.6350593773329678 SE: 0.009757988296566891 Fisher's p-val: 3.137078140680277e-44
OddsRatio: 0.9955691701766862 log(OddsRatio): -0.004440675042198431 SE: 0.0023266782730642786 Fisher's p-val: 1.0


In [8]:
## Assemble the per-list enrichment results into one table indexed by gene list.

result_columns = ['Gene_List', 'RAE_hits', 'RAE_total', 'Biallelic_hits', 'Biallelic_total', 'OddsRatio', 'SE', 'Fishers_p']
result_rows = list(zip(
    gene_list_names,
    rae_hits_list,
    rae_total_list,
    biallelic_hits_list,
    biallelic_total_list,
    OddsRatio_list,
    SE_list,
    pvals_list,
))

results = (
    pd.DataFrame(result_rows, columns=result_columns)
    # Fraction of all RAE (or Biallelic) genes that fall in each gene list.
    .assign(
        RAE_ratio=lambda tbl: tbl['RAE_hits'] / tbl['RAE_total'],
        Biallelic_ratio=lambda tbl: tbl['Biallelic_hits'] / tbl['Biallelic_total'],
    )
    .set_index('Gene_List')
)
results


Unnamed: 0_level_0,RAE_hits,RAE_total,Biallelic_hits,Biallelic_total,OddsRatio,SE,Fishers_p,RAE_ratio,Biallelic_ratio
Gene_List,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
all_ad,133,2762,334,14221,2.103406,0.004267,2.06184e-11,0.048154,0.023486
all_ar,218,2762,572,14221,2.044769,0.005388,1.725566e-16,0.078928,0.040222
lof_tolerant,69,2762,126,14221,2.866205,0.003072,5.985856e-11,0.024982,0.00886
essential_CRISPR,55,2762,454,14221,0.616109,0.00304,0.0004902357,0.019913,0.031925
nonessential_CRISPR,23,2762,102,14221,1.162357,0.001868,0.542329,0.008327,0.007172
gwas_genes,981,2762,3213,14221,1.887134,0.009758,3.1370779999999996e-44,0.355177,0.225933
essential_culture,35,2762,181,14221,0.995569,0.002327,1.0,0.012672,0.012728


In [9]:
## Figure 3C: log odds ratios and Fisher p-values for essential and non-essential genes.

f, ax = plt.subplots(figsize=(1.6, 3), tight_layout=True)
sns.despine()

# Rows of the results table shown in this panel:
plot_3C = results.loc[['essential_CRISPR', 'nonessential_CRISPR']]

# One bar per gene list, height = log(odds ratio):
log_odds_3C = np.log(plot_3C['OddsRatio'])
ax = sns.barplot(x=plot_3C.index, y=log_odds_3C, palette=["#46807D", "#F69284"])

# Write each p-value at a fixed height above its bar (bar i sits at x = i):
col = 'k'
label_heights = [0.01, 0.15]
for bar_x, (label_y, pval) in enumerate(zip(label_heights, plot_3C['Fishers_p'])):
    ax.text(bar_x, label_y, "%.1e" % pval, ha='center', va='bottom', color=col, fontsize=8)

# Zero line, axis labels and limits:
ax.axhline(y=0, lw=1, color='k')
ax.set_ylim([-1, 1])
ax.set_xlabel("", fontsize=9, fontweight='light')
ax.set_ylabel("log(Odds Ratio)", fontsize=9, fontweight='light')
ax.set_xticklabels(labels=['essential', 'nonessential'], rotation=45, fontsize=9, ha='right')


<IPython.core.display.Javascript object>

[Text(0, 0, 'essential'), Text(1, 0, 'nonessential')]

## Figure 3D: RAE and Biallelic gene enrichments for LoF tolerant genes (Consortium et al., 2016)

In [10]:
## Figure 3D: log odds ratio and Fisher p-value for LoF tolerant genes.

f, ax = plt.subplots(figsize=(1.4, 3), tight_layout=True)
sns.despine()

# Single row of the results table shown in this panel:
plot_3D = results.loc[['lof_tolerant']]

# Bar height = log(odds ratio):
log_odds_3D = np.log(plot_3D['OddsRatio'])
ax = sns.barplot(x=plot_3D.index, y=log_odds_3D, color="#973A1E")

# Write the p-value at a fixed height above the bar (the only bar sits at x = 0):
col = 'k'
label_heights = [1.05]
for bar_x, (label_y, pval) in enumerate(zip(label_heights, plot_3D['Fishers_p'])):
    ax.text(bar_x, label_y, "%.1e" % pval, ha='center', va='bottom', color=col, fontsize=8)

# Zero line, axis labels and limits:
ax.axhline(y=0, lw=1, color='k')
ax.set_ylim([-1.2, 1.2])
ax.set_xlabel("", fontsize=9, fontweight='light')
ax.set_ylabel("log(Odds Ratio)", fontsize=9, fontweight='light')
ax.set_xticklabels(labels=['LoF tolerant'], rotation=45, fontsize=9, ha='right')


<IPython.core.display.Javascript object>

[Text(0, 0, 'LoF tolerant')]

## Figure 3E: pLI scores for RAE and Biallelic genes (Karczewski et al., 2020)

In [11]:
## Load pLI data:
pLI_df = pd.read_csv('../data/GTEX-ALL_SAMPLES.v8.all-tissues.gnomAD-pLI.txt', sep='\t')

# Remove genes with no pLI score. The explicit copy makes the filtered frame
# independent of the one read from disk, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
pLI_df = pLI_df[pLI_df['pLI'].notna()].copy()

## Add RAE vs. Biallelic status for each gene:
##   z_score >= 0.74 -> 'RAE', z_score <= 0 -> 'Biallelic', anything else -> 'Unknown'
## (the two conditions cannot both hold, so their order does not matter).
z_scores = pLI_df['z_score']
pLI_df['allele_status'] = np.select(
    [z_scores >= 0.74, z_scores <= 0],
    ['RAE', 'Biallelic'],
    default='Unknown',
)
pLI_df.head()



Unnamed: 0,#CHR,gene_start,gene_stop,GENE_ID,GENE_NAME,z_score,strand,gene_id,pLI,exac_pLI,allele_status
1,chr1,960586,965715,ENSG00000187961,KLHL17,-0.399756,+,ENSG00000187961,1.3052e-16,2.5168e-07,Biallelic
2,chr1,966496,975108,ENSG00000187583,PLEKHN1,-0.325026,+,ENSG00000187583,1.1549e-14,2.0197e-08,Biallelic
4,chr1,1399521,1401907,ENSG00000224870,RP4-758J18.2,-0.647562,+,ENSG00000224870,0.011799,,Biallelic
5,chr1,1512150,1534687,ENSG00000197785,ATAD3A,-0.653659,+,ENSG00000197785,4.0861e-09,0.040131,Biallelic
8,chr1,2050469,2175313,ENSG00000067606,PRKCZ,0.170864,+,ENSG00000067606,0.541,0.26692,Unknown


In [136]:
## Plot pLI scores of RAE vs. Biallelic genes:

f, ax = plt.subplots(figsize=(1.5,3), tight_layout=True)
sns.despine()

# Left-to-right order of the boxes. Passing it to seaborn explicitly guarantees
# that the boxes, the palette colours and the tick labels below always agree,
# instead of depending on which status happens to appear first in the data.
status_order = ['Biallelic', 'RAE']

# Kruskal-Wallis test (kw_pval is the full test result; .pvalue is what gets plotted):
kw_pval = stats.kruskal(
    pLI_df[pLI_df['allele_status'] == 'RAE']['pLI'],
    pLI_df[pLI_df['allele_status'] == 'Biallelic']['pLI'],
    nan_policy='omit',
)

# Draw one bracket spanning both boxes (x = 0 to x = 1) with the p-value on top:
x1, x2, y = 0, 1, 1.05
h, col = 0.01, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=0.6, c=col)
ax.text((x1+(x2-x1)/2), y+h, "%.2g" % kw_pval.pvalue, ha='center', va='bottom', color=col, fontsize=9, fontweight='light')

# Make plot and format axes (genes with 'Unknown' status are not shown):
ax = sns.boxplot(
    x='allele_status', y='pLI',
    data=pLI_df[pLI_df['allele_status'] != 'Unknown'],
    order=status_order,
    linewidth=0.5, whis=1, width=0.7, showfliers=False,
    palette=["#46807D","#E35E39"],
    ax=ax,
)
ax.set_ylabel("LoF intolerance (pLI)", fontsize=10, fontweight='light')
ax.set_xlabel("", fontsize=10, fontweight='light')
ax.set_xticklabels(status_order, fontsize=9, fontweight='light', rotation=45, ha='right')



<IPython.core.display.Javascript object>

[Text(0, 0, 'Biallelic'), Text(1, 0, 'RAE')]

## Figure 3F: phastCons Conservation Scores for RAE and Biallelic genes

In [12]:
## Load data:

df_Cons = pd.read_csv('../data/GTEX-ALL_SAMPLES.v8.all-tissues.gene_info.phastCons_means.bed', sep='\t', names=['CHR', 'start', 'stop', 'GENE_ID', 'GENE_NAME', 'z_score', 'strand', 'phastCons_mean'])

## Remove rows with no phastCons score (marked by ".").
## The explicit copy makes the filtered frame independent of the one read from
## disk, so the column assignments below do not trigger SettingWithCopyWarning.
df_Cons = df_Cons[df_Cons['phastCons_mean'] != "."].copy()

## Make sure phastCons_mean is a float
df_Cons['phastCons_mean'] = df_Cons['phastCons_mean'].astype(float)

## Add RAE vs. Biallelic status for each gene:
##   z_score >= 0.74 -> 'RAE', z_score <= 0 -> 'Biallelic', anything else -> 'Unknown'
## (the two conditions cannot both hold, so their order does not matter).
z_scores = df_Cons['z_score']
df_Cons['allele_status'] = np.select(
    [z_scores >= 0.74, z_scores <= 0],
    ['RAE', 'Biallelic'],
    default='Unknown',
)
df_Cons.head()



Unnamed: 0,CHR,start,stop,GENE_ID,GENE_NAME,z_score,strand,phastCons_mean,allele_status
0,chr1,135140,135895,ENSG00000268903.1,RP11-34P13.15,4.995202,-,0.002935,RAE
1,chr1,257863,297502,ENSG00000228463.9,AP006222.2,-0.881208,-,0.157151,Biallelic
2,chr1,585988,827796,ENSG00000230021.8,RP5-857K21.4,1.569513,-,0.145384,RAE
3,chr1,632756,633438,ENSG00000229344.1,MTCO2P12,-0.365989,+,0.015736,Biallelic
4,chr1,634376,634922,ENSG00000198744.5,MTCO3P12,-0.881208,+,0.012319,Biallelic


In [13]:
## Keep only protein-coding genes for the phastCons analysis.

## TODO: make this file just column of gene names, not other stuff ##
protein_coding = pd.read_csv('../data/GTEX-ALL_SAMPLES.v8.all-tissues.gene_info.zscores.proteincoding.final.txt', sep='\t')

# IDs of the protein-coding genes, used as the membership filter below.
protein_coding_genes = np.array(protein_coding['GENE_ID'])

is_protein_coding = df_Cons['GENE_ID'].isin(protein_coding_genes)
df_Cons = df_Cons.loc[is_protein_coding]


In [14]:
## Plot mean phastCons scores of RAE vs. Biallelic genes:

f, ax = plt.subplots(figsize=(1.5,2.5), tight_layout=True)
sns.despine()

# Left-to-right order of the boxes. Without an explicit order seaborn places the
# categories in order of first appearance in the data, so the hard-coded tick
# labels could end up on the wrong boxes. Passing the order keeps the boxes,
# the palette colours and the tick labels in agreement.
status_order = ['Biallelic', 'RAE']

# Kruskal-Wallis test (kw_pval is the full test result; .pvalue is what gets plotted):
kw_pval = stats.kruskal(
    df_Cons[df_Cons['allele_status'] == 'RAE']['phastCons_mean'],
    df_Cons[df_Cons['allele_status'] == 'Biallelic']['phastCons_mean'],
    nan_policy='omit',
)

# Draw one bracket spanning both boxes (x = 0 to x = 1) with the p-value on top:
x1, x2, y = 0, 1, 0.35
h, col = 0.01, 'k'
ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=0.6, c=col)
ax.text((x1+(x2-x1)/2), y+h, "%.2g" % kw_pval.pvalue, ha='center', va='bottom', color=col, fontsize=9, fontweight='light')

# Make plot and format axes (genes with 'Unknown' status are not shown):
ax = sns.boxplot(
    x='allele_status', y='phastCons_mean',
    data=df_Cons[df_Cons['allele_status'] != 'Unknown'],
    order=status_order,
    linewidth=0.5, whis=1, width=0.7, showfliers=False,
    palette=["#46807D","#E35E39"],
    ax=ax,
)
ax.set_ylabel("Gene Conservation\nScore (phastCons)", fontsize=10, fontweight='light')
ax.set_xlabel("", fontsize=10, fontweight='light')
ax.set_xticklabels(status_order, fontsize=9, fontweight='light', rotation=45, ha='right')



<IPython.core.display.Javascript object>

[Text(0, 0, 'Biallelic'), Text(1, 0, 'RAE')]

## Figure 3G: RAE and Biallelic genes nearest to human Accelerated Regions (hARs)

In [15]:
## Load the human-AR / gene overlap table (genes are padded by +/- 100kb on each side).

# The file has no header row; names are supplied here and the unused columns discarded.
columns = ['AR_chrom', 'AR_start', 'AR_stop', 'drop1', 'drop2', 'CHR', 'start_100kb', 'stop_100kb', 'hg19_start', 'hg19_stop', 'GENE_ID', 'GENE_NAME', 'z_score', 'mappability', 'allele_status', 'bp_overlap']
unused_columns = ['drop1', 'drop2', 'bp_overlap']

df_hARs = pd.read_csv(
    '../data/HumanRegionsAutosomes_QV10_GTExGenes.100kb.bed', sep='\t', names=columns
).drop(columns=unused_columns)
df_hARs.head()


Unnamed: 0,AR_chrom,AR_start,AR_stop,CHR,start_100kb,stop_100kb,hg19_start,hg19_stop,GENE_ID,GENE_NAME,z_score,mappability,allele_status
0,chr1,136083,136132,chr1,35803,235895,135803,135895,ENSG00000268903,RP11-34P13.15,4.9952019192892,0.5,RME
1,chr1,136083,136132,chr1,127615,367253,227615,267253,ENSG00000228463,AP006222.2,-0.8812075992518491,0.4444439999999999,Biallelic
2,chr1,1462871,1462920,chr1,1347531,1570067,1447531,1470067,ENSG00000197785,ATAD3A,-0.6536590151092356,0.527778,Biallelic
3,chr1,1462871,1462920,chr1,1270241,1478262,1370241,1378262,ENSG00000179403,VWA1,0.1476779382929402,0.6805555,Unknown
4,chr1,1462871,1462920,chr1,1451259,1665990,1551259,1565990,ENSG00000197530,MIB2,-0.1136368821054362,0.694444,Biallelic


In [16]:
## Count the unique genes (by GENE_ID) of each allele status that appear in the AR table,
## plus the number of rows whose allele status is '.'.
## Note: RAE genes are labelled 'RME' in this file.
## the total number of ARs is 1908

row_status = df_hARs['allele_status']

biallelic_ARs = df_hARs.loc[row_status == 'Biallelic', 'GENE_ID'].nunique()
rae_ARs = df_hARs.loc[row_status == 'RME', 'GENE_ID'].nunique()
unknown_ARs = df_hARs.loc[row_status == 'Unknown', 'GENE_ID'].nunique()
# Rows, not unique genes, are counted for the '.' status.
empty_ARs = int((row_status == '.').sum())

print(biallelic_ARs, rae_ARs, unknown_ARs, empty_ARs)

2418 544 565 390


In [17]:
## Fisher's exact test (one-sided, 'greater') for RAE vs. Biallelic genes near human ARs:
## Note: numbers will be slightly different from other analyses because 1087 were not converted to hg19 coordinates
#             RAE    Biallelic
# in_AR       544    2418
# not_in_AR   2651   13311
# total       3195   15729

# The table is passed with rows = RAE / Biallelic and columns = in_AR / not_in_AR,
# i.e. the transpose of the layout sketched above.
hAR_table = [[544, 2651], [2418, 13311]]

# RAE vs. Biallelic: run the test once and keep both the odds ratio and the p-value.
rae_OddsRatio, fisher_pval = fisher_exact(hAR_table, alternative='greater')

f, ax = plt.subplots(figsize=(1, 2), tight_layout=True)
sns.despine()

# Make barplot (bar height = log odds ratio):
ax = sns.barplot(x=['human ARs'], y=np.log([rae_OddsRatio]), color="#F69284", ax=ax)

# Plot p-value above the bar (the only bar sits at x = 0):
col = 'k'
ax.text(0, 0.13, "%.1e" % fisher_pval, ha='center', va='bottom', color=col, fontsize=8)

# Formatting:
ax.axhline(y=0, lw=1, color='k')
ax.set_xticklabels(['human ARs'], fontsize=10, fontweight='light', rotation=45, ha='right')
ax.set_ylabel("human ARs\nlog(Odds Ratio)", fontsize=10, fontweight='light')
ax.set_xlabel("", fontsize=9, fontweight='light')
ax.set_ylim([-1, 1])



<IPython.core.display.Javascript object>

(-1.0, 1.0)

## Figure 3H: RAE vs. Biallelic gene enrichment for genes nearest to GWAS hits 

In [18]:
## Make figure of Odds Ratios and p-vals for GWAS genes:

f, ax = plt.subplots(figsize=(1.3, 3), tight_layout=True)
sns.despine()

# Subset data to plot:
plot_3H = results.loc[['gwas_genes']]

# log(odds ratio) is both the bar height and the height of the p-value label,
# so it is computed once and reused.
log_odds_3H = np.log(plot_3H['OddsRatio'])

# Plot bars (drawn explicitly on the axes created above):
ax = sns.barplot(x=plot_3H.index, y=log_odds_3H, color="#E55D39", ax=ax)

# Plot p-values directly on top of the bars (bar i sits at x = i):
col = 'k'
for x1, (y, p) in enumerate(zip(log_odds_3H, plot_3H['Fishers_p'])):
    ax.text(x1, y, "%.1e" % p, ha='center', va='bottom', color=col, fontsize=8)

# Formatting:
ax.axhline(y=0, lw=0.5, color='k')
ax.set_ylabel("log(Odds Ratio)", fontsize=9, fontweight='light')
ax.set_xlabel("")
ax.set_ylim([-1, 1])
ax.set_xticklabels(labels=['GWAS genes'], rotation=45, fontsize=9, ha='right')


<IPython.core.display.Javascript object>

[Text(0, 0, 'GWAS genes')]

## Figure 3I: RAE vs. Biallelic gene enrichment for OMIM disease genes with autosomal dominant or recessive modes of inheritance

In [19]:
## Make figure of Odds Ratios and p-vals for the autosomal dominant ('all_ad')
## and autosomal recessive ('all_ar') disease gene lists:

f, ax = plt.subplots(figsize=(1.6, 3), tight_layout=True)
sns.despine()

# Subset data to plot:
plot_3I = results.loc[['all_ad', 'all_ar']]

# log(odds ratio) is both the bar height and the height of the p-value label,
# so it is computed once and reused.
log_odds_3I = np.log(plot_3I['OddsRatio'])

# Plot bars (drawn explicitly on the axes created above):
ax = sns.barplot(x=plot_3I.index, y=log_odds_3I, palette=["#E55235", "#E55D39"], ax=ax)

# Plot p-values directly on top of the bars (bar i sits at x = i):
col = 'k'
for x1, (y, p) in enumerate(zip(log_odds_3I, plot_3I['Fishers_p'])):
    ax.text(x1, y, "%.1e" % p, ha='center', va='bottom', color=col, fontsize=7)

# Formatting:
ax.axhline(y=0, lw=0.5, color='k')
ax.set_ylabel("log(Odds Ratio)", fontsize=9, fontweight='light')
ax.set_xlabel("")
ax.set_ylim([-1, 1])
ax.set_xticklabels(labels=['dominant', 'recessive'], rotation=45, fontsize=9, ha='right')


<IPython.core.display.Javascript object>

[Text(0, 0, 'dominant'), Text(1, 0, 'recessive')]