# Random Allelic Expression in the Human Body
### Stephanie N. Kravitz, Aaron R. Quinlan, Christopher Gregg

### Prior to recreating figures, first import necessary libraries and set global plot aesthetics.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from matplotlib import rc

plt.rcParams['pdf.fonttype'] = 42

%matplotlib inline
%matplotlib nbagg


## Figure 2A: Percent of Females with Significant RAE KDE plot

In [2]:
## Read in data file:
df = pd.read_csv('../data/GTEX-FEMALES.v8.all-tissues.gene_info.pctiles_zscores.txt', sep='\t')

## Remove genes with data for fewer than 5 subjects:
df = df[df['total_sample_count'] >= 5]
print(df.shape)
df.head()


(22436, 11)


Unnamed: 0,CHR,GENE_ID,GENE_NAME,gene_start,gene_stop,total_sample_count,count_fdr_0p1,pctile_fdr_0p1,z_score,ens_version,status
0,1,ENSG00000228794,LINC01128,825137,859446,133,24,0.180451,0.321969,8,autosome
1,1,ENSG00000223764,RP11-54O7.3,916869,919692,173,44,0.254335,0.808534,2,autosome
2,1,ENSG00000188976,NOC2L,944581,959309,179,18,0.100559,-0.204164,10,autosome
3,1,ENSG00000187961,KLHL17,960586,965715,160,9,0.05625,-0.495959,13,autosome
4,1,ENSG00000187583,PLEKHN1,966496,975108,194,6,0.030928,-0.662719,10,autosome


In [3]:
## Make Figure:
f, ax = plt.subplots(figsize=(3,2))

# Remove X-linked genes for autosomal distribution (which contain non-XCI genes)
df_auto = df[df['status'] == 'autosome']
df_X = df[df['status'] == 'inactive']

# KDE plot:
ax = sns.kdeplot(data=df_auto, x=df_auto['pctile_fdr_0p1']*100, shade=True, common_norm=True, color='#407e9c')
ax = sns.kdeplot(data=df_X, x=df_X['pctile_fdr_0p1']*100, shade=True, common_norm=True, color='#E0B924')

ax2 = ax.twiny()
ax2 = sns.kdeplot(data=df, x=df['z_score'], visible=False)
ax2.set_xlabel("Z-Score", fontsize=10, fontweight='light')

ax.set_xlim([-5,110])
ax.set_xlabel("Percent of Females with Significant RAE", fontsize=10, fontweight='light')
ax.set_ylim([0, 0.08])
ax.set_yticks([])
ax.set_ylabel("Density", fontsize=10, fontweight='light')


<IPython.core.display.Javascript object>

Text(0, 0.5, 'Density')

## Figure 2A (inset): Sensitivity and Specificity Curves for Autosomal vs. XCI genes

In [4]:
## Filter out Imprinted genes and HLA genes:
# add column for GENE_IDs without version #:
#df_auto[['GENE_ID','ens_version']] = df_auto['GENE_ID'].str.split('.', expand=True)

# Remove HLA genes:
df_auto = df_auto[~df_auto['GENE_NAME'].str.contains("HLA")]

# Remove Imprinted Genes:
#imprinted_df = pd.read_csv("~/Documents/Gene_Lists/GTEx_Imprinting/GTEx_Imprinted_Table_S3andS4.uniq.txt", sep=' ', names=['GENE_ID', 'chrom', 'start'])
imprinted_df = pd.read_csv("../data/GTEx_Imprinted_Table_S3andS4.uniq.txt", sep=' ', names=['GENE_ID', 'chrom', 'start'])

imprinted_genes = list(imprinted_df['GENE_ID'])
imprinted_genes
df_auto = df_auto[~df_auto['GENE_ID'].isin(imprinted_genes)]

print(df_auto.shape)


(19081, 11)


In [5]:
## Get sensitivity and specificity curves: 

## Get Z-Score cut-offs, 100 values:
z_scores = np.linspace(df['z_score'].min(), df['z_score'].max(), 100, endpoint=True)

## Sensitivity = TP / TP + FN
## Specificity = TN / TN + FP 
# Likelihood Ratio = sensitivity / 1 - specificity 

true_pos_l = []
false_neg_l = []
false_pos_l = []
true_neg_l = []
sensitivity_l = []
specificity_l = []

auto_cdf = []

for z in z_scores:
    true_pos = len(df_X[df_X['z_score'].astype(float) >= z]['GENE_ID'])
    true_pos_l.append(true_pos)
    false_neg = len(df_X['GENE_ID']) - true_pos
    false_neg_l.append(false_neg)
    false_pos = len(df_auto[df_auto['z_score'] >= z]['GENE_ID'])
    false_pos_l.append(false_pos)
    true_neg = len(df_auto['GENE_ID']) - false_pos
    true_neg_l.append(true_neg)
    sensitivity = true_pos / (true_pos + false_neg)
    sensitivity_l.append(sensitivity)
    specificity = true_neg / (true_neg + false_pos)
    specificity_l.append(specificity)
    auto_cdf.append(false_pos / (true_neg + false_pos))
    

diff_vals = {}
for z, sp, sn in zip(z_scores, specificity_l, sensitivity_l):
    diff_vals[np.absolute(sn - sp)] = z

#print(diff_vals[min(diff_vals)])
#print(diff_vals)

In [6]:
## Plot sensitivity and specificity curves:

f, ax = plt.subplots(figsize=(4,3), tight_layout=True)
sns.despine()

ax = sns.lineplot(x=z_scores, y=specificity_l, color="#407e9c", label='specificity', lw=2) 
ax = sns.lineplot(x=z_scores, y=sensitivity_l, color="#E0B924", label='sensitivity', lw=2) 
ax.axvline(x=0.74, ls='--', lw=2, color="#E35E39", label='Z=0.74')

ax.set_xlabel("Z-score", fontsize=14, fontweight='light')
ax.set_yticklabels([0, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14, fontweight='light')
ax.set_xticklabels([-1, 0, 1, 2, 3, 4, 5, 6], fontsize=14, fontweight='light')

plt.legend(loc='upper right', bbox_to_anchor=(1, 0.9)) #fontsize='small', 


<IPython.core.display.Javascript object>

  ax.set_yticklabels([0, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=14, fontweight='light')
  ax.set_xticklabels([-1, 0, 1, 2, 3, 4, 5, 6], fontsize=14, fontweight='light')


<matplotlib.legend.Legend at 0x138301880>

## Figure 2B: Autosomal vs. XCI Z-Scores

In [7]:
## A. Auto vs. X-inact box-plots:

f, ax = plt.subplots(figsize=(1.5,2), tight_layout=True)
sns.despine()

frames = [df_auto['z_score'], df_X['z_score']]

# Kruskal-Wallis test + Dunn post-hoc:
kw_pval = stats.kruskal(df_auto['z_score'], df_X['z_score'], nan_policy='omit')

x_axis_vals = [0, 1]
y_vals = [6]
h, col = 0.15, 'k'
for x1, x2, y in zip(x_axis_vals[0::2], x_axis_vals[1::2], y_vals[0:]):
    ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=0.6, c=col)
    ax.text((x1+(x2-x1)/2), y+h, "%.2g" % kw_pval.pvalue, ha='center', va='bottom', color=col, fontsize=9, fontweight='light')

ax = sns.boxplot(data=frames, linewidth=0.5, whis=1, width=0.7, palette=["#46807D", "#E0B924"], showfliers=False)
ax.set_ylabel("z-score", fontsize=10, fontweight='light')
ax.set_xticklabels(['Auto.', 'X-Inact.'], fontsize=9, fontweight='light')
ax.axhline(y=.74,ls='--', lw=2, color="#E35E39")


<IPython.core.display.Javascript object>

<matplotlib.lines.Line2D at 0x1383c7760>

## Figure 2C: Percent of Biallelic, RAE, and Not Categorized Genes in Females and Males

In [8]:
## Import data files:

# All Tissues:
f_all = pd.read_csv('../data/GTEX-FEMALES.v8.all-tissues.gene_info.zscores.txt', sep='\t')
m_all = pd.read_csv('../data/GTEX-MALES.v8.all-tissues.gene_info.zscores.txt', sep='\t')

# Remove X-linked genes in females:
f_all = f_all[f_all['CHR'] != 'X']

# Remove HLA genes:
f_all = f_all[~f_all['GENE_NAME'].str.contains("HLA")]
m_all = m_all[~m_all['GENE_NAME'].str.contains("HLA")]

# Remove Imprinted Genes:
imprinted_df = pd.read_csv("~/Documents/Gene_Lists/GTEx_Imprinting/GTEx_Imprinted_Table_S3andS4.uniq.txt", sep=' ', names=['GENE_ID', 'chrom', 'start'])
imprinted_genes = list(imprinted_df['GENE_ID'])

# Remove ensembl ID version to match:
f_all[['GENE_ID','ens_version']] = f_all['GENE_ID'].str.split('.', expand=True)
m_all[['GENE_ID','ens_version']] = m_all['GENE_ID'].str.split('.', expand=True)

f_all = f_all[~f_all['GENE_ID'].isin(imprinted_genes)]
m_all = m_all[~m_all['GENE_ID'].isin(imprinted_genes)]

print(f_all.shape)
print(m_all.shape)

m_all.head()

(24261, 8)
(25353, 8)


Unnamed: 0,CHR,GENE_ID,GENE_NAME,gene_start,gene_stop,total_sample_count,z_score,ens_version
0,1,ENSG00000000938,FGR,27612063,27635277,167,-0.748123,12
1,1,ENSG00000000971,CFH,196651877,196747504,426,2.510607,15
2,6,ENSG00000001036,FUCA2,143494810,143511690,429,-0.091348,13
3,6,ENSG00000001167,NFYA,41072944,41099976,275,-0.604902,14
4,1,ENSG00000001460,STPG1,24356998,24416934,425,-0.117766,17


In [126]:
# function to subset dataframes to unique gene counts:
def subset_df(df):
    df = df[['CHR', 'GENE_ID', 'total_sample_count', 'z_score']]
    df = df[df['total_sample_count'] >= 5]
    df = df.drop_duplicates(subset='GENE_ID')
    return df 

# Subset dataframes to unique gene counts:
f_all = subset_df(df_auto)
m_all = subset_df(m_all)

In [9]:
# Get count of biallelic genes:
def biallelic_count(df):
    biallelic_count = df[df['z_score'] <= 0]['GENE_ID'].nunique()
    return biallelic_count
    
# Get count of polymorphic genes:
def polymorphic_count(df, z_score):
    polymorphic_count = df[(df['z_score'] > 0) & (df['z_score'] < z_score)]['GENE_ID'].nunique()
    return polymorphic_count
    
# Get count of frequent_rme genes:
def rae_count(df, z_score):
    rae_count = df[df['z_score'] >= z_score]['GENE_ID'].nunique()
    return rae_count

# Get count of ALL genes:
def total_count(df):
    total_count = df['GENE_ID'].nunique()
    return total_count

In [10]:
# Get barplot values (proportion of each group: biallelic, polymorphic, rae):
biallelic_all = [biallelic_count(f_all)/total_count(f_all), biallelic_count(m_all)/total_count(m_all)]
polymorphic_all = [polymorphic_count(f_all, .74)/total_count(f_all), polymorphic_count(m_all, .74)/total_count(m_all)]
rae_all = [rae_count(f_all, .74)/total_count(f_all), rae_count(m_all, .74)/total_count(m_all)]

print(biallelic_all)
print(polymorphic_all)
print(rae_all)

[0.6439141008202465, 0.626631956770402]
[0.1364329582457442, 0.14238946081331597]
[0.12509789373892255, 0.14164004259850904]


In [11]:
# Make Bar Plots:

f, ax = plt.subplots(figsize=(1.2,2), tight_layout=True)
sns.despine()

# The position of the bars on the x-axis
r = [0,1]

# Names of group and bar width
names = ['Females','Males']
x = np.arange(len(names))
barWidth = 0.9

# Create 1st Bar:
plt.bar(x, biallelic_all, color="#48817E", edgecolor='white', width=barWidth) #, label="All GTEx, Biallelic")

# Create 2nd bar (middle), on top of the first ones
plt.bar(x, polymorphic_all, bottom=biallelic_all, color='darkgrey', edgecolor='white', width=barWidth) #, label="All GTEx, Polymorphic Effect")

# Create 3rd bar (top), on top of the first two
plt.bar(x, rae_all, bottom=np.add(biallelic_all, polymorphic_all).tolist(), color="#CC6633", edgecolor='white', width=barWidth) #, label="All GTEx, Frequent Effect")

# Custom X axis
plt.xticks(x, names, fontsize=10, rotation=30, ha='right')
plt.yticks([0, .25, .5, .75, 1.00], [0, 25, 50, 75, 100], fontsize=8)



<IPython.core.display.Javascript object>

([<matplotlib.axis.YTick at 0x138fce160>,
  <matplotlib.axis.YTick at 0x138db59a0>,
  <matplotlib.axis.YTick at 0x13844d6d0>,
  <matplotlib.axis.YTick at 0x13908b8b0>,
  <matplotlib.axis.YTick at 0x1390b4100>],
 [Text(0, 0.0, '0'),
  Text(0, 0.25, '25'),
  Text(0, 0.5, '50'),
  Text(0, 0.75, '75'),
  Text(0, 1.0, '100')])

## Figure 2D: Venn Diagram of Biallelic and RAE genes in Females and Male samples

In [15]:
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted

In [12]:
## Load data:
df_female = pd.read_csv('../data/GTEX-FEMALES.v8.all-tissues.gene_info.zscores.txt', sep='\t')
df_male = pd.read_csv('../data/GTEX-MALES.v8.all-tissues.gene_info.zscores.txt', sep='\t')

# Subset genes tested in >= 5 people in male and female groups:
df_male = df_male[df_male['total_sample_count'] >= 5]
df_female = df_female[df_female['total_sample_count'] >= 5]

## Get intersection of genes tested between males and females:
genes_intersect = df_male.merge(df_female, how="inner", on='GENE_ID')['GENE_ID']

df_male = df_male[df_male['GENE_ID'].isin(genes_intersect)]
df_female = df_female[df_female['GENE_ID'].isin(genes_intersect)]

# BIALLELIC: Genes tested in at least 5 people, z-score <= 0:
# Males:
m_biallelic = df_male[df_male['z_score'] <= 0]
# Females:
f_biallelic = df_female[df_female['z_score'] <= 0]

# RAE: Genes tested in at least 5 people, z-score >= 2:
# Males:
m_rae = df_male[df_male['z_score'] >= 0.74]
# Females:
f_rae = df_female[df_female['z_score'] >= 0.74]



In [13]:
# Get total # of genes in Males and Females combined:
# Get Union of Male vs. Female gene lists:
def get_total(lst1, lst2):
    final_list = pd.concat([lst1, lst2]).nunique()
    return final_list

# Get Union of Male vs. Female gene lists:
def union(lst1, lst2):
    final_list = list(set(lst1) | set(lst2))
    return len(final_list)

# Get Intersection of Male vs. Female gene lists:
def intersection(lst1, lst2):
    return len(list(set(lst1) & set(lst2)))

# Get # of genes unique to either Males only or Females only:
def subset_uniq(lst1, lst2):
    return len(lst1) - intersection(lst1, lst2)

# Fishers Exact test to get oddratio and p-value (only return p-value)
def fishers_test(lst1, lst2):
    oddsratio, pvalue = stats.fisher_exact([[(get_total(df_male['GENE_ID'], df_female['GENE_ID']) - union(lst1, lst2)), subset_uniq(lst1, lst2)], [subset_uniq(lst2, lst1), intersection(lst1, lst2)]])
    return pvalue

In [16]:
## TODO: get dataset with filtered imprinted and HLA genes ##

# Biallelic Venn:
f, ax1 = plt.subplots(figsize=(3,3), tight_layout=True)
sns.despine()
plt.suptitle("Males vs. Females: All Tissues, Biallelic \n" + "p = {}".format(str(np.format_float_scientific(fishers_test(m_biallelic['GENE_ID'], f_biallelic['GENE_ID']), precision=2))), fontsize=8)

ax1 = venn2([set(f_biallelic['GENE_ID']), set(m_biallelic['GENE_ID'])], set_labels = ("females", "males"), set_colors=('#66CCCC', '#2E423B'), alpha = 1)

ax1.get_patch_by_id('110').set_color("#46807D")

for text in ax1.set_labels:
    text.set_fontsize(10)

for text in ax1.subset_labels:
    text.set_fontsize(16)

<IPython.core.display.Javascript object>

In [17]:
# RAE Venn:
f, ax1 = plt.subplots(figsize=(3,3), tight_layout=True)
sns.despine()
plt.suptitle("Males vs. Females: All Tissues, Frequent RAE \n" + "p = {}".format(str(np.format_float_scientific(fishers_test(m_rae['GENE_ID'], f_rae['GENE_ID']), precision=2))), fontsize=8)

ax1 = venn2([set(f_rae['GENE_ID']), set(m_rae['GENE_ID'])], set_labels = ("females", "males"), set_colors=('#FF9383', '#984322'), alpha = 1)

ax1.get_patch_by_id('110').set_color("#E35E39")

for text in ax1.set_labels:
    text.set_fontsize(10) ## male/female label text size

for text in ax1.subset_labels:
    text.set_fontsize(16) ## text size of numbers in the venn circles
    


<IPython.core.display.Javascript object>

## Figure 2I: TADs enriched for RAE and Biallelic Genes