In [5]:
from IPython.core.display import HTML
from IPython.lib.display import YouTubeVideo
from IPython.display import Image

def css_styling():
    """Load the notebook's custom stylesheet for display.

    Reads the file ``custom.css`` (relative to the current working
    directory) and wraps its text in an ``HTML`` display object.

    Returns:
        HTML: display object holding the contents of ``custom.css``.

    Raises:
        OSError: if ``custom.css`` cannot be opened.
    """
    # The context manager closes the file handle as soon as the text has been
    # read, instead of leaving it open until garbage collection.
    with open("custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()

# Intro to gene set enrichment analysis (GSEA)
### Written by Reese Richardson for use in Biol Sci 378, Winter 2022, Northwestern University (rakr@u.northwestern.edu)

Welcome back to Python! First, let's import the libraries that we'll need.

In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns

# 4.1 The fundamentals of GSEA

Consider the humble [tribble](https://en.wikipedia.org/wiki/Tribble), an organism with only 300 genes. 

Say we perform a gene expression experiment in the tribble's thymus before and after repeated tachyon pulse exposure. We have 10 replicates each in the control ('control') and treatment ('tachyon') group.

In [7]:
# Read the expression table and key every row by its gene name.
tribble_df = pd.read_csv('tribble_expression_experiment_220217.csv').set_index('gene_name')
# Capture the column labels of the table as loaded (one per sample).
sample_names = tribble_df.columns

In [8]:
# Display the expression table (one row per gene, one column per sample).
tribble_df

Unnamed: 0_level_0,control_0,control_1,control_2,control_3,control_4,control_5,control_6,control_7,control_8,control_9,tachyon_0,tachyon_1,tachyon_2,tachyon_3,tachyon_4,tachyon_5,tachyon_6,tachyon_7,tachyon_8,tachyon_9
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
XPQ5,664.698874,408.999121,456.243605,667.688929,377.236813,1014.659688,1685.969741,651.426939,1602.633604,1290.682237,112.592865,76.267275,232.462429,161.237868,103.494172,38.565015,71.084933,87.813068,147.149214,46.207763
CCO3,15.088205,23.592555,18.903970,15.179185,14.035329,55.304984,11.271889,16.034206,12.305053,11.262950,56.150492,11.909692,34.762178,26.202282,62.724094,22.364778,21.742087,26.614903,30.448945,42.879803
QJW3,18.908650,13.975928,12.075902,36.342257,12.838709,17.388032,39.347269,28.799523,23.125428,19.048399,58.576843,41.233798,41.946285,48.769722,83.059179,49.303907,45.016567,66.207978,53.916145,45.274500
EBN6,20.402094,44.604160,15.125772,41.111830,23.420009,35.232104,86.254813,137.936276,36.559704,27.163797,54.705556,35.897614,59.157124,55.043423,80.384513,42.614611,29.159306,38.294632,73.466202,31.787099
HRP2,11.797595,14.747975,6.922999,11.931877,20.730447,15.115598,12.558447,25.923035,22.902441,7.050126,221.323388,170.403353,69.209947,120.267446,119.176199,87.188052,58.144189,85.320047,204.856581,105.498654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ECA7,18022.540668,13547.763889,37731.397132,34926.324200,31657.321653,26417.159277,31850.863034,6461.423038,14014.375500,7321.074454,30010.884652,18018.552133,14729.356321,9202.711608,17010.133680,10562.769632,11720.014148,28667.467248,11884.578586,4073.747175
RST6,3082.033135,9016.571493,4747.837657,3436.169598,2448.532295,3680.932390,1157.172271,6137.881230,4602.268527,5868.060649,5400.560156,2873.010871,1416.955803,7034.156555,1859.256229,6878.963535,4386.597447,10267.250768,5699.904478,3761.187693
FAA9,54.991758,85.084498,39.424749,68.502921,46.407044,53.719819,34.833787,53.273648,48.205888,81.070673,298.427523,703.861536,316.959462,859.621810,388.539778,414.893759,266.628187,734.879757,654.941683,1242.523123
VFS4,4.675148,10.066881,15.947455,8.702472,10.972780,9.477237,17.384920,31.543625,15.916543,21.067065,5.287752,7.043179,5.640207,3.589583,3.923198,7.019026,7.564316,8.533002,7.081291,3.989328


Let's calculate the fold changes of each of these genes in the tachyon condition vs the control condition and perform a t-test to determine which genes are differentially expressed. We won't perform multiple hypothesis correction on our t-tests, as for GSEA, we will mostly be interested in just fold change. We'll visualize the results in a volcano plot.

In [None]:
# Label each sample column with its condition. The label is the part of the
# column name before the final underscore ('control_3' -> 'control'), so the
# labels always line up with the columns regardless of their order or count.
labels = np.array([name.rsplit('_', 1)[0] for name in sample_names])

# Split the expression matrix by condition (rows: genes, columns: replicates).
control_vals = tribble_df[sample_names[labels == 'control']].values
treatment_vals = tribble_df[sample_names[labels == 'tachyon']].values

# Per-gene t-test between the two conditions. The arrays are transposed so
# that the default axis=0 runs over replicates; equal_var=False requests the
# unequal-variance (Welch) form of the test.
tribble_df['ttest_p'] = scipy.stats.ttest_ind(control_vals.T, treatment_vals.T, equal_var=False).pvalue

# log2 fold change: log2(mean(tachyon) / mean(control)), one value per gene.
tribble_df['log2FC'] = np.log2(np.mean(treatment_vals, axis=1)/np.mean(control_vals, axis=1))

In [None]:
# Volcano plot: fold change on the x-axis, -log10(p-value) on the y-axis.
fig, ax = plt.subplots(figsize=(5, 5))
neg_log10_p = -np.log10(tribble_df['ttest_p'])
ax.scatter(tribble_df['log2FC'], neg_log10_p)

# Fixed axis ranges and a thin vertical guide at zero fold change.
ax.set_xlim(-6, 6)
ax.set_ylim(0, 7)
ax.axvline(x=0, color='k', linewidth=0.5)

# Larger fonts for the tick labels and axis labels.
ax.tick_params(labelsize=14)
ax.set_xlabel(r'log$_2$(FC)', fontsize=16)
ax.set_ylabel(r'-log$_{10}$(pval)', fontsize=16)

In the plot above, the genes that are measured to be higher-expressed in the tachyon condition are on the right (positive values), and the genes that are higher-expressed in the control condition are on the left (negative values).

Now, we get to our question of interest: **is the DNA repair pathway up-regulated in the tribble thymus after tachyon pulse exposure?**

I've provided the constituent members of the tribble DNA repair pathway in the variable `pathway`. Let's visualize where those genes fall on the volcano plot.

In [None]:
# Gene names making up the DNA repair pathway used throughout this lesson.
# NOTE: the list has 30 entries but some names are repeated (e.g. 'HPC1',
# 'FAA9', 'BYT7'), so it holds fewer than 30 distinct genes. Membership tests
# such as `index.isin(pathway)` mark each matching gene once.
pathway = np.array(['DDV5', 'JND7', 'CVT6', 'PVZ6', 'PMN6', 'JMI1', 'XHQ3', 'HLE8',
       'HPC1', 'OCU4', 'FAA9', 'HPC1', 'JIW3', 'AUQ3', 'HMG6', 'SCU4',
       'OEJ5', 'TET8', 'DUK9', 'WKP7', 'TCL8', 'VXN7', 'BYT7', 'FAA9',
       'NMZ5', 'BYT7', 'JOD7', 'RQP8', 'ZXT6', 'PQJ2'])

In [None]:
print(pathway)

fig, ax = plt.subplots(figsize=(5, 5))

# Background layer: every gene in the table.
ax.scatter(tribble_df['log2FC'], -np.log10(tribble_df['ttest_p']))

# Foreground layer: only the rows whose gene name appears in the pathway.
in_pathway = tribble_df.index.isin(pathway)
tribble_df_slice = tribble_df[in_pathway]
ax.scatter(tribble_df_slice['log2FC'], -np.log10(tribble_df_slice['ttest_p']), label='DNA repair pathway')

# Axis ranges, legend and a thin vertical guide at zero fold change.
ax.set_xlim(-6, 6)
ax.set_ylim(0, 8)
ax.legend(fontsize=16)
ax.axvline(x=0, color='k', linewidth=0.5)

# Larger fonts for the tick labels and axis labels.
ax.tick_params(labelsize=14)
ax.set_xlabel(r'log$_2$(FC)', fontsize=16)
ax.set_ylabel(r'-log$_{10}$(pval)', fontsize=16)

Well...maybe? Of the 30 genes that compose the DNA repair pathway, a majority of them appear to be up-regulated in the tachyon condition. However, we should search for a more quantitative answer than this. To do so, we will use a method known as [**Gene Set Enrichment Analysis**](https://www.pnas.org/content/102/43/15545).

To start, let's order our genes from highest (most positive) fold-change to lowest (most negative) fold change.

In [None]:
# Rank the genes from the most positive to the most negative log2 fold change.
tribble_df_sorted = tribble_df.sort_values(by='log2FC', ascending=False)
tribble_df_sorted

We'll also create an array of length 300 (the number of genes in our organism), that shows which genes (after the genes are sorted by fold change) appear in our pathway. We'll store this in the variable `barcode`.

In [None]:
# Gene names in ranked order (highest fold change first).
ranked_genes = tribble_df_sorted.index
# Boolean array, one entry per ranked gene: True where the gene is in the pathway.
barcode = ranked_genes.isin(pathway)
print(barcode)

We'll visualize the fold changes for each gene as a line plot and the barcode as, well, a barcode. In this barcode, a black cell means that gene is present in the pathway, whereas a white cell means that gene is not present in the pathway.

In [None]:
# Two stacked panels sharing an x-axis: fold-change curve on top (3x taller),
# barcode strip underneath.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(8,5), gridspec_kw={'height_ratios': [3, 1]})
fc_ax, barcode_ax = axes

# Top panel: ranked log2 fold changes, drawn at x positions offset by 0.5.
x_positions = np.arange(300) + 0.5
fc_ax.plot(x_positions, tribble_df_sorted['log2FC'].values)
fc_ax.axhline(y=0, color='k', linewidth=0.5)
fc_ax.set_ylabel(r'log$_2$(FC)', fontsize=16)

# Bottom panel: the barcode as a single-row image ('binary' colormap draws
# True as black and False as white).
barcode_ax.imshow(barcode[np.newaxis, :], aspect='auto', cmap='binary')
barcode_ax.set_yticks([])
barcode_ax.set_xlabel('gene rank', fontsize=16)
barcode_ax.set_xlim(0, 300)

for ax in axes:
    ax.tick_params(labelsize=14) # set the fontsize of the tick labels

From the barcode, we get the same impression we got from our volcano plot: the genes in the DNA repair pathway are mostly among the up-regulated genes. But let's get more quantitative:

Imagine that we are walking along the barcode from front to back and taking a running sum. We stop at each element of the barcode (i.e. each gene). Starting with a running sum of zero, when we encounter a gene that is in our pathway, we'll add $1/30$ to our running sum (1 over the number of genes in our pathway). When we encounter a gene that is *not* in our pathway, we'll subtract $1/270$ from our running sum (1 over the number of genes *not* in our pathway). In the code below, both counts are taken from the barcode itself rather than hard-coded, so they always reflect the number of pathway genes actually marked in the ranked list.

In [None]:
# Number of ranked genes inside and outside the gene set, read off the barcode.
n_in_pathway = barcode.sum()
n_not_in_pathway = barcode.size - n_in_pathway

# Walk the barcode from the top-ranked gene to the bottom-ranked gene. The
# stored trace begins with the starting value 0, so it has one more entry
# than there are genes.
running_sum = 0
running_sum_storage = [running_sum]
for in_gene_set in barcode:
    # A hit raises the sum by 1/n_in_pathway; a miss lowers it by 1/n_not_in_pathway.
    running_sum += (1/n_in_pathway) if in_gene_set else -(1/n_not_in_pathway)
    running_sum_storage.append(running_sum)
running_sum_storage = np.asarray(running_sum_storage)

Let's now plot `running_sum_storage` versus the barcode. This running sum is known as our **running enrichment score**.

In [None]:
# Three stacked panels sharing an x-axis: running enrichment score, ranked
# fold changes, and the barcode strip.
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(5,6), gridspec_kw={'height_ratios': [3,2, 1]})
es_ax, fc_ax, barcode_ax = axes

# Top panel: running enrichment score, with a y-range symmetric about zero
# and 10% headroom beyond the largest deviation.
y_extent = np.abs(running_sum_storage).max()*1.1
es_ax.plot(running_sum_storage, color='lime')
es_ax.axhline(y=0.0, color='k', linewidth=0.5)
es_ax.set_ylim(-y_extent, y_extent)
es_ax.set_ylabel('enrichment score', fontsize=16)

# Middle panel: ranked log2 fold changes, drawn at x positions offset by 0.5.
fc_ax.plot(np.arange(300) + 0.5, tribble_df_sorted['log2FC'].values)
fc_ax.axhline(y=0, color='k', linewidth=0.5)
fc_ax.set_ylabel(r'log$_2$(FC)', fontsize=16)

# Bottom panel: pathway membership of the ranked genes as a single-row image.
membership_row = tribble_df_sorted.index.isin(pathway)[np.newaxis, :]
barcode_ax.imshow(membership_row, aspect='auto', cmap='binary')
barcode_ax.set_yticks([])
barcode_ax.set_xlabel('gene index', fontsize=16)
barcode_ax.set_xlim(0, 300)

for ax in axes:
    ax.tick_params(labelsize=14) # set the fontsize of the tick labels

You'll see that our enrichment score trace starts at 0 and ends at 0, but peaks at some point in the middle. The point at which our running enrichment score reaches its highest deviation from zero is our pathway's overall **Enrichment Score (ES)**. For this pathway, we find an enrichment score of...

In [None]:
# Position in the trace where the running sum is farthest from zero.
peak_position = np.abs(running_sum_storage).argmax()
# The enrichment score is the signed running-sum value at that position.
es_true = running_sum_storage[peak_position]
print(es_true)

Notice that if our pathway's genes are mostly among our comparison's up-regulated genes, our enrichment score will be positive, as in the example above. However, if our pathway's genes are mostly among our comparison's down-regulated genes (such as in the hair synthesis pathway, stored in `new_pathway`), our enrichment score will be negative, as in the example below.

In [None]:
# Gene names of the hair synthesis pathway (the list contains some repeated
# names, e.g. 'XTF6' and 'VFS4').
new_pathway = np.array([
    'JQO7', 'PDX4', 'QLG5', 'RDD7', 'LZK3', 'RPV6', 'TVO1', 'ZXT6',
    'XTF6', 'NXB4', 'VFS4', 'XTF6', 'QSV2', 'JPB9', 'DBS1', 'LHE7',
    'ZAG1', 'OGU7', 'AKV4', 'SSI9', 'BNF3', 'GEW2', 'BUS2', 'VFS4',
    'KYJ7', 'BUS2', 'ZXT6', 'DJQ5', 'ERR1', 'CDG2',
])

# Membership of each ranked gene in the new pathway, and the two group sizes.
barcode = tribble_df_sorted.index.isin(new_pathway)
n_in_pathway = barcode.sum()
n_not_in_pathway = barcode.size - n_in_pathway

# Running enrichment score: starts at 0, rises on hits, falls on misses.
running_sum = 0
running_sum_storage = [running_sum]
for in_gene_set in barcode:
    running_sum += (1/n_in_pathway) if in_gene_set else -(1/n_not_in_pathway)
    running_sum_storage.append(running_sum)
running_sum_storage = np.asarray(running_sum_storage)

# Three stacked panels: enrichment score, ranked fold changes, barcode.
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(5,6), gridspec_kw={'height_ratios': [3,2, 1]})
es_ax, fc_ax, barcode_ax = axes

# y-range symmetric about zero with 10% headroom beyond the largest deviation.
y_extent = np.abs(running_sum_storage).max()*1.1
es_ax.plot(running_sum_storage, color='lime')
es_ax.axhline(y=0.0, color='k', linewidth=0.5)
es_ax.set_ylim(-y_extent, y_extent)
es_ax.set_ylabel('enrichment score', fontsize=16)

fc_ax.plot(np.arange(300) + 0.5, tribble_df_sorted['log2FC'].values)
fc_ax.axhline(y=0, color='k', linewidth=0.5)
fc_ax.set_ylabel(r'log$_2$(FC)', fontsize=16)

barcode_ax.imshow(barcode[np.newaxis, :], aspect='auto', cmap='binary')
barcode_ax.set_yticks([])
barcode_ax.set_xlabel('gene index', fontsize=16)
barcode_ax.set_xlim(0, 300)

for ax in axes:
    ax.tick_params(labelsize=14) # set the fontsize of the tick labels

For the DNA repair pathway, we find our enrichment score to be positive! Thus, you might think that our pathway is up-regulated (AKA "enriched").

Let's not warp jump to conclusions! We want to know whether or not this enrichment is of any statistical significance. Thus, we should compare this enrichment score to the distribution of enrichment scores we would expect by random chance alone (AKA the null distribution).

But how do we generate this null distribution of enrichment scores?

### 4.1.1 GSEA with phenotype permutation

Let's perform the same procedure for calculating the enrichment score, but this time, before we calculate fold changes between the two conditions, we'll randomly shuffle (or "permute") the labels on our data so that some of the true control groups are labeled as treatment groups and some of the treatment groups are labeled as control.

In [None]:
np.random.seed(1701)

# Randomly permute the sample labels, so some true controls are treated as
# 'tachyon' samples and vice versa.
new_labels = np.random.permutation(labels)

# Everything computed under the shuffled labels gets its own name, so the
# real fold changes in `tribble_df['log2FC']` and the real ranking in
# `tribble_df_sorted` are not overwritten by permuted values.
perm_control_vals = tribble_df[sample_names[new_labels == 'control']].values
perm_treatment_vals = tribble_df[sample_names[new_labels == 'tachyon']].values

# log2 fold change under the shuffled labels
# (log2(mean('tachyon'-labelled) / mean('control'-labelled))), indexed by gene.
permuted_log2FC = pd.Series(
    np.log2(np.mean(perm_treatment_vals, axis=1)/np.mean(perm_control_vals, axis=1)),
    index=tribble_df.index,
)

# Rank the genes by the permuted fold change, highest first.
permuted_sorted = permuted_log2FC.sort_values(ascending=False)

# Pathway membership along the permuted ranking.
barcode = permuted_sorted.index.isin(pathway)
n_in_pathway = np.sum(barcode)
n_not_in_pathway = len(barcode) - n_in_pathway

# Running enrichment score: +1/n_in_pathway per hit, -1/n_not_in_pathway per
# miss, accumulated along the ranking. A leading 0 is the value before any
# gene has been visited.
steps = np.where(barcode, 1/n_in_pathway, -1/n_not_in_pathway)
running_sum_storage = np.concatenate(([0.0], np.cumsum(steps)))

fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(5,6), gridspec_kw={'height_ratios': [3,2, 1]})

# Top panel: running enrichment score with a y-range symmetric about zero.
axes[0].plot(running_sum_storage, color='lime')
axes[0].axhline(0.0, color='k', linewidth=0.5)
axes[0].set_ylim([-np.max(np.abs(running_sum_storage))*1.1, np.max(np.abs(running_sum_storage))*1.1])
axes[0].set_ylabel('enrichment score', fontsize=16)
# Middle panel: ranked fold changes under the shuffled labels.
axes[1].plot(np.arange(len(permuted_sorted))+0.5, permuted_sorted.values)
axes[1].axhline(0, color='k', linewidth=0.5)
axes[1].set_ylabel(r'log$_2$(FC)', fontsize=16)
# Bottom panel: barcode of pathway membership along the permuted ranking.
axes[2].imshow(barcode.reshape(-1,1).T, aspect='auto', cmap='binary')
axes[2].set_yticks([])
axes[2].set_xlabel('gene index', fontsize=16)
axes[2].set_xlim([0,len(barcode)])

for ax in axes:
    ax.tick_params(labelsize=14) # set the fontsize of the tick labels

If we randomly permute the labels of our data (AKA the "phenotype" of each sample), we get a lower enrichment score than before. If we permute 1000 times, what distribution of enrichment scores do we get? Let's find out!

In [None]:
np.random.seed(1701)

n_permutations = 1000
es_null_storage_array = []
for permute_n in range(n_permutations):
    # Shuffle the sample labels for this permutation.
    new_labels = np.random.permutation(labels)

    # Permutation-specific names are used below so that the real fold changes
    # in `tribble_df['log2FC']` and the real ranking in `tribble_df_sorted`
    # are not overwritten by the values of the last random shuffle.
    perm_control_vals = tribble_df[sample_names[new_labels == 'control']].values
    perm_treatment_vals = tribble_df[sample_names[new_labels == 'tachyon']].values

    # log2 fold change under the shuffled labels, indexed by gene name.
    permuted_log2FC = pd.Series(
        np.log2(np.mean(perm_treatment_vals, axis=1)/np.mean(perm_control_vals, axis=1)),
        index=tribble_df.index,
    )

    # Rank genes by permuted fold change (highest first) and mark pathway members.
    permuted_sorted = permuted_log2FC.sort_values(ascending=False)
    barcode = permuted_sorted.index.isin(pathway)
    n_in_pathway = np.sum(barcode)
    n_not_in_pathway = len(barcode) - n_in_pathway

    # Running enrichment score: +1/n_in_pathway per hit, -1/n_not_in_pathway
    # per miss, accumulated along the ranking, with a leading 0 for the start.
    steps = np.where(barcode, 1/n_in_pathway, -1/n_not_in_pathway)
    running_sum_storage = np.concatenate(([0.0], np.cumsum(steps)))

    # get value of max deviation from zero
    es_null_storage_array.append(running_sum_storage[np.argmax(np.abs(running_sum_storage))])

es_null_storage_array = np.array(es_null_storage_array)

In [None]:
# Histogram (normalised to a density) of the null enrichment scores with a
# kernel density estimate overlaid. `sns.histplot` replaces the deprecated
# `sns.distplot`; `stat='density'` and `kde=True` reproduce its two layers.
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(es_null_storage_array, bins=np.arange(-1,1.01,0.1), stat='density', kde=True, ax=ax)

# Mark the observed enrichment score on top of the null distribution.
ax.axvline(es_true, color='r', label='true ES')
ax.legend(fontsize=14)
ax.set_xlabel('ES (null)', fontsize=16)
ax.set_ylabel('density', fontsize=16)
ax.tick_params(labelsize=14)

When we plot the distribution of expected enrichment scores, we find a symmetric bimodal distribution, and find that our true enrichment score (`es_true`) is well above the vast majority of null enrichment scores. Among the positive enrichment scores from the null, the probability of finding an ES greater than or equal to our true enrichment score is...

In [None]:
pos_es = es_null_storage_array[es_null_storage_array > 0] # get array of positive enrichment scores

# Compare the true ES only against null scores of the same sign: the positive
# null scores when es_true is positive, the negative ones when it is negative.
same_sign_es = pos_es if es_true >= 0 else es_null_storage_array[es_null_storage_array < 0]

if len(same_sign_es) == 0:
    # No null score shares the sign of es_true, so the fraction is undefined.
    empirical_p = np.nan
else:
    # Fraction of same-sign null scores at least as extreme as the true ES.
    empirical_p = np.sum(np.abs(same_sign_es) >= np.abs(es_true))/len(same_sign_es)
print(empirical_p)

This is our **empirical p-value**. And thus, we've determined significant enrichment! In other words, at p < 0.05, we find the DNA repair pathway to be significantly up-regulated.

The distribution of enrichment scores from the null will change based on the size of the pathway you test and the number of genes in your gene set. We can normalize our true enrichment score by dividing it by the mean of the absolute value of null enrichment scores of the same sign (i.e. if our enrichment score is negative, divide our enrichment score by the absolute value of the mean of the negative enrichment scores from the null). This quantity is known as the **Normalized Enrichment Score (NES)**. If you are performing GSEA on multiple gene sets, the NES allows you to compare which gene sets are "more enriched" than the others. The [original GSEA paper](https://www.pnas.org/content/102/43/15545) recommends normalizing each null enrichment score across all gene sets and using your computed true NES relative to this distribution to calculate q-values (i.e. to estimate the FDR). This is generally how multiple-hypothesis correction is performed in GSEA.

In [None]:
# Normalise the true ES by the mean magnitude of the null scores that share
# its sign (positive null scores for a positive ES, negative ones for a
# negative ES), so the NES keeps the sign of the ES.
same_sign_es = es_null_storage_array[np.sign(es_null_storage_array) == np.sign(es_true)]
nes = es_true/np.mean(np.abs(same_sign_es))
print(nes)

### 4.1.2 GSEA with gene permutation

When we permute our labels with $k$ replicates in our treatment condition out of $n$ samples in total, there are 

$${n \choose k}$$ 

different arrangements of labels we can make. For $k = 10$ and $n = 20$, as we've explored above, we find that 

$${n \choose k} = 184,756.$$ 

Hence, we can compute 184,756 unique null enrichment scores. However, if we had far fewer replicates, we couldn't make as many unique enrichment scores and thus would wind up with some repeated null observations when we permute a finite number of times. For instance, if $k = 3$ and $n = 7$, we find that 

$${n \choose k} = 35.$$ 

Thirty-five unique null enrichment scores is not really enough to create a suitable null distribution from which we can determine an empirical p-value!

If you find yourself in this situation, you should first consider whether or not your RNA-seq experiment is adequately statistically powered or if you need to run more replicates. Once you've decided to ignore how underpowered your RNA-seq experiment is, you can perform GSEA with gene permutation.

In this variation of GSEA, instead of permuting phenotype labels and recalculating fold-changes, we stick with the fold changes we obtain with the correct phenotype labels and instead permute the set of genes that are in our pathway (i.e. scramble our barcode). In other words, for a gene set of size $m$, each random permutation will select $m$ random genes. The cell below demonstrates this method.

In [None]:
####### Finding true ES

# label our data
labels = np.array(['control']*10 + ['tachyon']*10)

# take our control and tachyon data
control_vals = tribble_df[sample_names[labels == 'control']].values
treatment_vals = tribble_df[sample_names[labels == 'tachyon']].values

# calculate log2 fold change (log2(mean(tachyon)/mean(control)))
tribble_df['log2FC'] = np.log2(np.mean(treatment_vals, axis=1)/np.mean(control_vals, axis=1))

# rank genes from highest to lowest fold change and mark pathway members
tribble_df_sorted = tribble_df.sort_values('log2FC', ascending=False)
barcode = tribble_df_sorted.index.isin(pathway)

# Group sizes and per-gene increments. Shuffling the barcode never changes
# how many genes are hits or misses, so these are computed once and reused
# for every permutation below.
n_in_pathway = np.sum(barcode)
n_not_in_pathway = len(barcode) - n_in_pathway
hit_step = 1/n_in_pathway
miss_step = -1/n_not_in_pathway

# Running enrichment score for the true ranking: cumulative sum of the
# per-gene increments, with a leading 0 for the value before any gene.
running_sum_storage = np.concatenate(([0.0], np.cumsum(np.where(barcode, hit_step, miss_step))))

es_true = running_sum_storage[np.argmax(np.abs(running_sum_storage))]

####### Permuting genes

np.random.seed(1701)

n_permutations = 10_000
es_null_storage_array = np.empty(n_permutations)
for permute_n in range(n_permutations):

    # Scramble which ranks are occupied by pathway genes.
    new_barcode = np.random.permutation(barcode)

    # Running enrichment score of the scrambled barcode. A separate name is
    # used so the true trace in `running_sum_storage` is left intact.
    null_running_sum = np.concatenate(([0.0], np.cumsum(np.where(new_barcode, hit_step, miss_step))))

    # get value of max deviation from zero
    es_null_storage_array[permute_n] = null_running_sum[np.argmax(np.abs(null_running_sum))]

In [None]:
# Histogram (normalised to a density) of the null enrichment scores with a
# kernel density estimate overlaid. `sns.histplot` replaces the deprecated
# `sns.distplot`; `stat='density'` and `kde=True` reproduce its two layers.
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(es_null_storage_array, bins=np.arange(-1,1.01,0.1), stat='density', kde=True, ax=ax)

# Mark the observed enrichment score on top of the null distribution.
ax.axvline(es_true, color='r', label='true ES')
ax.legend(fontsize=14)
ax.set_xlabel('ES (null)', fontsize=16)
ax.set_ylabel('density', fontsize=16)
ax.tick_params(labelsize=14)

In [None]:
pos_es = es_null_storage_array[es_null_storage_array > 0] # get array of positive enrichment scores

# Use the null scores that share the sign of the true ES: the positive ones
# for a positive ES, the negative ones for a negative ES.
same_sign_es = pos_es if es_true >= 0 else es_null_storage_array[es_null_storage_array < 0]

print('Empirical p-value: ')
if len(same_sign_es) == 0:
    # No null score shares the sign of es_true, so the fraction is undefined.
    print(np.nan)
else:
    # Fraction of same-sign null scores at least as extreme as the true ES.
    print(np.sum(np.abs(same_sign_es) >= np.abs(es_true))/len(same_sign_es))

# Normalised enrichment score: true ES divided by the mean magnitude of the
# same-sign null scores, so the NES keeps the sign of the ES.
nes = es_true/np.mean(np.abs(same_sign_es))
print('NES: ')
print(nes)

Notice that our null distribution of ES values is tighter than before, leaving our true ES even further outside the realm of expectation. Not a single one of our null ES values falls above our actual ES value! Our NES is also much higher. As with many other procedures in genomics, the outcome of our GSEA is highly dependent on the exact parameterization of our procedure.

One other thing you should notice: we don't need any sample-specific data when performing GSEA with gene permutation! We only need the ranked list of fold-changes between the conditions for each gene.

# 4.2 GSEA with WebGestalt

As with over-representation analysis, you are probably using GSEA on a large body of gene ontologies and pathways to determine which pathways are enriched. This is a slog to do manually, but our friend [WebGestalt](http://www.webgestalt.org/) makes it quite easy!

WebGestalt is (currently) only capable of performing GSEA with gene permutation (also known as GSEA pre-ranked). As input, it takes a tab-delimited file with genes in one column and fold-changes in the other (file extension `.rnk`). You can find a brief tutorial on how to use WebGestalt for GSEA here **(Note 2/18/22: I have not yet recorded this lecture! This will be the same lecture I give on 2/18/22 in class)**.

Let's perform GSEA when comparing esophageal adenocarcinoma to normal esophagus. The cells below will import the expression data, as before, then create a file just showing the fold-change and gene name, which you can use as input in WebGestalt. When you generate a report, you'll notice some familiar-looking plots!

In [None]:
# Read the tab-separated differential-expression results. header=3 tells
# pandas to take the column names from row 3 (zero-based) of the file and to
# start the data on the row after it.
de_df = pd.read_csv('E-MTAB-4054-query-results.tsv', sep='\t', header=3)
# Display the loaded table.
de_df

In [None]:
# Column holding the fold change of the adenocarcinoma-vs-normal comparison.
fc_column = "'esophageal adenocarcinoma' vs 'normal' .foldChange"

# Keep gene name and fold change only, dropping rows where either is missing.
de_df_slice = de_df.loc[:, ["Gene Name", fc_column]].dropna()

# Write a two-column, tab-separated file with no header row and no index column.
de_df_slice.to_csv('eac_vs_normal_fc_0.rnk', sep='\t', header=False, index=False)

# 4.3 A few quick notes on GSEA

### 4.3.1 "Leading edge" subset

Sometimes in GSEA you will hear a set of genes referred to as the "leading edge" subset. If the overall enrichment score is positive, this is the set of genes in the pathway that appear at or before the peak of our running enrichment score plot. If the enrichment score is negative, this is the set of genes in the pathway that appear after the trough of the enrichment score plot. The leading edge subset, in theory, represents the set of genes that "drive" enrichment. The cells below identify the leading edge subset of the tribble comparison.

In [None]:
np.random.seed(1701)

# Split the expression matrix by the current sample labels.
control_vals = tribble_df[sample_names[labels == 'control']].values
treatment_vals = tribble_df[sample_names[labels == 'tachyon']].values

# log2 fold change per gene: log2(mean(tachyon) / mean(control)).
mean_ratio = np.mean(treatment_vals, axis=1)/np.mean(control_vals, axis=1)
tribble_df['log2FC'] = np.log2(mean_ratio)

# Rank genes from highest to lowest fold change and mark pathway members.
tribble_df_sorted = tribble_df.sort_values(by='log2FC', ascending=False)
barcode = tribble_df_sorted.index.isin(pathway)
n_in_pathway = barcode.sum()
n_not_in_pathway = barcode.size - n_in_pathway

# Running enrichment score: starts at 0, rises on hits, falls on misses.
running_sum = 0
running_sum_storage = [running_sum]
for in_gene_set in barcode:
    running_sum += (1/n_in_pathway) if in_gene_set else -(1/n_not_in_pathway)
    running_sum_storage.append(running_sum)
running_sum_storage = np.asarray(running_sum_storage)

# Three stacked panels: enrichment score, ranked fold changes, barcode.
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(5,6), gridspec_kw={'height_ratios': [3,2, 1]})
es_ax, fc_ax, barcode_ax = axes

# Top panel: the dashed vertical line marks where the running sum is
# farthest from zero; the y-range is symmetric with 10% headroom.
abs_trace = np.abs(running_sum_storage)
y_extent = abs_trace.max()*1.1
es_ax.plot(running_sum_storage, color='lime')
es_ax.axvline(abs_trace.argmax(), color='k', linestyle='dashed')
es_ax.axhline(y=0.0, color='k', linewidth=0.5)
es_ax.set_ylim(-y_extent, y_extent)
es_ax.set_ylabel('enrichment score', fontsize=16)

fc_ax.plot(np.arange(300) + 0.5, tribble_df_sorted['log2FC'].values)
fc_ax.axhline(y=0, color='k', linewidth=0.5)
fc_ax.set_ylabel(r'log$_2$(FC)', fontsize=16)

barcode_ax.imshow(tribble_df_sorted.index.isin(pathway)[np.newaxis, :], aspect='auto', cmap='binary')
barcode_ax.set_yticks([])
barcode_ax.set_xlabel('gene index', fontsize=16)
barcode_ax.set_xlim(0, 300)

for ax in axes:
    ax.tick_params(labelsize=14) # set the fontsize of the tick labels

In [None]:
# Position in the running-sum trace that is farthest from zero. The trace has
# one more entry than there are genes: entry 0 is the starting value, and
# entry i is the score after the first i ranked genes.
peak_position = np.argmax(np.abs(running_sum_storage))

# Zero-based rank of every gene in the sorted table.
gene_rank = np.arange(len(tribble_df_sorted))

if running_sum_storage[peak_position] >= 0:
    # Positive ES: genes visited up to and including the one that produced the peak.
    on_leading_side = gene_rank < peak_position
else:
    # Negative ES: genes that come after the trough.
    on_leading_side = gene_rank >= peak_position

# Leading-edge subset: pathway genes on the leading side, with their fold changes.
tribble_df_sorted.loc[on_leading_side & tribble_df_sorted.index.isin(pathway), 'log2FC']

### 4.3.2 Ranking non-differentially-expressed genes

Some GSEA users are uncomfortable ranking genes by fold-change, for a couple of reasons:
* Genes with higher fold-changes do not necessarily affect the behavior of gene pathways to a greater degree than genes with lower fold-changes
* Estimates of fold-change are highly subject to variation across experiments, and thus the rank of the list can be highly variable.
* Estimates of fold-change are very sensitive to outliers (while phenotype permutation accounts for this, gene permutation does not).

As a result, sometimes users will opt for a different approach. The two most common approaches are:
* Rank by fold-change after excluding all non-differentially-expressed genes (only really works if number of up-regulated and down-regulated genes are relatively well-balanced)
* Rank by -log10(p-value) * sign of the (log) fold-change. For example, a gene with a p-value of 0.05 and a log fold-change of -2.3 would have a score of `-np.log10(0.05) * (-1)` ≈ -1.3, and a gene with a p-value of 0.10 and a log fold-change of 0.5 would have a score of `-np.log10(0.1) * (1)` = 1.

I won't comment on the validity of each of these approaches (each could be most appropriate depending on the circumstances) but you will see all of them employed fairly often!

# Congratulations!
## You have completed this lesson on GSEA!

To practice the skills you've acquired thus far, complete the exercises below.

### Exercise 3.1

Perform GSEA to assess whether the set of genes concerning tribble reproduction (stored in `pathway` below) is up- or down-regulated after tachyon pulse exposure. Perform phenotype permutation and find a p-value.

In [None]:
# Gene names of the tribble reproduction gene set for this exercise.
# NOTE: some names are repeated in the list (e.g. 'OVG3', 'JMT3', 'SMH2').
# This assignment replaces the DNA repair list previously stored in `pathway`.
pathway = np.array(['JMT3', 'XVI6', 'LJW1', 'HGE8', 'IRT4', 'DXN6', 'OVG3', 'OVG3',
       'SMH2', 'DUK9', 'RNF4', 'CJY3', 'HBD9', 'ICP4', 'TAN8', 'RDC4',
       'PDX4', 'JMT3', 'QVM1', 'XBE7', 'LFM7', 'RCX2', 'ELG9', 'JYP9',
       'OIA6', 'ANG5', 'SNX2', 'XTF6', 'SOO4', 'NLW8', 'SGI6', 'SMH2',
       'RSI4', 'CJY3', 'EDR2', 'CXM8', 'RST6', 'ZWG6', 'QCX9', 'OIA6',
       'DDT5', 'ZAG1', 'HRP2', 'YDJ8', 'AZT9', 'MJS2', 'JPB9', 'LFA1',
       'UDS8', 'TPD7'])

### Exercise 3.2

The genes stored in `pathway` below represent the genes related to the Klingon defense pathway in tribbles. In the tachyon vs. control comparison, what genes in this pathway form the leading-edge subset?

In [None]:
# Gene names of the Klingon defense pathway for this exercise.
# NOTE: 'XPQ5' appears twice in the list.
pathway = np.array(['GPS8', 'KRV5', 'CCO3', 'RST6', 'PKP1', 'QSV2', 'ERR1', 
                    'XPQ5', 'XVI6', 'UWD1', 'AKV4', 'QLG5', 'PNL2', 'XDP8', 
                    'XPQ5'])

### Exercise 3.3

What pathways are the three most up-regulated and three most down-regulated (read: highest and lowest NES) in the lung during COVID-19? Rank your genes by fold-change (dropping any genes with NaN fold change) and use `Pathways > KEGG` as your functional database in your WebGestalt input. Upload the `.html` file of your report alongside your Jupyter Notebook (hit `Result Download`, and you will find the associated `.html` in the `.zip` file that downloads).

In [None]:
# Read the tab-separated differential-expression results for this exercise.
# header=3 takes the column names from row 3 (zero-based) of the file.
de_df = pd.read_csv('E-ENAD-46-query-results.tsv', sep='\t', header=3)

In [None]:
# Display the loaded table.
de_df

# Appendix:

See below for code used to create synthetic tribble data:

In [None]:
n_genes = 300
n_replicates = 10
np.random.seed(100)
alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
numeric = '123456789'

# Draw one record per gene and build the frame once at the end.
# (`DataFrame.append` was removed in pandas 2.0, and growing a frame row by
# row is quadratic.) The random draws are made in the same order as before:
# fold change, three letters, one digit, then the FDR scaling factor.
records = []
for gene in range(n_genes):
    # "True" log2 fold change of this gene.
    exp_val = np.random.normal(loc=0, scale=2)
    # Gene name: three random capital letters followed by one digit from 1-9.
    gene_name = ''.join(np.random.choice(list(alpha), 3)) + ''.join(np.random.choice(list(numeric), 1))
    # Synthetic FDR that shrinks as the magnitude of the fold change grows.
    fdr = 10**(-(np.abs(exp_val)**2)*np.random.uniform(0.05,0.25))
    records.append({'gene_name': gene_name, 'log2FC': exp_val, 'FDR': fdr})

df = pd.DataFrame(records, columns=['gene_name', 'log2FC', 'FDR'])

# Per-gene baseline expression level.
baseline = 2**np.random.normal(loc=7, scale=3.5, size=n_genes)
# Replicate-level multiplicative noise around each gene's baseline; the
# treatment replicates are additionally shifted by the gene's log2 fold change.
treatment = pd.DataFrame(2**np.random.normal(loc=df['log2FC'], scale=0.75, size=(n_replicates,n_genes)).T).mul(baseline, axis=0)
control = pd.DataFrame(2**np.random.normal(loc=0, scale=0.75, size=(n_replicates,n_genes)).T).mul(baseline, axis=0)
control.columns = ['control_' + str(x) for x in range(n_replicates)]
treatment.columns = ['tachyon_' + str(x) for x in range(n_replicates)]
labels = np.array(list(['control']*n_replicates) + list(['tachyon']*n_replicates))

# Final table: gene name followed by the control and tachyon replicate
# columns. The generating 'log2FC' and 'FDR' columns are left out.
df = pd.concat([df[['gene_name']], control, treatment], axis=1)
df

# Write to .csv (overwrite at your own risk!)
#df.to_csv('tribble_expression_experiment_220217.csv', index=False)