Plot GSEA analysis

In [None]:
# python modules
import sys
import copy
import random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import scipy
from scipy import stats

In [None]:
# custom modules
import graphics_edit as graphics
import load_file_edit as load_file
import Interval_dict
import statis_edit as statis

In [None]:
# matplotlib setting
%matplotlib inline
mpl.rcParams["figure.facecolor"] = "white"
mpl.rcParams["axes.facecolor"] = "white"
mpl.rcParams["savefig.facecolor"] = "white"

In [None]:
def select_best (gs_list, num):
    """Return the `num` gene-sets with the largest absolute NES.

    Parameters
    ----------
    gs_list : iterable of dict
        Gene-set records; each must provide a numeric value under 'nes'.
    num : int
        Maximum number of gene-sets to return.

    Returns
    -------
    list of dict
        Gene-sets ordered by decreasing |nes|. Gene-sets with equal |nes|
        keep their relative order from `gs_list`.
    """
    # Sort on |nes| only. Sorting (|nes|, gs) tuples would fall back to
    # comparing the gene-set dicts on ties, which raises TypeError in
    # Python 3.
    ranked = sorted(gs_list, key=lambda gs: abs(gs['nes']), reverse=True)
    return ranked[:num]


In [None]:
### compute the Cohen's kappa of all gene-set pairs
def get_kappa_matrix (gs_list,
                      gene_list=None):
    """Build the pairwise kappa matrix of a list of gene-sets.

    Each gene-set is turned into a 0/1 membership vector over `gene_list`
    and every pair of vectors is scored with `statis.get_kappa`
    (presumably Cohen's kappa, per the header comment -- confirm in statis).

    Parameters
    ----------
    gs_list : sequence of dict
        Gene-set records; each must hold a mapping under 'genes' whose keys
        are the member genes.
    gene_list : sequence, optional
        Genes to consider. When omitted, the union of all genes found in
        `gs_list` is used (in arbitrary order).

    Returns
    -------
    gene_list : sequence
        The genes the membership vectors were built over.
    kappa_matrix : numpy.ndarray
        Symmetric (len(gs_list), len(gs_list)) array of kappa values.
    """
    # make union gene_list to be considered
    # `is None` (not `== None`) so that an array passed as gene_list is not
    # compared element-wise.
    if gene_list is None:
        gene_union = set()
        for gs in gs_list:
            gene_union |= set(gs['genes'].keys())
        gene_list = list(gene_union)

    # make binary vector for each gene-set (1 = gene belongs to the set)
    idx_bvec = [[1 if gene in gs['genes'] else 0 for gene in gene_list]
                for gs in gs_list]

    # make kappa matrix; only the upper triangle is computed and mirrored
    size = len(gs_list)
    kappa_matrix = np.full((size, size), np.nan)
    for i in range(size):
        for j in range(i, size):
            kappa = statis.get_kappa(idx_bvec[i], idx_bvec[j])
            kappa_matrix[i][j] = kappa
            kappa_matrix[j][i] = kappa

    return gene_list, kappa_matrix

In [None]:
# convert kappa matrix to distance matrix
def kappa_to_dist (kappa_matrix, scale=1.0):
    """Convert a kappa (agreement) matrix into a distance matrix.

    Off-diagonal entries become scale * (1 - kappa); diagonal entries are
    always set to 0, whatever the kappa value stored there.

    Parameters
    ----------
    kappa_matrix : numpy.ndarray
        Two-dimensional array of kappa values.
    scale : float, optional
        Multiplier applied to (1 - kappa).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as `kappa_matrix`.
    """
    # Unpacking the shape also rejects anything that is not two-dimensional.
    n_rows, n_cols = kappa_matrix.shape
    dist_matrix = np.zeros((n_rows, n_cols))
    dist_matrix[:, :] = scale * (1.0 - kappa_matrix)
    # Every item is at distance zero from itself.
    np.fill_diagonal(dist_matrix, 0)
    return dist_matrix

In [None]:
# clean gene-set name
def clean_gsname(gsname, capital_words=None):
    """Turn a raw gene-set identifier into a readable label.

    The text before the first underscore is dropped, the remaining
    underscore-separated words are lower-cased (except those listed in
    `capital_words`), the first letter of the label is capitalised and the
    words are joined with spaces.

    Parameters
    ----------
    gsname : str
        Raw gene-set name, e.g. 'GOBP_REGULATION_OF_B_CELL_ACTIVATION'.
    capital_words : collection of str, optional
        Words that must keep their upper-case spelling. Defaults to the
        built-in list of acronyms below.

    Returns
    -------
    str
        The cleaned label. If nothing follows the first underscore (or the
        name has no underscore at all), `gsname` is returned unchanged
        instead of failing on an empty word list.
    """
    if capital_words is None:
        capital_words = ['B', 'STAT', 'H3K4ME2', 'H3K4ME3', 'H3K27ME3',
                         'IPS', 'NPC', 'HCP', 'HDAC', 'ICP']

    words = gsname.split('_')[1:]
    if not any(words):
        # No usable word after the prefix: nothing to clean.
        return gsname

    words = [word if word in capital_words else word.lower()
             for word in words]
    # Slicing (rather than indexing) tolerates an empty first word.
    words[0] = words[0][:1].upper() + words[0][1:]
    return ' '.join(words)


In [None]:
### load GSEA rank file
# Index 1 is the DFMO-vs-WT contrast and index 2 the KO-vs-WT contrast:
# GSEA_path1 is "NEW_GSEA_inht_GOBP", GSEA_path2 is "NEW_GSEA_KO_GOBP", and
# the plotting cell labels (gene_value1, gs_list1) as "DFMO vs WT" and
# (gene_value2, gs_list2) as "KO vs WT". The rank files were previously
# assigned the other way round, pairing each gene-set list with the wrong
# ranking.
# NOTE(review): confirm that "inht" in GSEA_path1 refers to the DFMO run.
rnk_fname1 = 'DFMO-WT.rnk'
rnk_fname2 = 'KO-WT.rnk'
gene_value1 = load_file.read_rank(rnk_fname1)
gene_value2 = load_file.read_rank(rnk_fname2)


In [None]:
### load GSEA data
# Each result directory is read into two lists, unpacked here as the
# positive and negative gene-set lists.
GSEA_path1 = "NEW_GSEA_inht_GOBP"
pos_gs_list1, neg_gs_list1 = load_file.read_GSEA(GSEA_path1)

GSEA_path2 = "NEW_GSEA_KO_GOBP"
pos_gs_list2, neg_gs_list2 = load_file.read_GSEA(GSEA_path2)

In [None]:
### select gene-sets with the biggest |nes|
# Positive and negative gene-sets are pooled per data set before keeping
# the `num` strongest ones.
num = 20
candidates1 = pos_gs_list1 + neg_gs_list1
candidates2 = pos_gs_list2 + neg_gs_list2
gs_list1 = select_best(candidates1, num)
gs_list2 = select_best(candidates2, num)

In [None]:
### hierarchical clustering of gene-sets based on the degree of gene sharing (kappa metric)
# Merge both selections, keeping the first occurrence of each gene-set.
gs_list = []
for gs in gs_list1 + gs_list2:
    if gs in gs_list:
        continue
    gs_list.append(gs)

# kappa agreement -> distance -> hierarchical clustering
gene_list, kappa_matrix = get_kappa_matrix(gs_list)
dist_matrix = kappa_to_dist(kappa_matrix)
Z, idx_cID, cID_idxs = statis.Hierarchial_clustering(dist_matrix)

In [None]:
### color gene-set names based on the cluster groups
# cID_color is indexed by the cluster ID paired with each gene-set.
cID_color = ['tab:orange', 'tab:green', 'tab:red']
gsname_label, gsname_color = {}, {}
gsnames, gslabels = [], []
for gs, cID in zip(gs_list, idx_cID):
    gsname = gs['name']
    label, color = clean_gsname(gsname), cID_color[cID]
    gsname_label[gsname], gsname_color[gsname] = label, color
    gsnames.append(gsname)
    gslabels.append(label)

In [None]:
# set node color of dendrogram
# NOTE(review): decode_Z is not defined or imported in the visible cells --
# confirm where it comes from.
node_children, node_dist, node_names = decode_Z(Z, gsnames)

# A node takes the colour shared by all gene-sets listed under it in
# node_names; nodes mixing several colours (or listing none) are drawn
# in blue.
node_color = {}
for node in node_names:
    colors = {gsname_color[name] for name in node_names[node]}

    # Key on the loop variable `node`; the former `k` was never defined
    # here and raised NameError.
    if len(colors) == 1:
        node_color[node] = next(iter(colors))
    else:
        node_color[node] = 'blue'


In [None]:
### plot dendrogram
# Draw the clustering tree Z with the cleaned gene-set labels as leaf names
# and the per-node colours computed above.
# NOTE(review): name_color is passed as an empty dict, so gsname_color is
# not used here -- confirm that this is intended.
graphics.plot_dendrogram(Z,
                         gslabels,
                         node_color=node_color,
                         name_color={})

In [None]:
### plot GSEA
# NOTE(review): plot_GSEA is called unqualified but is not defined or
# imported in the visible cells -- confirm where it comes from.
# NOTE(review): each call pairs gene_valueN (read from rnk_fnameN) with
# gs_listN (read from GSEA_pathN); make sure both refer to the contrast
# named in the comment above the call.
# DFMO vs WT
plot_GSEA (gene_value1,
           gs_list1,
           gsname_label=gsname_label,
           gsname_color=gsname_color)

# KO vs WT
plot_GSEA (gene_value2,
           gs_list2,
           gsname_label=gsname_label,
           gsname_color=gsname_color)

