<a href="https://colab.research.google.com/github/sowmyamanojna/CS6024-Algorithmic-Approaches-to-Computational-Biology-Project/blob/master/codes/generate_msigdb_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Fair Warning. Please try the code in Colab only. :)

# Processing MSigDB Gene Sets into Binary Matrix

This notebook loads the full MSigDB gene set `.gmt` file (version 6.1) and outputs a binary gene-by-gene-set matrix (rows are genes, columns are gene sets) in which an entry is 1 if the gene is a member of the gene set and 0 otherwise.

**Note that we exclude gene sets with restrictive licenses (KEGG, BioCarta, and the AAAS/STKE Cell Signaling Database).**

In [None]:
import os
import csv
import numpy as np
import pandas as pd

In [None]:
def make_template_matrix(msigdb_file, blacklist, checkblacklist=True):
    """
    Retrieve all genes and pathways from given msigdb .gmt file

    Arguments:
    msigdb_file - path to a tab separated .gmt file. Every row is read as
                  <gene set name> <second column, ignored> <gene> <gene> ...
    blacklist - a string, or an iterable of strings, holding the gene set name
                prefixes to exclude
    checkblacklist - if False, `blacklist` is ignored and every gene set is kept

    Output:
    sorted gene by pathways pandas dataframe. Entries indicate membership
    (1 if the gene is listed in the gene set, 0 otherwise)

    Notes:
    Blank lines and empty gene fields (e.g. from trailing tabs) are skipped.
    If a gene set name occurs on several rows, the rows are merged into a
    single column.
    """
    # str.startswith only accepts a str or a tuple of str, so normalise any
    # other iterable (list, set, ...) once up front
    if checkblacklist and not isinstance(blacklist, (str, tuple)):
        blacklist = tuple(blacklist)

    # Single pass over the file: map every retained gene set to its genes
    pathway_genes = {}
    with open(msigdb_file, 'r', newline='') as msigdb_fh:
        for row in csv.reader(msigdb_fh, delimiter='\t'):
            if not row:
                # csv.reader yields an empty list for a blank line
                continue

            signature_name = row[0]
            if checkblacklist and signature_name.startswith(blacklist):
                continue

            signature_genes = {gene for gene in row[2:] if gene}
            pathway_genes.setdefault(signature_name, set()).update(signature_genes)

    # Sorted lists (not sets) are used as labels: recent pandas versions
    # refuse a set as DataFrame index because it has no defined order
    all_db_genes = sorted(set().union(*pathway_genes.values()))
    all_db_pathways = sorted(pathway_genes)

    # Fill a plain integer array one gene set (column) at a time, which avoids
    # a label lookup for every single (gene, gene set) pair
    gene_position = {gene: position for position, gene in enumerate(all_db_genes)}
    membership = np.zeros((len(all_db_genes), len(all_db_pathways)), dtype='int64')
    for column, signature_name in enumerate(all_db_pathways):
        member_rows = [gene_position[gene] for gene in pathway_genes[signature_name]]
        if member_rows:
            membership[member_rows, column] = 1

    # copy=False: the array is local to this function, so wrapping it directly
    # avoids holding a second copy of a potentially very large matrix
    return pd.DataFrame(
        membership, index=all_db_genes, columns=all_db_pathways, copy=False
    )

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the data paths
# used by the later cells (under /content/drive/MyDrive/...) can be resolved.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Location of the input MSigDB .gmt file on the mounted Drive
full_msigdb_file = os.path.join(
    '/content/drive/MyDrive/aacb_project/data/',
    'msigdb.v6.1.entrez.gmt',
)
print(full_msigdb_file)

# Gene set name prefixes of the resources with restrictive licenses
blacklist = ('KEGG', 'BIOCARTA', 'ST_')

/content/drive/MyDrive/aacb_project/data/msigdb.v6.1.entrez.gmt


## Process MSigDB gmt files into large matrix

In [None]:
%%time
full_msigdb_df = make_template_matrix(full_msigdb_file, blacklist, checkblacklist=True)
print(full_msigdb_df.shape)

(32431, 17356)
CPU times: user 24.6 s, sys: 1.98 s, total: 26.6 s
Wall time: 27.4 s


In [None]:
%%time
full_msigdb_file = os.path.join('/content/drive/MyDrive/aacb_project/data/', 'full_msigdb_binary_matrix.tsv.bz2')
full_msigdb_df.to_csv(full_msigdb_file, sep='\t', compression='bz2')

CPU times: user 7min 39s, sys: 436 ms, total: 7min 39s
Wall time: 7min 40s


In [None]:
# Preview the resulting gene by gene set membership matrix
full_msigdb_df

Unnamed: 0,AAACCAC_MIR140,AAAGACA_MIR511,AAAGGAT_MIR501,AAAGGGA_MIR204_MIR211,AAANWWTGC_UNKNOWN,AAAYRNCTG_UNKNOWN,AAAYWAACM_HFH4_01,AACATTC_MIR4093P,AACGGTT_MIR451,AACTGAC_MIR223,AACTGGA_MIR145,AACTTT_UNKNOWN,AACWWCAANK_UNKNOWN,AACYNNNNTTCCS_UNKNOWN,AAGCAAT_MIR137,AAGCACA_MIR218,AAGCACT_MIR520F,AAGCCAT_MIR135A_MIR135B,AAGGGAT_MIR188,AAGTCCA_MIR422B_MIR422A,AAGWWRNYGGC_UNKNOWN,AATGGAG_MIR136,AATGTGA_MIR23A_MIR23B,ABBUD_LIF_SIGNALING_1_DN,ABBUD_LIF_SIGNALING_1_UP,ABBUD_LIF_SIGNALING_2_DN,ABBUD_LIF_SIGNALING_2_UP,ABDELMOHSEN_ELAVL4_TARGETS,ABDULRAHMAN_KIDNEY_CANCER_VHL_DN,ABDULRAHMAN_KIDNEY_CANCER_VHL_UP,ABE_INNER_EAR,ABE_VEGFA_TARGETS,ABE_VEGFA_TARGETS_2HR,ABE_VEGFA_TARGETS_30MIN,ABRAHAM_ALPC_VS_MULTIPLE_MYELOMA_DN,ABRAHAM_ALPC_VS_MULTIPLE_MYELOMA_UP,ABRAMSON_INTERACT_WITH_AIRE,ACAACCT_MIR453,ACAACTT_MIR382,ACACTAC_MIR1423P,...,chr8q11,chr8q12,chr8q13,chr8q21,chr8q22,chr8q23,chr8q24,chr9p,chr9p11,chr9p12,chr9p13,chr9p21,chr9p22,chr9p23,chr9p24,chr9q12,chr9q13,chr9q21,chr9q22,chr9q31,chr9q32,chr9q33,chr9q34,chrxp11,chrxp21,chrxp22,chrxq,chrxq11,chrxq12,chrxq13,chrxq21,chrxq22,chrxq23,chrxq24,chrxq25,chrxq26,chrxq27,chrxq28,chryp11,chryq11
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10000,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9993,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
