In [101]:
import itertools
import os
import urllib
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

from maayanlab_bioinformatics.enrichment import enrich_crisp

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider, Range1d
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()

In [102]:


    'Aging_Perturbations_from_GEO_down',
    'Aging_Perturbations_from_GEO_up',
    'Disease_Perturbations_from_GEO_down',
    'Disease_Perturbations_from_GEO_up',
    'Drug_Perturbations_from_GEO_down',
    'Drug_Perturbations_from_GEO_up',
    'Gene_Perturbations_from_GEO_down',
    'Gene_Perturbations_from_GEO_up',
    'Ligand_Perturbations_from_GEO_down',
    'Ligand_Perturbations_from_GEO_up',
    'MCF7_Perturbations_from_GEO_down',
    'MCF7_Perturbations_from_GEO_up',
    'Microbe_Perturbations_from_GEO_down',
    'Microbe_Perturbations_from_GEO_up',
    'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
    'SysMyo_Muscle_Gene_Sets'

    ## NOT VIRUS HOST

    # take a closer look at:
    'GO_Biological_Process_2018'
    ['MGI_Mammalian_Phenotype_Level_4_2019']
    'DisGeNET'
    'DrugMatrix'

    'LINCS_L1000_Chem_Pert_down'
    'LINCS_L1000_Chem_Pert_up'
    'Old_CMAP_down'
    'Old_CMAP_up'
    'Genes_Associated_with_NIH_Grants'
    'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions'
    'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions'
    'NIH_Funded_PIs_2017_Human_AutoRIF'
    'NIH_Funded_PIs_2017_Human_GeneRIF'
    'BioPlex_2017'

    # redo?:

    
    
    


'BioPlex_2017'

In [103]:
# Enrichr library names to process. get_Enrichr_library() indexes into this
# list, and the visible cells only ever use element 0.
all_libraries = ['Jensen_TISSUES']
#genes = ['TP53', 'TNF', 'EGFR', 'GKN1', 'HADHA', 'APOE', 'ESR1', 'VEGFA', 'TGFB1', 'PREPL', 'TIA1', 'TPO', 'TTN', 'SATB2', 'CHPF', 'MALL', 'MIPIP', 'NUPL1', 'IL6', 'PDIA3', 'CTNNB1', 'SLC39A1', 'DTNA','SLC1A1', 'GALNT2', 'HIST2H2AC', 'CD63']

# open_gene_list_file = open('geneList.txt','r')
# lines = open_gene_list_file.readlines()
# genes = [x.strip().upper() for x in lines]
# open_gene_list_file.close()

# p-value cutoff. NOTE(review): the only visible use is in the commented-out
# colouring code of the plotting cell, so it currently has no effect.
significance_value = 0.05

In [104]:
# open Enrichr library from online
def get_Enrichr_library(library_index, libraries=None):
    """Download one Enrichr gene-set library and parse it into rows.

    The library is fetched in text mode, where each line has the form
    ``<term>\\t\\t<gene>\\t<gene>...``.

    Parameters
    ----------
    library_index : int
        Position of the library name inside ``libraries``.
    libraries : sequence of str, optional
        Library names to index into. Defaults to the notebook-level
        ``all_libraries`` list, which is what the original implementation
        always used.

    Returns
    -------
    list of [str, str]
        One ``[term_name, genes]`` pair per gene set, where ``genes`` is a
        single string of gene tokens separated by single spaces.

    Raises
    ------
    IndexError
        If ``library_index`` is out of range for ``libraries``.
    urllib.error.URLError
        If the download fails.
    """
    if libraries is None:
        libraries = all_libraries

    # Quote the name so an unusual character cannot corrupt the query string.
    url = ('https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName='
           + urllib.parse.quote(libraries[library_index]))

    library_data = []
    with urllib.request.urlopen(url) as f:
        for raw_line in f:
            # Split on the first double tab only: term on the left, genes on the right.
            name, separator, raw_genes = raw_line.decode("utf-8").partition("\t\t")
            if not separator:
                # Blank or malformed line; the original code raised IndexError here.
                continue
            # Collapse tabs / trailing newline into single spaces without
            # assuming the line ends in exactly one newline character.
            library_data.append([name, " ".join(raw_genes.split())])

    return library_data

In [105]:
# Download the first configured library and tabulate it: one row per gene set.
library_data = get_Enrichr_library(0)

df = pd.DataFrame(data = library_data, columns = ['Name', 'Genes'])

gene_list = df['Genes']

# Treat every gene set as a "document" whose words are gene tokens.
# NOTE(review): the default tokenizer lowercases and splits on non-word
# characters; the topic printout further down shows fragments such as
# [125b] and [linc], so some gene symbols are probably being broken up.
# Consider a whitespace-only token_pattern -- confirm before changing.
tfidf_vectorizer = TfidfVectorizer(
    min_df = 3,
    max_df = 0.005,
    max_features = 100000,
    ngram_range=(1, 1)
)

tfidf = tfidf_vectorizer.fit_transform(gene_list)

# Save the feature names for later to create topic summaries.
# get_feature_names() is gone from recent scikit-learn releases, so prefer
# get_feature_names_out() when it exists and fall back for old installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_fn = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_fn = tfidf_vectorizer.get_feature_names()

# plot after tfidf

reduce = umap.UMAP()
reduce.fit(tfidf)
embedding = reduce.transform(tfidf)

embedding = pd.DataFrame(embedding, columns=['x','y'])

source1 = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            alpha = [0.7] * embedding.shape[0],
            size = [7] * embedding.shape[0],
            gene_set = df['Name']
        )
    )

# Number of embedded gene sets.
print(embedding.shape[0])

hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
    </div>
    """)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset']
plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle('x', 'y', size='size', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source1, name="df")

show(plot_emb)

# Persist coordinates next to the gene-set table. `df` is rebuilt at the top
# of this cell, so re-running the cell does not accumulate x/y columns.
df = pd.concat([embedding, df], axis=1)
# Create the output folder on a fresh checkout instead of failing in to_csv.
os.makedirs('Libraries', exist_ok=True)
df.to_csv('Libraries/' + all_libraries[0] + '.csv', index = False)

1842


In [96]:
# create and run NMF model

# Number of NMF components (topics) to extract.
n_comp = 200

# The original passed alpha=0.0. That keyword was deprecated and later removed
# from scikit-learn's NMF; 0.0 (no regularisation) is the default, so omitting
# it keeps the same model while running on both old and new releases.
nmf = NMF(
    n_components=n_comp,
    max_iter=1000000
)

# W: one row per gene set, one column per component.
W = nmf.fit_transform(tfidf)
# H: one row per component, one column per TF-IDF feature.
H = nmf.components_
# Projection of the same matrix through the fitted model; only referenced by
# commented-out colouring code in the visible cells.
nmf_embedding = nmf.transform(tfidf)

In [97]:
# enrichment analysis
def get_library_iter(library_data):
    """Yield ``(term, gene_list)`` pairs from parsed library rows.

    Parameters
    ----------
    library_data : iterable of [str, str]
        Rows whose first item is the term name and whose second item is a
        whitespace-separated string of genes.

    Yields
    ------
    tuple of (str, list of str)
        The term name and its genes as a list. A row with an empty gene
        string yields an empty list.
    """
    for member in library_data:
        term = member[0]
        # split() with no argument drops the empty strings that split(' ')
        # produced for empty, trailing or doubled spaces; those phantom
        # entries would otherwise be counted as genes.
        gene_set = member[1].split()
        yield term, gene_set

def get_enrichment_results(genes, library_data):
    """Run ``enrich_crisp`` for ``genes`` against a library, best p-value first.

    ``library_data`` is fed through ``get_library_iter``. The trailing
    ``20000`` and ``True`` are forwarded positionally to ``enrich_crisp``
    (presumably the background size and an overlap flag -- confirm against
    the maayanlab_bioinformatics documentation).

    Returns a list of the results ordered by ascending ``result[1].pvalue``.
    """
    def by_pvalue(result):
        return result[1].pvalue

    ranked = list(enrich_crisp(genes, get_library_iter(library_data), 20000, True))
    ranked.sort(key=by_pvalue)
    return ranked

def get_pvalue(row, unzipped_results, all_results):
    """Look up the enrichment p-value for the gene set named in ``row``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Name']``, the gene-set name to look up.
    unzipped_results : sequence
        ``list(zip(*all_results))``; element 0 holds the term names in the
        same order as ``all_results``. May be empty.
    all_results : sequence
        ``(term, result)`` pairs where ``result`` exposes ``.pvalue``.

    Returns
    -------
    The ``pvalue`` of the first result whose term equals ``row['Name']``,
    or ``1`` when the term is absent or there are no results at all.
    """
    # Unzipping an empty result list gives [], which has no element 0.
    if len(unzipped_results) == 0:
        return 1

    terms = list(unzipped_results[0])
    try:
        # One scan instead of a membership test followed by index().
        index = terms.index(row['Name'])
    except ValueError:
        return 1
    return all_results[index][1].pvalue

In [98]:
# call UMAP

# UMAP of the NMF gene-set/component matrix.
# NOTE(review): the original stored this in `embedding` and then overwrote it
# on the very next statement, so the result was never used. It is kept under
# its own name here; confirm which embedding the plot is meant to show.
reducer = umap.UMAP()
reducer.fit(W)
nmf_umap_embedding = reducer.transform(W)

# UMAP of the dense TF-IDF matrix: this is the embedding that is plotted and
# saved. toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn input validation rejects.
embedding = umap.UMAP().fit_transform(tfidf.toarray())
embedding = pd.DataFrame(embedding, columns=['x','y'])

# combine embedding with df
# Drop any x/y columns left by an earlier cell (or a previous run of this one)
# so the saved table never ends up with duplicate coordinate columns.
df = pd.concat([embedding, df.drop(columns=['x', 'y'], errors='ignore')], axis=1)
# Create the output folder on a fresh checkout instead of failing in to_csv.
os.makedirs('Libraries', exist_ok=True)
df.to_csv('Libraries/' + all_libraries[0] + '.csv', index = False)

In [99]:
# # call enrichment results
# all_results = get_enrichment_results(genes, library_data)
# unzipped_results = list(zip(*all_results))

# # add p value to the dataframe
# df['p value'] = df.apply (lambda row: get_pvalue(row, unzipped_results, all_results), axis=1)

# my_colors = []
# for index, row in df.iterrows():
#     if row['p value'] < significance_value:
#         my_colors += ['#000000']
#     else:
#         my_colors += [all_palettes['Category20'][20][0]]

# embedding['hue'] = nmf_embedding.argmax(axis = 1)
# my_colors = [all_palettes['Category20'][20][i] for i in embedding.hue]

# Columns handed to Bokeh: the coordinates plus the label shown on hover.
source_data = dict(
    x = embedding.x,
    y = embedding.y,
    gene_set = df['Name'],
    #colors = my_colors
)

tooltip_rows = """
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>"""

# The tooltip used to reference @p_value unconditionally while the p_value
# column was commented out of the source, so the hover showed a placeholder.
# The column and its tooltip row are now added together, and only when the
# enrichment block above has been enabled and has filled df['p value'].
if 'p value' in df.columns:
    source_data['p_value'] = df['p value']
    tooltip_rows += """
            <span style="font-size: 12px; font-weight: bold;">p-value:</span>
            <span style="font-size: 12px">@p_value</span>"""

source = ColumnDataSource(data=source_data)

hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">""" + tooltip_rows + """
        </div>
    </div>
    """)
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset']
plot_emb = figure(plot_width=700, plot_height=700, tools=tools_emb)
plot_emb.circle('x', 'y', size = 7, alpha = 0.7, line_alpha = 0, 
                line_width = 0.01, source = source, name = "df")
# fill_color = 'colors',

show(plot_emb)

In [100]:
# this will print out the most common genes in each group
# not super useful but kind of interesting
n_topics = n_comp
n_top_words = 15

# For every NMF component, list the highest-weighted TF-IDF terms.
print("Topics found via NMF:")
for topic_number, weights in enumerate(nmf.components_, start=1):
    # Indices sorted by descending weight, truncated to the top entries.
    top_indices = weights.argsort()[::-1][:n_top_words]
    labels = ['[{}]'.format(tfidf_fn[idx]) for idx in top_indices]
    print("\nTopic {}:".format(topic_number))
    print(" ".join(labels))
print()

f1]

Topic 60:
[il17f] [defb4b] [crlf2] [lta] [il28b] [mica] [havcr1] [125b] [kir3dl2] [cyp27b1] [defb103a] [linc] [il9] [ccl27] [150]

Topic 61:
[c6orf15] [ty] [gsx2] [kcnk4] [il36g] [defb103a] [trs] [scxb] [pou4f2] [pou4f1] [lctl] [lhx4] [gdpd4] [a4gnt] [smr3a]

Topic 62:
[vax2] [slc6a18] [mpp4] [znf843] [lhx4] [fbxo39] [krtap13] [lrrc30] [sel1l2] [rp1l1] [obp2b] [or7c1] [prkag3] [or5k4] [tfap2e]

Topic 63:
[mybph] [al354822] [tmprss11e] [rab40a] [fam25b] [fam25a] [fam25c] [pramef10] [prkag3] [defb4b] [hcrtr2] [usp26] [oxgr1] [fbxo40] [cdh16]

Topic 64:
[f5h8k0] [slco1b3] [slco1b1] [slc22a6] [cyp4a11] [gcgr] [sepp1] [gpx6] [scxb] [stk24] [mip] [slc17a1] [gja3] [ghrh] [slc22a12]

Topic 65:
[foxe1] [ugt1a1] [tas1r2] [il36g] [defb103a] [il28b] [smcp] [cdh16] [spata4] [tmem105] [h0y5s2] [calcb] [grik4] [grk1] [klhdc7a]

Topic 66:
[trt] [hcrtr2] [c2orf50] [dcdc2b] [tctex1d4] [tmem89] [mrgprg] [or11h1] [or5j2] [rfx8] [c12orf74] [c2orf48] [a8mv45] [pramef19] [pramef22]

Topic 67:
[29c] [zpb