In [3]:
from dataclasses import dataclass
from pathlib import Path

import umap
import sklearn
import pandas as pd
from sklearn.preprocessing import StandardScaler

import bokeh
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
Each file matching `*_residual_expressions.tsv` contains the expression data for one cell type (taken from the file-name prefix), with one column per gene. The files list different genes, so we concatenate the dataframes and initialise any column missing from a file to `0`.

In [2]:
# Directory that is globbed for the per-cell-type `*_residual_expressions.tsv` files.
# The path is relative, so it resolves against the kernel's current working directory.
DATA_DIR = Path('../association/onek1k_website_data/')

In [1]:
# Read one expression table per cell type. The cell type is the part of the file
# name before the first underscore. Files are visited in sorted order so the row
# order of `expression` does not depend on filesystem enumeration order.
expression_files = sorted(DATA_DIR.glob('*_residual_expressions.tsv'))
if not expression_files:
    raise FileNotFoundError(
        f"No '*_residual_expressions.tsv' files found in {DATA_DIR.resolve()}"
    )

raw_dfs = {}
for expression_file in expression_files:
    cell_type = expression_file.name.split('_')[0].strip()
    if cell_type in raw_dfs:
        # Two files map to the same prefix; keeping only the last one would
        # silently drop a whole table.
        raise ValueError(
            f"Duplicate cell type {cell_type!r} derived from {expression_file.name!r}"
        )
    raw_dfs[cell_type] = pd.read_csv(expression_file, header=0, delimiter='\t')

cell_types = list(raw_dfs)

# Sorted union of every column seen in any file. Sorting makes the feature order
# deterministic (a bare `set` iterates in a hash-dependent order).
# NOTE(review): this treats every column as a gene; if the files carry an
# identifier column it ends up in `genes` too — confirm against the data.
genes = sorted({gene for df in raw_dfs.values() for gene in df.columns})

# Align every table to the full gene list in one step (columns missing from a file
# are filled with 0) and tag each row with its cell type.
dfs = {
    cell_type: df.reindex(columns=genes, fill_value=0).assign(cell_type=cell_type)
    for cell_type, df in raw_dfs.items()
}

expression = pd.concat(list(dfs.values()), axis=0, ignore_index=True)

NameError: name 'DATA_DIR' is not defined

# UMAP embedding

In [6]:
def compute_umap_embedding(data, genes, **kwargs):
    """Standardise the selected columns of ``data`` and embed them with UMAP.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every column listed in ``genes``; each row becomes one
        point in the embedding.
    genes : list
        Column labels of ``data`` used as features.
    **kwargs
        Forwarded unchanged to ``umap.UMAP`` (for example ``n_neighbors``,
        ``min_dist``, ``n_components``, ``random_state``).

    Returns
    -------
    numpy.ndarray
        Embedding of shape ``(data.shape[0], n_components)``, where
        ``n_components`` is whatever the reducer was configured with.

    Raises
    ------
    AssertionError
        If the reducer returns an array of an unexpected shape.
    """
    reducer = umap.UMAP(**kwargs)

    expression_data = data[genes].to_numpy()
    scaled_expression_data = StandardScaler().fit_transform(expression_data)

    embedding = reducer.fit_transform(scaled_expression_data)

    # Check against the reducer's own dimensionality instead of a hard-coded 2,
    # so that passing n_components through **kwargs does not trip the check.
    expected_shape = (data.shape[0], reducer.n_components)
    assert embedding.shape == expected_shape, (
        f"unexpected embedding shape {embedding.shape}, expected {expected_shape}"
    )

    return embedding

# Embed every row of `expression` using all gene columns; the UMAP
# hyper-parameters are spelled out one per line so they are easy to tune.
embedding = compute_umap_embedding(
    data=expression,
    genes=genes,
    n_neighbors=15,
    min_dist=0.1,
    n_epochs=400,
    random_state=42,
)

# Bokeh interactive plot

In [9]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider, CustomJS
from bokeh.palettes import Set3_12
from bokeh.layouts import column

# Render Bokeh output inline in the notebook.
output_notebook()

# Square canvas for the embedding. `width`/`height` are used instead of
# `plot_width`/`plot_height`, which were removed in Bokeh 3.0.
plot_figure = figure(
    title='UMAP projection of the OneK1K dataset',
    width=1024,
    height=1024,
    tools=('pan, wheel_zoom, reset')
)

# Hover tooltip showing the `cell_type` column of whichever data source the
# hovered glyph is drawn from.
plot_figure.add_tools(
    HoverTool(
        tooltips=
            """
            <div>
                <div>
                    <span style='font-size: 16px; color: #224499'>Cell Type:</span>
                    <span style='font-size: 18px'>@cell_type</span>
                </div>
            </div>
            """
    ),
)

In [10]:
# One row per embedded point: 2-D coordinates plus the cell-type label.
# The labels are attached by position (not by index alignment) because
# `embedding` is a plain array whose rows follow the row order of `expression`.
df = pd.DataFrame(embedding, columns=['x', 'y']).assign(
    cell_type=expression['cell_type'].to_numpy()
)

datasource = ColumnDataSource(df)

# CategoricalColorMapper assigns any factor beyond the palette length to
# `nan_color`, so cycle through Set3_12 when there are more than 12 cell types.
# With 12 or fewer factors this is exactly the first len(factors) palette colours.
factors = list(dfs.keys())
palette = [Set3_12[i % len(Set3_12)] for i in range(len(factors))]
color_mapping = CategoricalColorMapper(
    factors=factors,
    palette=palette
)

# `scatter` draws circle markers by default; `circle(..., size=...)` is
# deprecated in recent Bokeh releases.
plot_figure_data = plot_figure.scatter(
    'x',
    'y',
    source=datasource,
    color=dict(field='cell_type', transform=color_mapping),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=4
)

layout = column(plot_figure)
show(layout)