# 5.0 10X Genomics PBMC 3K Dataset

In [1]:
from clustergrammer import *
# Single Network instance (presumably provided by the `from clustergrammer import *` line above)
# that the cells below reuse for loading, normalizing, clipping, clustering and exporting data.
net = Network()
# Placeholder only: `df` is rebound to the result of gene_exp_10x.load_gene_exp_to_df in the Load Data cell.
df = {}

import clustergrammer_widget2
import clustergrammer_groupby as cby
import gene_exp_10x

In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

### Load Data

In [3]:
# Read the 10x filtered gene/barcode matrix directory into `df` via the project helper.
# Later cells treat the index as gene names and the columns as cells -- TODO confirm against the helper.
df = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
# Bare last expression so the notebook displays the matrix dimensions.
df.shape

(32738, 2700)

### Remove Ribosomal and Mitochondrial Genes

In [4]:
# Ribosomal filter: discard every gene whose name contains 'RPL' or 'RPS'.
# This is a substring match anywhere in the name, not a prefix match.
all_genes = df.index.tolist()
print(len(all_genes))
keep_genes = [gene for gene in all_genes
              if 'RPL' not in gene and 'RPS' not in gene]
print(len(keep_genes))

df = df.loc[keep_genes]
df.shape

# Removing Mitochondrial Genes
# Explicitly listed gene names that are dropped in addition to the 'MT-' prefixed ones.
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

all_genes = df.index.tolist()
# A gene counts as mitochondrial when its name starts with 'MT-' or when the part
# before the first underscore appears in the explicit list above.
mito_genes = [gene for gene in all_genes
              if gene.startswith('MT-') or gene.split('_')[0] in list_mito_genes]
print(mito_genes)

keep_genes = [gene for gene in all_genes if gene not in mito_genes]
df = df.loc[keep_genes]

# # normalize by UMI count
# barcode_umi_sum = df['ge'].sum()
# df['ge'] = df['ge'].div(barcode_umi_sum)

32738
32546
['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']


In [5]:
def calc_mean_var_disp(df_inst):
    """Compute the per-row mean and variance of a DataFrame.

    Despite the name, no dispersion is computed here; callers derive it
    themselves from the returned variance and mean.

    Parameters
    ----------
    df_inst : pandas.DataFrame
        Numeric matrix. Statistics are taken across the columns of each row.

    Returns
    -------
    ser_mean : pandas.Series
        Mean of each row, indexed by the row labels of ``df_inst``.
    ser_var : pandas.Series
        Variance of each row (pandas default, ``ddof=1``), same index.
    """
    # Vectorized row-wise reductions replace a Python loop of per-label `.loc`
    # lookups. The loop was slow for tens of thousands of rows and broke when
    # row labels were duplicated (`.loc[label]` then returns a DataFrame
    # rather than a single row).
    ser_mean = df_inst.mean(axis=1)
    ser_var = df_inst.var(axis=1)
    return ser_mean, ser_var

### Keep top 5000 expressing genes

In [6]:
# Per-gene mean (and variance) across all cells.
ser_mean, ser_var = calc_mean_var_disp(df)

# Order genes from highest to lowest mean and keep the labels of the first 5000.
keep_genes = ser_mean.sort_values(ascending=False).index[:5000].tolist()

df = df.loc[keep_genes]
print(df.shape)

(5000, 2700)


### Keep top 1000 Genes by Dispersion and Randomly Sample Cells

In [7]:
num_genes = 1000
num_cells = 1000

# Dispersion is taken as variance / mean per gene; keep the num_genes genes
# with the largest dispersion.
ser_mean, ser_var = calc_mean_var_disp(df)
ser_disp = (ser_var / ser_mean).sort_values(ascending=False)
keep_genes = ser_disp.index[:num_genes].tolist()
df = df.loc[keep_genes]

# Draw num_cells columns at random; the fixed random_state keeps the draw repeatable.
df = df.sample(n=num_cells, axis=1, random_state=99)

### Arcsinh Transform

In [8]:
# NOTE: this bare `df.shape` is not the last expression of the cell, so its value is not displayed.
df.shape
# Apply the inverse hyperbolic sine (np.arcsinh) element-wise after dividing every value by 5.
# NOTE(review): re-running this cell transforms `df` again because the result is assigned back to `df`.
df = np.arcsinh(df/5)

### Z-score and Subset of Data

In [9]:
# Load the transformed matrix into the shared Network object, apply a row-wise
# z-score normalization, then export the result back into `df`.
net.load_df(df)
net.normalize(axis='row', norm_type='zscore')
df = net.export_df()
# Bare last expression so the notebook displays the matrix dimensions.
df.shape

(1000, 1000)

# Visualize Data in Clustergrammer-Widget2

In [10]:
# Reload `df` into the shared Network, clip with bounds -5 and 5, cluster, and
# hand the exported visualization data to the clustergrammer_widget2 widget.
net.load_df(df)
net.clip(lower=-5, upper=5)
net.cluster()
net_json = net.export_viz_to_widget()
w = clustergrammer_widget2.ExampleWidget(network=net_json)
# Bare last expression so the notebook renders the widget.
w

ExampleWidget(network='{"row_nodes": [{"name": "IGJ", "ini": 1000, "clust": 461, "rank": 25, "rankvar": 171, "…

### Load CIBERSORT gene signatures

In [11]:
# Load the cell-type signature file through the shared Network object and export it as a DataFrame.
# NOTE(review): presumably this replaces whatever matrix `net` held before -- confirm against clustergrammer.
net.load_file('../data/cell_type_signatures/nm3337_broad_cell_type_sigs.txt')
df_sig = net.export_df()
print(df_sig.shape)

(523, 9)


In [12]:
# Same visualization steps as for the expression matrix, applied to the signature
# matrix: load, clip with bounds -5 and 5, cluster, export, and show in the widget.
net.load_df(df_sig)
net.clip(lower=-5, upper=5)
net.cluster()
net_json = net.export_viz_to_widget()
w = clustergrammer_widget2.ExampleWidget(network=net_json)
# Bare last expression so the notebook renders the widget.
w

ExampleWidget(network='{"row_nodes": [{"name": "ABCB4_12532", "ini": 523, "clust": 265, "rank": 158, "rankvar"…

# Predict Cell Types using NM3337 Signatures

In [13]:
# Signature row labels carry an underscore-separated suffix (e.g. 'ABCB4_12532').
# Keep only the part before the first underscore on an independent copy of the
# signature matrix, leaving df_sig itself untouched.
rows = df_sig.index.tolist()
new_rows = [label.split('_', 1)[0] for label in rows]
df_sig_clean = df_sig.copy()
df_sig_clean.index = new_rows

In [14]:
# Call the project helper with the expression matrix and the cleaned signature matrix.
# It returns three objects; only the column labels of df_pred_cat are used later in this notebook.
# NOTE(review): presumably predict_level names the category being predicted and unknown_thresh is a
# cutoff for leaving a cell unassigned -- confirm against clustergrammer_groupby.
df_pred_cat, df_sig_sim, y_info = cby.predict_cats_from_sigs(df, df_sig_clean, 
                                                                   predict_level='Cell Type', unknown_thresh=0.05)

### Cell Clustering Based on CIBERSORT Signature Genes with CIBERSORT Signature Categories

In [15]:
# net.load_df(df_pred_cat)
# net.clip(lower=-5, upper=5)
# net.cluster()
# net_json = net.export_viz_to_widget()
# w = clustergrammer_widget2.ExampleWidget(network=net_json)
# w

In [16]:
# Overwrite the column labels of `df` with those of df_pred_cat (in place).
# NOTE(review): assumes df_pred_cat has the same number of columns in the same cell order as `df` -- confirm.
df.columns = df_pred_cat.columns.tolist()

### Cell Clustering Based on top 1000 Genes by Dispersion with CIBERSORT Signature Categories

In [17]:
# Visualize `df` again now that its column labels come from df_pred_cat:
# load, clip with bounds -5 and 5, cluster, export, and show in the widget.
net.load_df(df)
net.clip(lower=-5, upper=5)
net.cluster()
net_json = net.export_viz_to_widget()
w = clustergrammer_widget2.ExampleWidget(network=net_json)
# Bare last expression so the notebook renders the widget.
w

ExampleWidget(network='{"row_nodes": [{"name": "IGJ", "ini": 1000, "clust": 461, "rank": 25, "rankvar": 171, "…