<a href="https://colab.research.google.com/github/semenko/liquid-cell-atlas/blob/main/altair_visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Necessary Libraries

In [2]:
import numpy as np
import altair as alt
import pandas as pd

# Generate Ground Truth Dictionaries

In [3]:
# PBMC Ground Truth Internal Data:

# Cell types, in the column order used by every per-sample list below.
CELL_TYPES = ['T_CD4_naive', 'T_CD4_central_memory', 'T_CD4_effector_memory', 'T_CD8_naive', 'T_CD8_central_memory', 'T_CD8_effector_memory', 'T_CD8_effector_memory_terminal', 'Treg', 'NK_Cell', 'B_Cell_naive', 'B_Cell_memory', 'Plasma_Cell', 'Monocyte', 'Dendritic']
# Same names with '_' replaced by '.'; these are the keys of the nested dicts below.
CELL_TYPES_DOTS = [i.replace('_', '.') for i in CELL_TYPES]

## PBMC Mixture --> Ground Truth Percentages
# This maps sample ID to cell type percentage.
PBMC_MIXTURE_TRUTH_BY_CYTOF = {
    '03': [11.42,9.7,9.5,8.9,1.9,8.9,5.4,0.29,8.3,5.92,1.86,0.02,7.54,1.8],
    '14': [7.83,7,6.9,9.96,1.2,10.3,2.8,0.14,12.7,8.44,2.05,0.01,6.65,1.3],
    '24': [15.1,12.2,14.6,8.63,1,7.2,4.5,0.5,7.08,4.92,1.82,0.05,4.12,1.22],
    '31': [10.8,6.9,9.9,8.4,0.8,7.3,4.9,0.12,14.96,5.58,1.99,0.01,5.13,1.25],
    '49': [9.84,8.4,16.6,10.8,1.2,8.2,4.5,0.19,5.49,7.64,2.00,0.01,7.63,1.12],
    '50': [6.2,4.3,5.9,4.04,0.9,3.5,1.8,0.27,9.43,5.66,1.21,0.13,7.14,1.75],
    '51': [10.4,6.1,7.1,5.2,0.4,3,2.1,0.18,14.17,6.50,4.39,0.02,10.5,1.65],
}

# Using "Second Flow" in Abul's spreadsheet
# `None` marks a cell type with no value recorded in this table.
PBMC_MIXTURE_TRUTH_BY_FLOW_SECOND = {
    '03': [11.55,6.910,7.244,21.07,None,None,None,0.34,4.45,5.70,2.36,None,6.31,1.16],
    '14': [8.56,6.504,3.649,19.89,None,None,None,0.22,5.15,7.23,5.29,None,5.78,0.81],
    '24': [24.75,9.305,5.088,14.26,None,None,None,0.44,5.02,4.76,1.99,None,5.02,1.12],
    '31': [13.5,6.967,4.343,14.77,None,None,None,0.18,4.98,2.23,0.97,None,3.22,0.88],
    '49': [13.68,7.324,8.396,21.25,None,None,None,0.16,3.9,6.73,3.27,None,8.8,0.5],
    '50': [8.56,5.823,2.675,6.12,None,None,None,0.22,6.14,5.76,1.25,None,15.9,0.83],
    '51': [10.61,5.612,3.489,9.11,None,None,None,0.21,10.12,3.83,3.55,None,10.21,1.02],
}

# Using "First Flow" in Abul's spreadsheet
# One flow experiment with missing samples and no corresponding CyTOF
PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST = {
    '03': [13.95,10.146,4.844,24.16,None,None,None,0.28,6.2,5.25,2.29,None,5.43,0.69],
    '14': [11.51,8.508,3.232,23.85,None,None,None,0.12,11.99,4.73,4.29,None,4.94,0.44],
    '31': [15.48,8.244,3.624,23.93,None,None,None,0.11,14.17,5.58,2.94,None,7.77,0.35],
    '49': [12.74,8.389,6.605,23.95,None,None,None,0.19,3.74,6.96,3.06,None,6.39,0.34],
    '50': [14.53,7.674,1.654,10.83,None,None,None,0.14,6.16,8.99,2.14,None,22.45,0.52],
}

# Every sample must provide exactly one entry per cell type.
assert all(len(e) == len(CELL_TYPES) for e in PBMC_MIXTURE_TRUTH_BY_CYTOF.values())
assert all(len(e) == len(CELL_TYPES) for e in PBMC_MIXTURE_TRUTH_BY_FLOW_SECOND.values())
assert all(len(e) == len(CELL_TYPES) for e in PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST.values())

# Nested dicts, so you can run:
#  PBMC_TRUTH_DICT_FLOW_FIRST['03']['Treg']
# Iterate over .items() (iterating the dict itself yields only the keys, which
# would be unpacked character by character) and pair each cell type with its
# own value via zip (dict.fromkeys would assign the whole list to every key).
PBMC_TRUTH_DICT_FLOW_FIRST = {k: dict(zip(CELL_TYPES_DOTS, v)) for k, v in PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST.items()}
PBMC_TRUTH_DICT_FLOW_SECOND = {k: dict(zip(CELL_TYPES_DOTS, v)) for k, v in PBMC_MIXTURE_TRUTH_BY_FLOW_SECOND.items()}
PBMC_TRUTH_DICT_CYTOF = {k: dict(zip(CELL_TYPES_DOTS, v)) for k, v in PBMC_MIXTURE_TRUTH_BY_CYTOF.items()}

# Generate Random Predictions

In [4]:
# Generate random predictions
np.random.seed(seed = 2022)


def _random_predictions(truth_by_patient):
    """Return random placeholder predictions for every patient in ``truth_by_patient``.

    For each patient key, ``len(CELL_TYPES)`` values are drawn uniformly from
    [0, 1) using the global NumPy random state and rescaled so that they sum
    to 100 (i.e. they read as percentages).

    Parameters
    ----------
    truth_by_patient : dict
        Mapping whose keys are the patient/sample IDs to generate predictions for.
        Only the keys are used.

    Returns
    -------
    dict
        Maps each patient ID to a list of ``len(CELL_TYPES)`` percentages.
    """
    predictions = {}
    for patient in truth_by_patient:
        weights = np.random.uniform(0, 1, size=len(CELL_TYPES))
        predictions[patient] = list(100 * weights / np.sum(weights))
    return predictions


# The datasets are processed in this fixed order because they all consume the
# same seeded global random stream; reordering would change every prediction.
PBMC_MIXTURE_PRED_BY_FLOW_FIRST = _random_predictions(PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST)
PBMC_MIXTURE_PRED_BY_FLOW_SECOND = _random_predictions(PBMC_MIXTURE_TRUTH_BY_FLOW_SECOND)
PBMC_MIXTURE_PRED_BY_CYTOF = _random_predictions(PBMC_MIXTURE_TRUTH_BY_CYTOF)

Calculate the difference between the ground truth and the predictions (truth − prediction). Missing ground-truth values (`None`) are treated as 0.

In [5]:
def _truth_minus_prediction(truth_by_patient, pred_by_patient):
    """Return the element-wise difference (truth - prediction) for every patient.

    Parameters
    ----------
    truth_by_patient : dict
        Maps patient ID to a list of ground-truth values; entries may be ``None``.
    pred_by_patient : dict
        Maps the same patient IDs to a list of predicted values of equal length.

    Returns
    -------
    dict
        Maps each patient ID in ``truth_by_patient`` to a list of differences.
    """
    diffs = {}
    for patient, truth in truth_by_patient.items():
        # NOTE(review): a missing ground truth (None) is replaced by 0, so the
        # difference for that entry equals -prediction. This biases any summary
        # statistic computed from it; consider NaN plus NaN-aware statistics.
        truth_arr = [0 if val is None else val for val in truth]
        diffs[patient] = list(np.array(truth_arr) - np.array(pred_by_patient[patient]))
    return diffs


PBMC_MIXTURE_DIFF_BY_FLOW_FIRST = _truth_minus_prediction(
    PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST, PBMC_MIXTURE_PRED_BY_FLOW_FIRST)
PBMC_MIXTURE_DIFF_BY_FLOW_SECOND = _truth_minus_prediction(
    PBMC_MIXTURE_TRUTH_BY_FLOW_SECOND, PBMC_MIXTURE_PRED_BY_FLOW_SECOND)
PBMC_MIXTURE_DIFF_BY_CYTOF = _truth_minus_prediction(
    PBMC_MIXTURE_TRUTH_BY_CYTOF, PBMC_MIXTURE_PRED_BY_CYTOF)

# Calculate Statistics

Mean and standard deviation of the difference (truth − prediction) over all three datasets, per cell type

In [71]:
# One row per patient, one column per cell type, for each dataset.
flow_first_diff = np.array([*PBMC_MIXTURE_DIFF_BY_FLOW_FIRST.values()])
flow_second_diff = np.array([*PBMC_MIXTURE_DIFF_BY_FLOW_SECOND.values()])
cytof_diff = np.array([*PBMC_MIXTURE_DIFF_BY_CYTOF.values()])

# Stack the three datasets row-wise so the statistics run over every patient row.
combined_diff = np.concatenate([flow_first_diff, flow_second_diff, cytof_diff], axis=0)

# Column-wise (per cell type) mean and standard deviation of the differences.
mean_dev = np.mean(combined_diff, axis=0)
std_dev = np.std(combined_diff, axis=0)

data = [[cell, mu, sigma] for cell, mu, sigma in zip(CELL_TYPES, mean_dev, std_dev)]
dev_data = pd.DataFrame(
    data,
    columns=["Cell Types", "Mean Difference", "Standard Deviation of Diff."],
)
dev_data

Unnamed: 0,Cell Types,Mean Difference,Standard Deviation of Diff.
0,T_CD4_naive,7.322036,5.691704
1,T_CD4_central_memory,0.686845,5.177019
2,T_CD4_effector_memory,-0.956382,3.78216
3,T_CD8_naive,6.28335,8.76413
4,T_CD8_central_memory,-7.68374,4.728944
5,T_CD8_effector_memory,-4.51064,5.195738
6,T_CD8_effector_memory_terminal,-4.772554,4.346269
7,Treg,-6.384239,4.718628
8,NK_Cell,-0.517222,4.353026
9,B_Cell_naive,-1.947851,3.964545


# Plot Visualizations

Plot Grouped Bar Graphs for each patient (Flow First Data)

In [7]:
# Ground truth: one column per patient, one row per cell type, tagged "Truth".
df = pd.DataFrame(PBMC_MIXTURE_TRUTH_BY_FLOW_FIRST).assign(
    **{"Cell Type": CELL_TYPES, "Truth/Pred": "Truth"}
)

# Predictions in the same layout, tagged "Prediction".
df2 = pd.DataFrame(PBMC_MIXTURE_PRED_BY_FLOW_FIRST).assign(
    **{"Cell Type": CELL_TYPES, "Truth/Pred": "Prediction"}
)

# Stack both frames; missing values are filled with 0 before plotting.
data = pd.concat([df, df2]).fillna(0)

In [9]:
# Still figuring out how to move the labels to the bottom
# `flag` controls the column header labels: True for the first chart only.
flag = True

for patient in PBMC_MIXTURE_PRED_BY_FLOW_FIRST:
    title = alt.TitleParams(
        f'Patient {patient}: Truth vs Predictions for Cell Type Percentages(Flow First Data)',
        anchor='middle',
    )
    header = alt.Header(orient='bottom', labelAngle=-45, labelAlign='left', labels=flag)

    # One facet column per cell type, each holding a Truth bar and a Prediction bar.
    bars = alt.Chart(data, title=title).mark_bar()
    chart = bars.encode(
        x=alt.X('Truth/Pred', title=''),
        y=alt.Y(patient, title='Percent'),
        color=alt.Color('Truth/Pred'),
        column=alt.Column('Cell Type', title='', header=header),
    ).configure_axisX(disable=True)

    chart.display()
    flag = False

In [74]:
# Display the Flow First difference matrix (one row per patient, one column per entry of CELL_TYPES).
flow_first_diff

array([[ 13.83933783,   4.24482487,   3.50327955,  23.56907561,
         -8.10469281,  -5.75845486, -10.61446668,  -7.3758827 ,
         -4.40625916,  -3.27715541,  -7.54044855,  -9.78568816,
         -4.42677211, -10.62669741],
       [  6.08106983,   1.2087756 ,  -1.77601817,  14.71295507,
        -14.41929183,  -1.42245988, -10.97759713,  -4.19458557,
          7.58431563,  -6.36954539,   4.01469609,  -7.72551812,
         -7.81107399,  -5.29572214],
       [ 12.40083373,   1.34854006,  -4.57009543,  18.85934858,
        -14.16164163,  -0.54881563, -11.52583587,  -5.08331935,
          3.31611762,  -7.69066153,  -2.46779285, -14.00165301,
          6.58899174,  -0.2660164 ],
       [  7.47042152,   1.93148512,  -2.09942269,  12.08052883,
        -11.95639714, -10.15757924,  -1.43090575, -15.75976541,
          3.05841624,  -6.92776948,   2.40917072,  -4.74381763,
          2.74627488,  -4.25663998],
       [  0.47533605,   7.55155626,  -4.42640641,  -2.64426561,
         -3.07746323

In [76]:
# Flatten each difference matrix row by row and chain the three datasets
# into a single 1-D vector of differences.
diff_data = np.concatenate([
    flow_first_diff.ravel(),
    flow_second_diff.ravel(),
    cytof_diff.ravel(),
])

diff_dataf = pd.DataFrame(diff_data, columns=["Percentage"])

# Every flattened row spans all cell types in order, so the label list is
# repeated once per row.
n_rows = len(diff_dataf) // len(CELL_TYPES)
diff_dataf["Cell Type"] = CELL_TYPES * n_rows

In [80]:
title = alt.TitleParams(
    'Box-whisker Plot: Difference between Truth and Prediction per Cell Type',
    anchor='middle',
)

# Whiskers extend to the minimum and maximum of each cell type's differences.
boxplot = alt.Chart(diff_dataf, title=title).mark_boxplot(extent='min-max')
boxplot.encode(
    x=alt.X('Cell Type'),
    y=alt.Y('Percentage', title="Percentage Difference"),
)