In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from voronoi import draw_voronoi_scatter # code from Nolan lab

# 1. Patient info
* Group 1: Crohn's like reaction (CLR). Better survival than group 2.
* Group 2: diffuse inflammatory infiltration (DII). 
* total of 35 patients
* Each patient provided 4 tissue samples, resulting in a total of 140 samples.
    * In the TMA (tissue microarray), the four samples provided by each patient are located via `TMA spot/region`.
    * Note that in each TMA spot/region, there are two cores, which, in a later dataset, are identified by `A` and `B`.
    * To understand where each tissue sample came from, look at "Multi-tumor_TMA_composition.xlsx"

In [None]:
# Read the patient annotation workbook and keep only the first 35 rows,
# i.e. the rows that carry patient info.
df_patient = pd.read_excel("CRC_TMAs_patient_annotations.xlsx").iloc[:35]

In [None]:
# Preview the first five patient rows.
df_patient.head(n=5)

In [None]:
# Number of patient rows in each group.
df_patient.loc[:, "Group"].value_counts()

In [None]:
# Read the TMA composition workbook (region info).
df_TMA = pd.read_excel(io="Multi-tumor_TMA_composition.xlsx")

In [None]:
# Display the region table loaded from the TMA composition workbook.
df_TMA

* Note that `df_TMA` has 70 rows, while our dataset consists of 140 samples. This is because each region (`reg001`) provides two samples. In the following section, dataframe `df_clusters` has a column `File Name`, which specifies the `A` and `B`. One should simply think of `reg001_A`, `reg001_B`, ... as indicating the location on the microarray.

data before extracting cell types and cell neighborhoods

In [None]:
# Read the original table (data before cell types and neighborhoods were extracted).
df_orig = pd.read_csv(
    filepath_or_buffer='CRC_clusters_neighborhoods_markers.csv',
)

# 2. data1: cell types

* relevant columns
    * `groups`: 1 or 2
        * 1: Crohn's like reaction (CLR). Better survival than group 2.
        * 2: diffuse inflammatory infiltration (DII). 
    * `patients`: patient number. Between 1 and 35
    * `X:X`: x-coordinate of cell
    * `Y:Y`: y-coordinate of cell
    * `Region`: location on the microarray. Cross-reference the `Region` column with the file `Multi-tumor_TMA_composition.xlsx`.
    * `File Name`: has format `reg0XY_A/B`.
        * `reg0XY`: `0XY` ranges from `001` to `069`. Indicates the location in the microarray. See supplementary info Data S2.
        * `A/B`: Two cores are selected from a tissue sample. So one sample is named `_A` and the other is named `_B`.
        * In total, there should be 4 values of `File Name` per patient (2 regions × 2 cores).
    * 29 columns containing cell types: `B cells`, `CD11b+ monocytes`, `CD11b+CD68+ macrophages`, `CD11c+ DCs`, `CD163+ macrophages`,`CD3+ T cells`,`CD4+ T cells`,`CD4+ T cells CD45RO+`,`CD4+ T cells GATA3+`,`CD68+ macrophages`,`CD68+ macrophages GzmB+`,`CD68+CD163+ macrophages`,`CD8+ T cells`,`NK cells`,`Tregs`,`adipocytes`,`dirt`,`granulocytes`, `immune cells`,`immune cells / vasculature`,`lymphatics`,`nerves`,`plasma cells`,`smooth muscle`,`stroma`, `tumor cells`,`tumor cells / immune cells`,`undefined`,`vasculature`
        * Each column is 0 or 1.
    

In [None]:
# Read the cell-type table; index_col=None means no column is designated as the row index.
df_cells = pd.read_csv(
    filepath_or_buffer='CRC_celltypes.csv',
    index_col=None,
)

In [None]:
# Preview the first five rows of the cell-type table.
df_cells.head(n=5)

Plot individual cell type 

In [None]:
# Choose a patient, then count the rows recorded under each of that patient's file names.
patient = 1
df_cells.loc[df_cells["patients"] == patient, "File Name"].value_counts()

In [None]:
# choose the cell type and the core (file name) to draw
cell_type = "tumor cells"
file_name = "reg002_B" # Select one of the file names printed above

# rows for the chosen patient and file whose flag for this cell type is set
cells_patient = df_cells.loc[
    (df_cells["File Name"] == file_name)
    & (df_cells["patients"] == patient)
    & (df_cells[cell_type] == 1)
]

# cell coordinates as arrays
x = cells_patient["X:X"].to_numpy()
y = cells_patient["Y:Y"].to_numpy()

# scatter the selected cells, labelled by cell type
plt.scatter(x, y, label=cell_type)
plt.legend()
plt.show()

Plot all cells

In [None]:
# Names of the cell-type indicator columns in df_cells (each column is 0 or 1).
cell_columns = ['B cells', 'CD11b+ monocytes', 'CD11b+CD68+ macrophages', 'CD11c+ DCs',
                'CD163+ macrophages','CD3+ T cells','CD4+ T cells','CD4+ T cells CD45RO+',
                'CD4+ T cells GATA3+','CD68+ macrophages','CD68+ macrophages GzmB+','CD68+CD163+ macrophages',
                'CD8+ T cells','NK cells','Tregs','adipocytes','dirt','granulocytes', 'immune cells',
                'immune cells / vasculature','lymphatics','nerves','plasma cells','smooth muscle',
                'stroma', 'tumor cells','tumor cells / immune cells','undefined','vasculature']

# Rows for the selected patient and file. This filter does not depend on the
# cell type, so it is computed once rather than on every loop iteration.
df_sample = df_cells[(df_cells["patients"] == patient)
                     & (df_cells["File Name"] == file_name)]

fig, ax = plt.subplots()

for ct in cell_columns:
    # rows flagged as this cell type
    df_ct = df_sample[df_sample[ct] == 1]

    # get (x,y) coordinates
    x = df_ct["X:X"].values
    y = df_ct["Y:Y"].values

    # one scatter series (and one legend entry) per cell type
    ax.scatter(x, y, label=ct)

# Position the axes and build the legend once, after all series are drawn,
# instead of rebuilding them for every cell type. The width factor of 1
# leaves the axes width unchanged; the legend is anchored to the right of
# the axes via bbox_to_anchor.
pos = ax.get_position()
ax.set_position([pos.x0, pos.y0, pos.width * 1, pos.height])
ax.legend(loc='center right', bbox_to_anchor=(1.6, 0.5))

plt.show()

# 3. data2: Cell neighborhood
* The cell neighborhoods are extracted by looking at a cell neighborhood (of specific size) and computing the density of various cell types. 
* Relevant columns: `neighborhood10`
* The `10` indicates that the cell neighborhood extraction algorithm considered 10 neighboring cells
* The original data (before cell neighborhood extraction) comes with two columns (`neighborhood number final` and `neighborhood name`), which seem to encode neighborhood info extracted from some other preprocessing step.
* In particular, `value_counts()` of `neighborhood number final` and `neighborhood name` should match up. They do not coincide with `neighborhood10`
* Neighborhood names:
    * 0: other
    * 1: T cell enriched
    * 2: Bulk tumor
    * 3: Immune-infiltrated stroma
    * 4: Macrophage enriched
    * 5: Follicle
    * 6: Tumor boundary
    * 7: Vascularized smooth muscle
    * 8: Smooth muscle
    * 9: Granulocyte enriched

In [None]:
# Fixed colour per neighborhood id (0-9): the position in the list is the id.
palette = dict(enumerate([
    "#023ffe",
    "#fe7d01",
    "#1bc939",
    "#e7010a",
    "#8a2be2",
    "#9f4801",
    "#f04dc0",
    "#a2a3a2",
    "#ffc402",
    "#00d6ff",
]))

Plot for one patient

In [None]:
# choose patient, cell type and core (file name)
patient = 1
cell_type = "tumor cells"
file_name = "reg002_B"

# rows belonging to the chosen patient and file
data = df_cells.loc[
    (df_cells["File Name"] == file_name) & (df_cells["patients"] == patient)
]

# scatter of cell positions coloured by `neighborhood10`; regression fit disabled
sns.lmplot(
    data=data,
    x="X:X",
    y="Y:Y",
    hue="neighborhood10",
    palette=palette,
    fit_reg=False,
)

In [None]:
# Per-patient column totals for group 1.
# numeric_only=True limits the sum to numeric columns; without it, pandas >= 2.0
# also "sums" text columns such as "File Name" by concatenating their strings.
df_cells[df_cells["groups"] == 1].groupby(by=["patients"]).sum(numeric_only=True)

In [None]:
# plot for all group 1 patients: one panel per file name, 10 panels per row
sns.lmplot(
    data=df_cells.loc[df_cells["groups"] == 1],
    x="X:X",
    y="Y:Y",
    hue="neighborhood10",
    palette=palette,
    height=8,
    col="File Name",
    col_wrap=10,
    fit_reg=False,
)



Note the above code generates 68 images
* 17 patients in group 1
* 2 regions per patient
* 2 cores per region
* total: 17 * 4 = 68

In [None]:
# plot for all group 2 patients: one panel per file name, 10 panels per row
sns.lmplot(
    data=df_cells.loc[df_cells["groups"] == 2],
    x="X:X",
    y="Y:Y",
    hue="neighborhood10",
    palette=palette,
    height=8,
    col="File Name",
    col_wrap=10,
    fit_reg=False,
)



Note the above code generates 72 images
* 18 patients in group 2
* 2 regions per patient
* 2 cores per region
* total: 18 * 4 = 72

# 4. Voronoi diagram

In [None]:
# rows recorded under file name 'reg059_A'
spot = df_cells.loc[df_cells["File Name"] == 'reg059_A']

# draw the Voronoi scatter for that core; binding the result to `_` keeps the
# cell from echoing the return value
_ = draw_voronoi_scatter(spot, [])