In [93]:
import requests
import pandas as pd
import plotly.graph_objects as go

In [94]:
def get_collection_info(collection_id):
    collection_info_response = requests.get(f"https://api.cellxgene.cziscience.com/dp/v1/collections/{collection_id}")
    collection_info = collection_info_response.json()
    return collection_info

In [95]:
# get data 
response = requests.get("https://api.cellxgene.cziscience.com/dp/v1/collections")
collections = response.json()["collections"]

data = [get_collection_info(c["id"]) for c in collections]

In [96]:
# create collection table 
collection_id = []
collection_dt = []
collection_name = []

for collection in data:
    collection_id.append(collection['id'])
    collection_dt.append(collection['created_at'])
    collection_name.append(collection['name'])

collections_df = pd.DataFrame({'collection_id': collection_id, 'collection_dt': collection_dt, 'collection_name': collection_name})
collections_df['collection_dt'] = pd.to_datetime(collections_df['collection_dt'] , unit='s')

In [97]:
# create dataset table 
keys_w_label = ['assay', 'development_stage', 'disease', 'ethnicity', 'tissue']
keys_wo_label = ['collection_id', 'id', 'name', 'sex', 'cell_count', 'created_at']
dataset_data = []

for c in data:
    for d in c['datasets']:
        keys = d.keys()
        keys_wo_labels_new = list(set(keys) & set(keys_wo_label))
        dict_updated = {key: d[key] for key in keys_wo_labels_new}
        keys_w_labels_new = list(set(keys) & set(keys_w_label))
        for key in keys_w_labels_new:
            dict_temp = {key: [i['label'] for i in d[key]]}
            dict_updated.update(dict_temp)
        dict_updated.update({'organism': d['organism']['label']})
        dataset_data.append(dict_updated)

datasets_df = pd.DataFrame(dataset_data)
datasets_df = datasets_df.rename(columns = {'id': 'dataset_id', 'name': 'dataset_name', 'created_at': 'dataset_dt'})
datasets_df['dataset_dt'] = pd.to_datetime(datasets_df['dataset_dt'], unit='s')

In [98]:
# create cells table
cells_df = datasets_df[['dataset_name', 'dataset_dt', 'cell_count']].drop_duplicates()

cells_df.loc[cells_df['dataset_name']=='Krasnow Lab Human Lung Cell Atlas, 10X', 'cell_count'] = 65662
cells_df.loc[cells_df['dataset_name']=='Krasnow Lab Human Lung Cell Atlas, Smart-seq2', 'cell_count'] = 9409
cells_df.loc[cells_df['dataset_name']=='Construction of a human cell landscape at single-cell level', 'cell_count'] = 599926
cells_df.loc[cells_df['dataset_name']=='Massively multiplex chemical transcriptomics at single-cell resolution - A549', 'cell_count'] = 143015
cells_df.loc[cells_df['dataset_name']=='Massively multiplex chemical transcriptomics at single-cell resolution - K562', 'cell_count'] = 146752
cells_df.loc[cells_df['dataset_name']=='Massively multiplex chemical transcriptomics at single-cell resolution - MCF7', 'cell_count'] = 292010
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: EC astrocytes", 'cell_count'] = 5500
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: EC excitatory neurons", 'cell_count'] = 8362
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: EC inhibitory neurons", 'cell_count'] = 5331
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: EC microglia", 'cell_count'] = 5572
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: EC oligodendrocyte", 'cell_count'] = 8168
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: SFG astrocytes", 'cell_count'] = 5970
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: SFG excitatory neurons", 'cell_count'] = 15833
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: SFG inhibitory neurons", 'cell_count'] = 7506
cells_df.loc[cells_df['dataset_name']=="Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: SFG microglia", 'cell_count'] = 3799
cells_df.loc[cells_df['dataset_name']== "Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: SFG oligodendrocyte", 'cell_count'] = 15772
cells_df.loc[cells_df['dataset_name']== "Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: caudal entorhinal cortex", 'cell_count'] = 42528
cells_df.loc[cells_df['dataset_name']== "Molecular characterization of selectively vulnerable neurons in Alzheimer's Disease: superior frontal gyrus", 'cell_count'] = 63608
cells_df.loc[cells_df['dataset_name']== 'Single-cell atlas of peripheral immune response to SARS-CoV-2 infection', 'cell_count'] = 44721
cells_df.loc[cells_df['dataset_name']== 'Single-cell gene expression profiling of SARS-CoV-2 infected human cell lines - Calu-3', 'cell_count'] = 48890
cells_df.loc[cells_df['dataset_name']== 'Single-cell gene expression profiling of SARS-CoV-2 infected human cell lines - H1299', 'cell_count'] = 81736
cells_df.loc[cells_df['dataset_name']== 'Single-cell longitudinal analysis of SARS-CoV-2 infection in human bronchial epithelial cells', 'cell_count'] = 77650

# Single cell curation metrics
Overall metrics
* total counts
* counts by month
* cumulative counts over time

Datasets by characteristics
- assay
- disease
- ethnicity
- development stage
- tissues
- organism
- sex


In [99]:
def plot_col_counts(df, col, orderby = 'count', flag = False):
    df_temp = df[[col]].explode(col).reset_index(drop = True)
    barplot(df_temp, col, orderby, flag)
    
def barplot(df, col, orderby = 'count', flag = False):
    df['count'] = 1
    df = df.groupby([col]).sum().reset_index()
    df = df.sort_values(by=[orderby], ascending = flag)
    
    fig = go.Figure([go.Bar(x=df[col].tolist(), y=df['count'].tolist(), 
                            text=df['count'].tolist(), textposition='outside')])
    
    
    title = 'Number of datasets by ' + col 
    fig.update_layout(
        autosize=False,
        width=1000,
        height=750,
        title=title
    )
    fig.update_xaxes(
        tickangle = -60,
        title_text = col
        )
    
    fig.update_yaxes(
        title_text = "number of datasets"
        )
    fig.show()
    
def create_trace(df, col):
    output = df[['date', col]]
    output = output.dropna()
    output['cum'] = output[col].cumsum()
    return (output['date'].tolist(), output['cum'].tolist())
    
def add_trace(fig, x_list, y_list, label):
    fig.add_trace(go.Scatter(x=x_list, y=y_list,
                    mode='lines+markers', name=label))
    
    

## Overall metrics

### Total counts

#### Number of collections in cellxgene

In [100]:
num_collections = len(pd.unique(collections_df['collection_id']))
print(num_collections)

39


#### Number of datasets in cellxgene

In [101]:
num_datasets = len(pd.unique(datasets_df['dataset_id']))
print(num_datasets)

161


#### Number of cells in cellxgene

In [102]:
num_cells = round(cells_df['cell_count'].sum())
print("{:,}".format(num_cells))

16,667,712


### Counts by month

In [103]:
# collections
collections_by_month = collections_df[['collection_dt', 'collection_id']]
collections_by_month['collections'] = 1
collections_by_month['month'] = collections_by_month['collection_dt'].dt.month
collections_by_month['year'] = collections_by_month['collection_dt'].dt.year
collections_by_month = collections_by_month[['month', 'year', 'collections']].groupby(['month', 'year']).sum().reset_index()

# datasets
datasets_by_month = datasets_df[['dataset_dt', 'dataset_id']]
datasets_by_month['datasets'] = 1
datasets_by_month['month'] = datasets_by_month['dataset_dt'].dt.month
datasets_by_month['year'] = datasets_by_month['dataset_dt'].dt.year
datasets_by_month = datasets_by_month[['month', 'year', 'datasets']].groupby(['month', 'year']).sum().reset_index()

# cells
cells_by_month = cells_df[['dataset_dt', 'cell_count']]
cells_by_month['month'] = cells_by_month['dataset_dt'].dt.month
cells_by_month['year'] = cells_by_month['dataset_dt'].dt.year
cells_by_month = cells_by_month[['month', 'year', 'cell_count']].groupby(['month', 'year']).sum().reset_index()

# merge tables together
overall = collections_by_month.merge(datasets_by_month, how = 'inner', on = ['month', 'year'])
overall = overall.merge(cells_by_month, how = 'inner', on=['month', 'year'])
overall.sort_values(by=['year', 'month']).reset_index(drop = True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,month,year,collections,datasets,cell_count
0,8,2020,4,8,865124.0
1,9,2020,2,2,644647.0
2,11,2020,1,12,187949.0
3,2,2021,2,5,113473.0
4,3,2021,17,73,4310858.0
5,4,2021,3,28,1146873.0
6,5,2021,5,6,194841.0
7,6,2021,1,5,7749204.0
8,7,2021,4,22,1454743.0


### Cumulative counts over time

In [104]:
# collections added over time
collections_df_temp = collections_df
collections_df_temp = collections_df_temp.sort_values(by=['collection_dt']).reset_index()
collections_df_temp['collection_dt'] = collections_df_temp['collection_dt'].dt.date
collections_df_temp['count'] = 1
collections_df_temp = collections_df_temp[['collection_dt', 'count']].groupby(['collection_dt']).sum().reset_index()
collections_df_temp['collections'] = collections_df_temp['count'].cumsum()

fig = go.Figure(go.Scatter(
    mode = "lines+markers",
    x = collections_df_temp['collection_dt'].tolist(),
    y = collections_df_temp['collections'].tolist()
    ))

fig.update_layout(
    title="Number of collections in cellxgene over time",
    xaxis_title="date",
    yaxis_title="number of collections"
)
             
fig.show()

In [105]:
datasets_df_temp = datasets_df
datasets_df_temp = datasets_df_temp.sort_values(by=['dataset_dt']).reset_index()
datasets_df_temp['dataset_dt'] = datasets_df_temp['dataset_dt'].dt.date
datasets_df_temp['count'] = 1
datasets_df_temp = datasets_df_temp[['dataset_dt', 'count']].groupby(['dataset_dt']).sum().reset_index()
datasets_df_temp['datasets'] = datasets_df_temp['count'].cumsum()

fig = go.Figure(go.Scatter(
    mode = "lines+markers",
    x = datasets_df_temp['dataset_dt'].tolist(),
    y = datasets_df_temp['datasets'].tolist()
    ))

fig.update_layout(
    title="Number of datasets in cellxgene over time",
    xaxis_title="date",
    yaxis_title="number of datasets"
)
             
fig.show()

In [106]:
cells_df_temp = cells_df
cells_df_temp = cells_df_temp.sort_values(by=['dataset_dt']).reset_index()
cells_df_temp['dataset_dt'] = cells_df_temp['dataset_dt'].dt.date
cells_df_temp = cells_df_temp[['dataset_dt', 'cell_count']].groupby(['dataset_dt']).sum().reset_index()
cells_df_temp['cells'] = cells_df_temp['cell_count'].cumsum()


fig = go.Figure(go.Scatter(
    mode = "lines+markers",
    x = cells_df_temp['dataset_dt'].tolist(),
    y = cells_df_temp['cells'].tolist()
    ))

fig.update_layout(
    title="Number of cells in cellxgene over time",
    xaxis_title="date",
    yaxis_title="number of cells"
)
             
fig.show()

## Number of datasets by characteristic

### Assay

In [107]:
df_assay = datasets_df[['dataset_dt', 'assay']].explode('assay').reset_index(drop = True)

# cleaning
df_assay['assay'] = df_assay['assay'].str.replace(' sequencing', '')
df_assay['assay'] = df_assay['assay'].str.replace('X', 'x')
df_assay['assay'] = df_assay['assay'].str.replace('Seq', 'seq')
df_assay['assay'] = df_assay['assay'].str.replace(' Genomics', '')
df_assay['assay'] = df_assay['assay'].str.replace(' Technology', '')
df_assay['assay'] = df_assay['assay'].str.replace(' technology', '')

df_assay['assay_grouped'] = None
df_assay.loc[df_assay['assay'].isin(['10x', '10x 3\' v2', '10x 3\' v3', '10x 5\' v1', '10x 5\' v3', '10x v2', '10x v3']), 'assay_grouped'] = '10x RNA-seq'
df_assay.loc[df_assay['assay'].isin(['Smart-seq', 'Smart-seq2', 'Smart-seq2 protocol']), 'assay_grouped'] = 'SS2'
df_assay.loc[df_assay['assay'].isin(['Drop-seq', 'microwell-seq', 'scRNA-seq', 'sci-RNA-seq', 'sci-plex', 'seq-Well']), 'assay_grouped'] = 'Other RNA-seq'
df_assay.loc[df_assay['assay'].isin(['ATC 10x v1', 'scATAC-seq']), 'assay_grouped'] = 'ATAC-seq'
df_assay.loc[df_assay['assay'].isin(['CITE-seq']), 'assay_grouped'] = 'CITE-seq'
df_assay.loc[df_assay['assay'].isin(['MERFISH', 'Visium Spatial Gene Expression']), 'assay_grouped'] = 'Spatial gene expression'

df_assay['assay'] = df_assay['assay_grouped']

In [108]:
barplot(df_assay, 'assay', 'count', False)

#### Number of datasets added using each assay type by month

In [109]:
assay_by_month = df_assay
assay_by_month['count'] = 1
assay_by_month['month'] = assay_by_month['dataset_dt'].dt.month
assay_by_month['year'] = assay_by_month['dataset_dt'].dt.year
assay_by_month = assay_by_month[['year', 'month', 'assay_grouped', 'count']].groupby(['year', 'month', 'assay_grouped']).sum().reset_index()
assay_by_month = assay_by_month.pivot(index = ['year', 'month'], columns = 'assay_grouped', values = 'count').reset_index()
assay_by_month


assay_grouped,year,month,10x RNA-seq,ATAC-seq,CITE-seq,Other RNA-seq,SS2,Spatial gene expression
0,2020,8,2.0,,,5.0,1.0,
1,2020,9,,,,2.0,,
2,2020,11,12.0,,,,,
3,2021,2,5.0,,,,,
4,2021,3,36.0,4.0,7.0,5.0,29.0,1.0
5,2021,4,20.0,,,9.0,,8.0
6,2021,5,8.0,,,,,
7,2021,6,6.0,,1.0,4.0,,
8,2021,7,32.0,,,,5.0,


In [110]:
cum_assay = df_assay
cum_assay['count'] = 1
cum_assay['date'] = cum_assay['dataset_dt'].dt.date
cum_assay = cum_assay[['date', 'assay_grouped', 'count']].groupby(['date', 'assay_grouped']).sum().reset_index()
cum_assay = cum_assay.pivot(index = ['date'], columns = 'assay_grouped', values = 'count').reset_index()

fig = go.Figure()

# add traces 
columns = ['10x RNA-seq', 'ATAC-seq', 'CITE-seq', 'Other RNA-seq', 'SS2', 'Spatial gene expression']
for col in columns:
    x_list, y_list = create_trace(cum_assay, col)
    add_trace(fig, x_list, y_list, col)


# create figure
fig.update_layout(
    title='Cumulative number of datasets by assay type',
    xaxis_title="date",
    yaxis_title="number of datasets",
    showlegend = False
)
    
fig.show()

### Disease

In [111]:
df_disease = datasets_df[['disease']].explode('disease').reset_index(drop = True)
df_disease['disease'] = df_disease['disease'].str.replace('Normal', 'normal')

barplot(df_disease, 'disease', 'count', False)

### Ethnicity

In [112]:
df_ethnicity = datasets_df[['ethnicity']].explode('ethnicity').reset_index(drop = True)
to_change = ['East Asian', 'Chinese', 'Han Asian']
for label in to_change:
    df_ethnicity['ethnicity'] = df_ethnicity['ethnicity'].str.replace(label, 'Asian')
df_ethnicity['ethnicity'] = df_ethnicity['ethnicity'].str.replace('Finnish', 'European')
df_ethnicity.loc[df_ethnicity['ethnicity']=='male', 'ethnicity'] = 'unknown'
df_ethnicity.loc[df_ethnicity['ethnicity']=='na', 'ethnicity'] = 'non-human'

barplot(df_ethnicity, 'ethnicity', 'count', False)


### Development stage

In [145]:
df_development_stage = datasets_df[['development_stage']].explode('development_stage').reset_index(drop = True)

df_development_stage['development_stage'] = df_development_stage['development_stage'].str.replace('human adult stage', 'adult')
df_development_stage['development_stage'] = df_development_stage['development_stage'].str.replace('developmental stage', 'unknown')
df_development_stage.loc[df_development_stage['development_stage'].str.contains('post-fertilization'), 'development_stage'] = 'fetal stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'[0-9]-year-old human stage'), 'development_stage'] = '<10 human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'1[0-9]-year-old human stage'), 'development_stage'] = '10s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'2[0-9]-year-old human stage'), 'development_stage'] = '20s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'3[0-9]-year-old human stage'), 'development_stage'] = '30s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'4[0-9]-year-old human stage'), 'development_stage'] = '40s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'5[0-9]-year-old human stage'), 'development_stage'] = '50s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'6[0-9]-year-old human stage'), 'development_stage'] = '60s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'7[0-9]-year-old human stage'), 'development_stage'] = '70s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'8[0-9]-year-old human stage'), 'development_stage'] = '80s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'9[0-9]-year-old human stage'), 'development_stage'] = '90s human stage'

df_development_stage.loc[df_development_stage['development_stage']=='embryonic human stage', 'order'] = 'ranka'
df_development_stage.loc[df_development_stage['development_stage']=='fetal stage', 'order'] = 'rankb'
df_development_stage.loc[df_development_stage['development_stage']=='newborn human stage', 'order'] = 'rankc'
df_development_stage.loc[df_development_stage['development_stage']=='infant stage', 'order'] = 'rankd'
df_development_stage.loc[df_development_stage['development_stage']=='child stage', 'order'] = 'ranke'
df_development_stage.loc[df_development_stage['development_stage']=='<10 human stage', 'order'] = 'rankf'
df_development_stage.loc[df_development_stage['development_stage']=='10s human stage', 'order'] = 'rankg'
df_development_stage.loc[df_development_stage['development_stage']=='20s human stage', 'order'] = 'rankh'
df_development_stage.loc[df_development_stage['development_stage']=='30s human stage', 'order'] = 'rankj'
df_development_stage.loc[df_development_stage['development_stage']=='40s human stage', 'order'] = 'rankk'
df_development_stage.loc[df_development_stage['development_stage']=='50s human stage', 'order'] = 'rankl'
df_development_stage.loc[df_development_stage['development_stage']=='60s human stage', 'order'] = 'rankm'
df_development_stage.loc[df_development_stage['development_stage']=='70s human stage', 'order'] = 'rankn'
df_development_stage.loc[df_development_stage['development_stage']=='80s human stage', 'order'] = 'ranko'
df_development_stage.loc[df_development_stage['development_stage']=='90s human stage', 'order'] = 'rankp'
df_development_stage.loc[df_development_stage['development_stage']=='human early adulthood stage', 'order'] = 'rankq'
df_development_stage.loc[df_development_stage['development_stage']=='human late adulthood stage', 'order'] = 'rankr'
df_development_stage.loc[df_development_stage['development_stage']=='adult', 'order'] = 'ranks'
df_development_stage.loc[df_development_stage['development_stage']=='unknown', 'order'] = 'rankt'

df_development_stage['count'] = 1
df_development_stage = df_development_stage.groupby(['development_stage', 'order']).sum().reset_index()


In [146]:
df_ds_nums = df_development_stage[df_development_stage['development_stage'].str.match(r'[<]*[0-9]{2}[s]* human stage')]
df_ds_nums = df_ds_nums.sort_values(by=['order'], ascending = True)

fig = go.Figure([go.Bar(x=df_ds_nums['development_stage'].tolist(), y=df_ds_nums['count'].tolist(), 
                        text=df_ds_nums['count'].tolist(), textposition='outside')])
    
    
title = 'Number of datasets by development stage (age)'

fig.update_layout(
    autosize=False,
    width=1000,
    height=750,
    title=title
)
fig.update_xaxes(
    tickangle = -60,
    title_text = 'development stage'
    )
    
fig.update_yaxes(
    title_text = "number of datasets"
    )
fig.show()

In [147]:
#df_ds_group = df_development_stage[df_development_stage['development_stage'].isin(['fetal stage', 'newborn human stage', 'infant stage', 'child stage', 'human early adulthood stage', 'human late adulthood stage', 'adult', 'unknown'])]
df_ds_group = df_development_stage[~df_development_stage['development_stage'].str.match(r'[<]*[0-9]{2}[s]* human stage')]
df_ds_group = df_ds_group.sort_values(by=['order'], ascending = True)

fig = go.Figure([go.Bar(x=df_ds_group['development_stage'].tolist(), y=df_ds_group['count'].tolist(), 
                        text=df_ds_group['count'].tolist(), textposition='outside')])
    
    
title = 'Number of datasets by development stage (group)'

fig.update_layout(
    autosize=False,
    width=1000,
    height=750,
    title=title
)
fig.update_xaxes(
    tickangle = -60,
    title_text = 'development stage'
    )
    
fig.update_yaxes(
    title_text = "number of datasets"
    )
fig.show()

### Tissue

In [117]:
df_tissues = datasets_df[['tissue']].explode('tissue').reset_index(drop = True)

df_tissues['tissue'] = df_tissues['tissue'].str.replace(' \(cell culture\)', '')
df_tissues['tissue'] = df_tissues['tissue'].str.replace('Blood', 'blood')
df_tissues['count'] = 1
df_tissues = df_tissues.groupby(['tissue']).sum().reset_index()
df_tissues = df_tissues.sort_values(by=['count'], ascending = False)
    
fig = go.Figure([go.Bar(x=df_tissues['tissue'].tolist(), y=df_tissues['count'].tolist())])
    
    
title = 'Number of datasets by tissue'
fig.update_layout(
    autosize=False,
    width=1000,
    height=750,
    title=title
)
fig.update_xaxes(
    tickangle = -60,
    title_text = 'tissues'
    )
    
fig.update_yaxes(
    title_text = "number of datasets"
    )
fig.show()


The default value of regex will change from True to False in a future version.



### Organism

In [118]:
plot_col_counts(datasets_df, 'organism')

### Sex

In [119]:
plot_col_counts(datasets_df, 'sex')