In [177]:
import pandas as pd
import requests
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime
import numpy as np

In [178]:
def get_collection_info(collection_id):
    collection_info_response = requests.get(f"https://api.cellxgene.cziscience.com/dp/v1/collections/{collection_id}")
    collection_info = collection_info_response.json()
    return collection_info

In [179]:
# get data 
response = requests.get("https://api.cellxgene.cziscience.com/dp/v1/collections")
collections = response.json()["collections"]

data = [get_collection_info(c["id"]) for c in collections]

In [180]:
# create collection table 
collection_id = []
collection_dt = []
collection_name = []

for collection in data:
    collection_id.append(collection['id'])
    collection_dt.append(collection['created_at'])
    collection_name.append(collection['name'])

collections_df = pd.DataFrame({'collection_id': collection_id, 'collection_dt': collection_dt, 'collection_name': collection_name})
collections_df['collection_dt'] = pd.to_datetime(collections_df['collection_dt'] , unit='s')

In [181]:
# create dataset table 
keys_w_label = ['assay', 'development_stage', 'disease', 'ethnicity', 'tissue']
keys_wo_label = ['collection_id', 'id', 'name', 'sex', 'cell_count', 'created_at']
dataset_data = []

for c in data:
    for d in c['datasets']:
        keys = d.keys()
        keys_wo_labels_new = list(set(keys) & set(keys_wo_label))
        dict_updated = {key: d[key] for key in keys_wo_labels_new}
        keys_w_labels_new = list(set(keys) & set(keys_w_label))
        for key in keys_w_labels_new:
            dict_temp = {key: [i['label'] for i in d[key]]}
            dict_updated.update(dict_temp)
        dict_updated.update({'organism': d['organism']['label']})
        dataset_data.append(dict_updated)

datasets_df = pd.DataFrame(dataset_data)
datasets_df = datasets_df.rename(columns = {'id': 'dataset_id', 'name': 'dataset_name', 'created_at': 'dataset_dt'})
datasets_df['dataset_dt'] = pd.to_datetime(datasets_df['dataset_dt'], unit='s')

In [182]:
dataset_id_to_name = datasets_df[['dataset_name', 'dataset_id', 'dataset_dt']]

In [183]:
# create key to dataset_id translation table
dataset_id = []
s3_uri = []
for c in data:
    for d in c['datasets']:
        for a in d['dataset_assets']:
            dataset_id.append(a['dataset_id'])
            s3_uri.append(a['s3_uri'])

id_to_s3 = pd.DataFrame({'dataset_id': dataset_id, 'key': s3_uri})
id_to_s3['key'] = id_to_s3['key'].str.replace('s3://corpora-data-prod/', '')
id_to_s3['key'] = id_to_s3['key'].str.replace('s3://hosted-cellxgene-prod/', '')

In [184]:
# downloads data
downloads_df = pd.read_csv('download_data.csv')

# remove downloads with bytessent == 1 
downloads_df = downloads_df[downloads_df['bytessent'] > 1].reset_index(drop = True)

# remove downloads from myself(ip address = 71.200.124.31)
downloads_df = downloads_df[downloads_df['remoteip'] != '71.200.124.31']

# process datetime 
downloads_df['download_datetime'] = pd.to_datetime(downloads_df['download_datetime'], format='%d/%b/%Y:%H:%M:%S %z')
downloads_df['download_dt'] = downloads_df['download_datetime'].dt.date

# merge to dataset_id
downloads_df = downloads_df.merge(id_to_s3, how = 'inner', on = 'key')

downloads_df = downloads_df[['dataset_id', 'download_datetime', 'download_dt', 'filetype', 'remoteip', 'download_agent']].reset_index(drop = True)

downloads_df['total_downloads'] = 1

downloads_df = downloads_df.groupby(['dataset_id', 'download_dt', 'filetype', 'remoteip', 'download_agent']).sum().reset_index()

In [185]:
# limit to downloads of datasets on the platform 
combined = dataset_id_to_name.merge(downloads_df, how = 'inner', on = 'dataset_id')
combined['count'] = 1

In [186]:
combined

Unnamed: 0,dataset_name,dataset_id,dataset_dt,download_dt,filetype,remoteip,download_agent,total_downloads,count
0,A Single-Cell Transcriptome Atlas of the Human...,b07e5164-baf6-43d2-bdba-5a249d0da879,2021-05-05 15:04:27.221111040,2021-05-05,h5ad,50.18.239.242,curl,1,1
1,A Single-Cell Transcriptome Atlas of the Human...,b07e5164-baf6-43d2-bdba-5a249d0da879,2021-05-05 15:04:27.221111040,2021-05-13,h5ad,193.62.202.235,curl,1,1
2,A Single-Cell Transcriptome Atlas of the Human...,b07e5164-baf6-43d2-bdba-5a249d0da879,2021-05-05 15:04:27.221111040,2021-05-20,h5ad,202.64.1.174,Win64,1,1
3,A Single-Cell Transcriptome Atlas of the Human...,b07e5164-baf6-43d2-bdba-5a249d0da879,2021-05-05 15:04:27.221111040,2021-05-24,h5ad,50.18.239.242,curl,2,1
4,A Single-Cell Transcriptome Atlas of the Human...,b07e5164-baf6-43d2-bdba-5a249d0da879,2021-05-05 15:04:27.221111040,2021-05-25,h5ad,146.107.103.119,curl,1,1
...,...,...,...,...,...,...,...,...,...
2335,Single cell analysis of mouse and human prosta...,574e9f9e-f8b4-41ef-bf19-89a9964fd9c7,2021-07-27 14:05:30.625396992,2021-08-19,h5ad,86.247.212.62,Ubuntu,2,1
2336,Single cell analysis of mouse and human prosta...,574e9f9e-f8b4-41ef-bf19-89a9964fd9c7,2021-07-27 14:05:30.625396992,2021-08-20,h5ad,86.247.212.62,Ubuntu,1,1
2337,Single cell analysis of mouse and human prosta...,574e9f9e-f8b4-41ef-bf19-89a9964fd9c7,2021-07-27 14:05:30.625396992,2021-08-21,h5ad,35.231.193.255,requests-python,1,1
2338,Single cell analysis of mouse and human prosta...,5ba85070-a41c-4184-9c18-cf34c3fd0f62,2021-07-27 14:06:02.273560064,2021-07-30,h5ad,149.165.234.129,requests-python,1,1


# Single cell download metrics

## Downloads over time


In [187]:
downloads = combined[['download_dt', 'remoteip', 'dataset_id']].drop_duplicates().reset_index(drop = True)
downloads['count'] = 1
downloads['download_dt'] = pd.to_datetime(downloads['download_dt'])

##### Total number of downloads

In [188]:
total_downloads = downloads['count'].sum()
print("{:,}".format(total_downloads))

2,207


#####  Downloads by month

In [189]:
downloads_by_month = downloads[['download_dt', 'count']]
downloads_by_month['month'] = downloads_by_month['download_dt'].dt.month
downloads_by_month['year'] = downloads_by_month['download_dt'].dt.year
downloads_by_month.groupby(['month', 'year']).sum().reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,month,year,count
0,4,2021,59
1,5,2021,426
2,6,2021,730
3,7,2021,710
4,8,2021,282


In [190]:
downloads_over_time = downloads[['download_dt', 'count']].groupby(['download_dt']).sum().reset_index()
downloads_over_time['cum_downloads'] = downloads_over_time['count'].cumsum()

fig = go.Figure(go.Scatter(
    mode = "lines+markers",
    x = downloads_over_time['download_dt'].tolist(),
    y = downloads_over_time['cum_downloads'].tolist()
    ))

fig.update_layout(
    title="Number of dataset downloads over time",
    xaxis_title="date",
    yaxis_title="number of downloads"
)
             
fig.show()

## Most downloaded datasets

### Raw downloads

In [191]:
top_ds_raw = combined[['dataset_id', 'dataset_name','download_dt','remoteip']].drop_duplicates()
top_ds_raw['raw downloads'] = 1
top_ds_raw = top_ds_raw[['dataset_id', 'dataset_name', 'raw downloads']].groupby(['dataset_id', 'dataset_name']).sum().reset_index()
top_ds_raw = top_ds_raw.sort_values(by=['raw downloads'], ascending = False).reset_index(drop = True)
top_ds_raw[0:10]


Unnamed: 0,dataset_id,dataset_name,raw downloads
0,f72958f5-7f42-4ebb-98da-445b0c6de516,Azimuth meta-analysis of 10 datasets of health...,70
1,66d15835-5dc8-4e96-b0eb-f48971cb65e8,Single cell transcriptome analysis of human pa...,66
2,9df60c57-fdf3-4e93-828e-fe9303f20438,Single cell transcriptional and chromatin acce...,61
3,21d3e683-80a4-4d9b-bc89-ebb2df513dde,Time-resolved Systems Immunology Reveals a Lat...,56
4,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,"Krasnow Lab Human Lung Cell Atlas, 10X",53
5,13a027de-ea3e-432b-9a5e-6bc7048498fc,Single cell transcriptional and chromatin acce...,48
6,5a11f879-d1ef-458a-910c-9b0bdfca5ebf,Tabula Sapiens - Endothelial,42
7,b83559d1-156f-4ba9-9f6a-b165f83ef43f,Single-cell RNA-Seq Investigation of Foveal an...,37
8,53d208b0-2cfd-4366-9866-c3c6114081bc,Tabula Sapiens - All Cells,36
9,b07e5164-baf6-43d2-bdba-5a249d0da879,A Single-Cell Transcriptome Atlas of the Human...,35


In [192]:
first_ds_raw = combined[['dataset_id', 'dataset_name','download_dt','remoteip']].drop_duplicates()
first_ds_raw = first_ds_raw.sort_values(['remoteip', 'dataset_id', 'download_dt']).drop_duplicates(subset=['remoteip', 'dataset_id'], keep = 'first').reset_index(drop = True)
first_ds_raw['raw downloads'] = 1
first_ds_raw = first_ds_raw.groupby(['dataset_id', 'dataset_name']).sum().reset_index()
first_ds_raw = first_ds_raw.sort_values(by=['raw downloads'], ascending = False).reset_index(drop = True)
first_ds_raw[0:10]

Unnamed: 0,dataset_id,dataset_name,raw downloads
0,f72958f5-7f42-4ebb-98da-445b0c6de516,Azimuth meta-analysis of 10 datasets of health...,59
1,21d3e683-80a4-4d9b-bc89-ebb2df513dde,Time-resolved Systems Immunology Reveals a Lat...,50
2,9df60c57-fdf3-4e93-828e-fe9303f20438,Single cell transcriptional and chromatin acce...,48
3,66d15835-5dc8-4e96-b0eb-f48971cb65e8,Single cell transcriptome analysis of human pa...,42
4,5a11f879-d1ef-458a-910c-9b0bdfca5ebf,Tabula Sapiens - Endothelial,40
5,13a027de-ea3e-432b-9a5e-6bc7048498fc,Single cell transcriptional and chromatin acce...,38
6,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,"Krasnow Lab Human Lung Cell Atlas, 10X",38
7,53d208b0-2cfd-4366-9866-c3c6114081bc,Tabula Sapiens - All Cells,34
8,30cd5311-6c09-46c9-94f1-71fe4b91813c,Time-resolved Systems Immunology Reveals a Lat...,29
9,b83559d1-156f-4ba9-9f6a-b165f83ef43f,Single-cell RNA-Seq Investigation of Foveal an...,27


### Normalized

In [193]:
top_ds_norm = combined[['dataset_id', 'dataset_name','download_dt','remoteip']].drop_duplicates()
top_ds_norm['raw downloads'] = 1
top_ds_norm = top_ds_norm.groupby(['dataset_id', 'dataset_name']).sum().reset_index()
top_ds_norm = top_ds_norm.merge(datasets_df[['dataset_id', 'dataset_dt']], how = 'inner', on = 'dataset_id')

top_ds_norm['date_since'] = top_ds_norm['dataset_dt']
top_ds_norm.loc[top_ds_norm['dataset_dt'] < '2021-04-27', 'date_since'] = datetime.datetime(2021, 4, 27)
top_ds_norm['date_since'] = top_ds_norm['date_since'].dt.date

top_ds_norm['days_on_platform'] = datetime.date.today() - top_ds_norm['date_since']
top_ds_norm['days_on_platform'] = top_ds_norm['days_on_platform'].dt.days
top_ds_norm['value'] = ((top_ds_norm['raw downloads']/top_ds_norm['days_on_platform'])*100).astype(int)

top_ds_norm = top_ds_norm[['dataset_id', 'dataset_name', 'raw downloads', 'value']].sort_values(by=['value'], ascending = False).reset_index(drop = True)
top_ds_norm[0:10]

Unnamed: 0,dataset_id,dataset_name,raw downloads,value
0,5a11f879-d1ef-458a-910c-9b0bdfca5ebf,Tabula Sapiens - Endothelial,42,107
1,53d208b0-2cfd-4366-9866-c3c6114081bc,Tabula Sapiens - All Cells,36,94
2,f72958f5-7f42-4ebb-98da-445b0c6de516,Azimuth meta-analysis of 10 datasets of health...,70,89
3,c5d88abe-f23a-45fa-a534-788985e93dad,Tabula Sapiens - Immune,30,76
4,66d15835-5dc8-4e96-b0eb-f48971cb65e8,Single cell transcriptome analysis of human pa...,66,66
5,9df60c57-fdf3-4e93-828e-fe9303f20438,Single cell transcriptional and chromatin acce...,61,50
6,21d3e683-80a4-4d9b-bc89-ebb2df513dde,Time-resolved Systems Immunology Reveals a Lat...,56,46
7,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,"Krasnow Lab Human Lung Cell Atlas, 10X",53,43
8,a1b9c51e-a408-4f7f-bccb-abefe20ae2a5,Olah et al (2020) Single-cell Human Microglia,15,41
9,13a027de-ea3e-432b-9a5e-6bc7048498fc,Single cell transcriptional and chromatin acce...,48,39


## Download pattern of datasets

In [194]:
def create_trace_cum_daily(dataset_id):
    output = downloads_df[downloads_df['dataset_id'] == dataset_id]
    output = output[['download_dt', 'remoteip']].drop_duplicates()
    output['count'] = 1
    output = output[['download_dt', 'count']].groupby(['download_dt']).sum().reset_index()
    output['cum_downloads'] = output['count'].cumsum()
    return (output['download_dt'].tolist(), output['cum_downloads'].tolist())

def add_trace(fig, x_list, y_list, label):
    fig.add_trace(go.Scatter(x=x_list, y=y_list,
                    mode='lines+markers', 
                    hovertemplate = 
                            '<i>Date</i>: %{x}'+
                            '<br>Downloads</i>: %{y}<br>'+
                            '<br>%{text}</i>', text = [label for i in range(len(x_list))]))

    
def dataset_download_trends(d_ids, d_names, title):
    x_lists = []
    y_lists = []
    for id in d_ids:
        x, y = create_trace_cum_daily(id)

        x_lists.append(x)
        y_lists.append(y)
    
    fig = go.Figure()

    for i in range(len(d_ids)):
        add_trace(fig, x_lists[i], y_lists[i], d_names[i])

    fig.update_layout(
        title=title,
        xaxis_title="date",
        yaxis_title="number of downloads",
        showlegend = False
    )
    
    fig.show()

### Raw downloads
* Version 1: daily unique dataset downloads by remoteip
* Version 2: first download of each dataset by remoteip

#### Version 1
* Actively downloaded: downloaded 25 times or more
* Moderately downloaded: downloaded 10 to 25 times
* Lightly downloaded: downloaded less than 10 times

##### Evaluate download groups

In [195]:
# evaluate how to define download groups 
x = top_ds_raw['raw downloads'].tolist()
fig = go.Figure(data=[go.Histogram(x=x)])
fig.update_layout(
        xaxis_title="number of downloads",
        yaxis_title="raw downloads",
        showlegend = False
    )
fig.show()

##### Actively downloaded

In [196]:
top10_raw = top_ds_raw[0:10]
active_raw_ids = top10_raw['dataset_id'].tolist()
active_raw_names = top10_raw['dataset_name'].tolist()

dataset_download_trends(active_raw_ids, active_raw_names, 'Trends of top 10 actively downloaded datasets (cumulative) - raw downloads')


##### Moderately downloaded

In [197]:
top10_raw = top_ds_raw[top_ds_raw['raw downloads'] <25][0:10]
active_raw_ids = top10_raw['dataset_id'].tolist()
active_raw_names = top10_raw['dataset_name'].tolist()

dataset_download_trends(active_raw_ids, active_raw_names, 'Trends of top 10 moderately downloaded datasets (cumulative) - raw downloads')


##### Lightly downloaded

In [198]:
top10_raw = top_ds_raw[top_ds_raw['raw downloads'] <10][0:10]
active_raw_ids = top10_raw['dataset_id'].tolist()
active_raw_names = top10_raw['dataset_name'].tolist()

dataset_download_trends(active_raw_ids, active_raw_names, 'Trends of top 10 lightly downloaded datasets (cumulative) - raw downloads')


#### Version 2
* Actively downloaded: downloaded 9 times or more
* Moderately downloaded: downloaded 5 to 8 times
* Lightly downloaded: downloaded less than 5 times

In [199]:
#len(top_ds_norm[(top_ds_norm['value'] > 4) & (top_ds_norm['value'] < 9)]['dataset_id']) (52)
#len(first_ds_raw[(first_ds_raw['raw downloads'] < 5)]['dataset_id']) (56)
len(first_ds_raw[(first_ds_raw['raw downloads'] > 8)]['dataset_id']) (49)

TypeError: 'int' object is not callable

##### Evaluate download groups

In [None]:
# evaluate how to define download groups 
x = first_ds_raw['raw downloads'].tolist()
fig = go.Figure(data=[go.Histogram(x=x)])
fig.update_layout(
        xaxis_title="number of downloads",
        yaxis_title="raw downloads",
        showlegend = False
    )
fig.show()

##### Actively downloaded

In [None]:
top_10_first_raw = first_ds_raw[0:10]
active_raw_ids = top_10_first_raw['dataset_id'].tolist()
active_raw_names = top_10_first_raw['dataset_name'].tolist()

dataset_download_trends(active_raw_ids, active_raw_names, 'Trends of top 10 actively downloaded datasets (cumulative) - raw first downloads')


##### Moderately downloaded

##### Lightly downloaded

### Normalized downloads
* Actively downloaded: downloaded more than 10 times
* Moderately downloaded: downloaded 6 to 10 times
* Lightly downloaded: downloaded 5 or less than times

#### Evaluate download groups

In [None]:
# evaluate how to define download groups 
x = top_ds_norm['value'].tolist()
fig = go.Figure(data=[go.Histogram(x=x)])
fig.update_layout(
        xaxis_title="normalized value",
        yaxis_title="count",
        showlegend = False
    )
fig.show()

##### Actively downloaded

In [None]:
top10_norm = top_ds_norm[0:10]
active_norm_ids = top10_norm['dataset_id'].tolist()
active_norm_names = top10_norm['dataset_name'].tolist()

dataset_download_trends(active_norm_ids, active_norm_names, 'Trends of top 10 actively downloaded datasets (cumulative) - normalized')


##### Moderately downloaded

In [None]:
top10_norm = top_ds_norm[top_ds_norm['value'] < 11][0:10]
active_norm_ids = top10_norm['dataset_id'].tolist()
active_norm_names = top10_norm['dataset_name'].tolist()

dataset_download_trends(active_norm_ids, active_norm_names, 'Trends of top 10 moderately downloaded datasets (cumulative) - normalized')


##### Lightly downloaded

In [None]:
top10_norm = top_ds_norm[top_ds_norm['value'] < 6][0:10]
active_norm_ids = top10_norm['dataset_id'].tolist()
active_norm_names = top10_norm['dataset_name'].tolist()

dataset_download_trends(active_norm_ids, active_norm_names, 'Trends of top 10 moderately downloaded datasets (cumulative) - normalized')


## Most active users

#### Number of unique users

In [None]:
len(combined['remoteip'].drop_duplicates().tolist())

In [None]:
users = combined[['dataset_id', 'dataset_name','download_dt','remoteip']].drop_duplicates()
users['count'] = 1
users = users[['remoteip', 'count']].groupby(['remoteip']).sum().reset_index()
users= users.sort_values(by=['count'], ascending = False).reset_index(drop = True)
users['total'] = users['count'].sum()
users['perc'] = round(users['count']/users['total'], 2)
users[['remoteip', 'count', 'perc']][:10]

## Downloads by dataset characteristic (WIP)

In [None]:
def calc_normalized(df, col):
    df['total'] = 1
    df = df.groupby([col]).sum().reset_index()
    df['normalized'] = (df['value']/df['total']).astype(int)
    df = df.rename(columns={'count': 'raw downloads'})
    df['raw downloads'] = df['raw downloads'].astype(int)
    return df

def multi_bar_plots(df, col, title):
    fig = make_subplots(rows=1, cols=2)

    # add traces
    fig.add_trace(
        go.Bar(name = 'normalized', x=df[col].tolist(), y=df['normalized'].tolist(), 
                       text=df['normalized'].tolist(), textposition='outside', 
                       hovertemplate = 
                            'Category: %{x}'+
                            '<br>Value: %{y}<br>'),
        row = 1, col = 1
        )

    fig.add_trace(
        go.Bar(name = 'raw downloads', x=df[col].tolist(), y=df['raw downloads'].tolist(), 
                      text=df['raw downloads'].tolist(), textposition='outside',
                      hovertemplate = 
                            'Category: %{x}'+
                            '<br>Downloads: %{y}<br>'),
        row = 1, col = 2
        )

    # update xaxis properties
    fig.update_xaxes(title_text=col, row = 1, col = 1, tickangle = 35)
    fig.update_xaxes(title_text=col, row = 1, col = 2, tickangle = 35)

    # update yaxis properties
    fig.update_yaxes(title_text='value', row = 1, col = 1)
    fig.update_yaxes(title_text='raw downloads', row = 1, col = 2)


    fig.update_layout(
        title=title,
        legend=dict(
            x = 0.35, y = -0.4,
            orientation = 'h'
        ),
        autosize=False,
        width=1000,
        height=750
    )
    fig.show()

def create_multi_barplots(df, col):
    output = calc_normalized(df, col)
    
    output = output.sort_values(by=['raw downloads'], ascending = False)
    title = 'Downloads by ' + col
    
    multi_bar_plots(output, col, title)

In [None]:
df_char = downloads.groupby(['dataset_id']).sum().reset_index()
df_char = datasets_df.merge(df_char, how = 'left', on = 'dataset_id')

df_char['date_since'] = df_char['dataset_dt']
df_char.loc[df_char['dataset_dt'] < '2021-04-22', 'date_since'] = datetime.datetime(2021, 4, 27)
df_char['date_since'] = df_char['date_since'].dt.date

df_char['days_on_platform'] = datetime.date.today() - df_char['date_since']
df_char['days_on_platform'] = df_char['days_on_platform'].dt.days
df_char['value'] = (df_char['count']/df_char['days_on_platform'])*100


In [None]:
# assay
df_assay = df_char[['assay', 'count', 'value']].explode('assay').reset_index(drop = True) 
df_assay['assay'] = df_assay['assay'].str.replace(' sequencing', '')
df_assay['assay'] = df_assay['assay'].str.replace('X', 'x')
df_assay['assay'] = df_assay['assay'].str.replace('Seq', 'seq')
df_assay['assay'] = df_assay['assay'].str.replace(' Genomics', '')
df_assay['assay'] = df_assay['assay'].str.replace(' Technology', '')
df_assay['assay'] = df_assay['assay'].str.replace(' technology', '')

df_assay['assay_grouped'] = None
df_assay.loc[df_assay['assay'].isin(['10x', '10x 3\' v2', '10x 3\' v3', '10x 5\' v1', '10x 5\' v3', '10x v2', '10x v3']), 'assay_grouped'] = '10x RNA-seq'
df_assay.loc[df_assay['assay'].isin(['Smart-seq', 'Smart-seq2', 'Smart-seq2 protocol']), 'assay_grouped'] = 'SS2'
df_assay.loc[df_assay['assay'].isin(['Drop-seq', 'microwell-seq', 'scRNA-seq', 'sci-RNA-seq', 'sci-plex', 'seq-Well']), 'assay_grouped'] = 'Other RNA-seq'
df_assay.loc[df_assay['assay'].isin(['ATAC 10x v1', 'scATAC-seq']), 'assay_grouped'] = 'ATAC-seq'
df_assay.loc[df_assay['assay'].isin(['CITE-seq']), 'assay_grouped'] = 'CITE-seq'
df_assay.loc[df_assay['assay'].isin(['MERFISH', 'Visium Spatial Gene Expression']), 'assay_grouped'] = 'Spatial gene expression'

df_assay['assay'] = df_assay['assay_grouped']
create_multi_barplots(df_assay, 'assay')

In [None]:
# disease
df_disease = df_char[['disease', 'count', 'value']].explode('disease').reset_index(drop = True) 
df_disease['disease'] = df_disease['disease'].str.replace('Normal', 'normal')

create_multi_barplots(df_disease, 'disease')

In [None]:
# ethnicity
df_ethnicity = df_char[['ethnicity', 'count', 'value']].explode('ethnicity').reset_index(drop = True) 
to_change = ['East Asian', 'Chinese', 'Han Asian']
for label in to_change:
    df_ethnicity['ethnicity'] = df_ethnicity['ethnicity'].str.replace(label, 'Asian')
    df_ethnicity['ethnicity'] = df_ethnicity['ethnicity'].str.replace('Finnish', 'European')
    df_ethnicity.loc[df_ethnicity['ethnicity']=='male', 'ethnicity'] = 'unknown'
    df_ethnicity.loc[df_ethnicity['ethnicity']=='na', 'ethnicity'] = 'non-human'

create_multi_barplots(df_ethnicity, 'ethnicity')

In [None]:
# development stage
df_development_stage = df_char[['development_stage', 'count', 'value']].explode('development_stage').reset_index(drop = True)

df_development_stage['development_stage'] = df_development_stage['development_stage'].str.replace('human adult stage', 'adult')
df_development_stage['development_stage'] = df_development_stage['development_stage'].str.replace('developmental stage', 'unknown')
df_development_stage.loc[df_development_stage['development_stage'].str.contains('post-fertilization'), 'development_stage'] = 'fetal stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'[0-9]-year-old human stage'), 'development_stage'] = '<10 human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'1[0-9]-year-old human stage'), 'development_stage'] = '10s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'2[0-9]-year-old human stage'), 'development_stage'] = '20s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'3[0-9]-year-old human stage'), 'development_stage'] = '30s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'4[0-9]-year-old human stage'), 'development_stage'] = '40s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'5[0-9]-year-old human stage'), 'development_stage'] = '50s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'6[0-9]-year-old human stage'), 'development_stage'] = '60s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'7[0-9]-year-old human stage'), 'development_stage'] = '70s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'8[0-9]-year-old human stage'), 'development_stage'] = '80s human stage'
df_development_stage.loc[df_development_stage['development_stage'].str.match(r'9[0-9]-year-old human stage'), 'development_stage'] = '90s human stage'

df_development_stage.loc[df_development_stage['development_stage']=='embryonic human stage', 'order'] = 'ranka'
df_development_stage.loc[df_development_stage['development_stage']=='fetal stage', 'order'] = 'rankb'
df_development_stage.loc[df_development_stage['development_stage']=='newborn human stage', 'order'] = 'rankc'
df_development_stage.loc[df_development_stage['development_stage']=='infant stage', 'order'] = 'rankd'
df_development_stage.loc[df_development_stage['development_stage']=='child stage', 'order'] = 'ranke'
df_development_stage.loc[df_development_stage['development_stage']=='<10 human stage', 'order'] = 'rankf'
df_development_stage.loc[df_development_stage['development_stage']=='10s human stage', 'order'] = 'rankg'
df_development_stage.loc[df_development_stage['development_stage']=='20s human stage', 'order'] = 'rankh'
df_development_stage.loc[df_development_stage['development_stage']=='30s human stage', 'order'] = 'rankj'
df_development_stage.loc[df_development_stage['development_stage']=='40s human stage', 'order'] = 'rankk'
df_development_stage.loc[df_development_stage['development_stage']=='50s human stage', 'order'] = 'rankl'
df_development_stage.loc[df_development_stage['development_stage']=='60s human stage', 'order'] = 'rankm'
df_development_stage.loc[df_development_stage['development_stage']=='70s human stage', 'order'] = 'rankn'
df_development_stage.loc[df_development_stage['development_stage']=='80s human stage', 'order'] = 'ranko'
df_development_stage.loc[df_development_stage['development_stage']=='90s human stage', 'order'] = 'rankp'
df_development_stage.loc[df_development_stage['development_stage']=='human early adulthood stage', 'order'] = 'rankq'
df_development_stage.loc[df_development_stage['development_stage']=='human late adulthood stage', 'order'] = 'rankr'
df_development_stage.loc[df_development_stage['development_stage']=='adult', 'order'] = 'ranks'
df_development_stage.loc[df_development_stage['development_stage']=='unknown', 'order'] = 'rankt'

df_development_stage['total'] = 1
df_development_stage = df_development_stage.groupby(['development_stage', 'order']).sum().reset_index()
df_development_stage['normalized'] = (df_development_stage['value']/df_development_stage['total']).astype(int)
df_development_stage = df_development_stage.rename(columns={'count': 'raw downloads'})
df_development_stage['raw downloads'] = df_development_stage['raw downloads'].astype(int)

In [None]:
df_ds_nums = df_development_stage[df_development_stage['development_stage'].str.match(r'[<]*[0-9]{2}[s]* human stage')]
df_ds_nums = df_ds_nums.sort_values(by=['order'], ascending = True)

multi_bar_plots(df_ds_nums, 'development_stage', 'Downloads by development stage (age)')

In [None]:
df_ds_group = df_development_stage[~df_development_stage['development_stage'].str.match(r'[<]*[0-9]{2}[s]* human stage')]
df_ds_group = df_ds_group.sort_values(by=['order'], ascending = True)

multi_bar_plots(df_ds_group, 'development_stage', 'Downloads by development stage (group)')

In [None]:
# tissue
df_tissue = df_char[['tissue', 'count', 'value']].explode('tissue').reset_index(drop = True) 

df_tissue['tissue'] = df_tissue['tissue'].str.replace(' \(cell culture\)', '')
df_tissue['tissue'] = df_tissue['tissue'].str.replace('Blood', 'blood')

tissue_output = calc_normalized(df_tissue, 'tissue')



In [None]:
tissue_output = tissue_output.sort_values(by=['raw downloads'], ascending = False)
fig = go.Figure([go.Bar(name = 'normalized', x=tissue_output['tissue'].tolist(), 
                        y=tissue_output['normalized'].tolist(),
                        hovertemplate = 
                            'Category: %{x}'+
                            '<br>Value: %{y}<br>')])

fig.update_xaxes(title_text='tissue', tickangle = 60)

fig.update_yaxes(title_text='value')

fig.update_layout(
    title='Downloads by tissue',
    legend=dict(
        x = 0.35, y = -0.4,
        orientation = 'h'
    ),
    autosize=False,
    width=1000,
    height=750,
    showlegend = True
)
fig.show()


In [None]:
tissue_output = tissue_output.sort_values(by=['raw downloads'], ascending = False)
fig = go.Figure([go.Bar(name = 'raw downloads', x=tissue_output['tissue'].tolist(), 
                        y=tissue_output['raw downloads'].tolist(), 
                        hovertemplate = 
                            'Category: %{x}'+
                            '<br>Downloads: %{y}<br>',
                        marker_color='rgb(239, 50, 50)')])

fig.update_xaxes(title_text='tissue', tickangle = 60)

fig.update_yaxes(title_text='raw downloads')

fig.update_layout(
    title='Downloads by tissue',
    legend=dict(
        x = 0.35, y = -0.4,
        orientation = 'h'
    ),
    autosize=False,
    width=1000,
    height=750,
    showlegend = True
)
fig.show()

In [None]:
# organism
df_organism = df_char[['organism', 'count', 'value']].explode('organism').reset_index(drop = True) 

create_multi_barplots(df_organism, 'organism')

In [None]:
# sex
df_sex = df_char[['sex', 'count', 'value']].explode('sex').reset_index(drop = True) 

create_multi_barplots(df_sex, 'sex')