In [146]:
import pandas as pd
import requests
import plotly.graph_objects as go
import plotly.express as px
import datetime
import numpy as np

In [2]:
def get_collection_info(collection_id, timeout=30):
    """Fetch the record of a single collection from the cellxgene API.

    Args:
        collection_id: id interpolated into the collections endpoint URL.
        timeout: seconds to wait for the server before giving up, so one
            stalled request cannot hang the whole notebook run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    collection_info_response = requests.get(
        f"https://api.cellxgene.cziscience.com/dp/v1/collections/{collection_id}",
        timeout=timeout,
    )
    # Fail here, with the HTTP status in the message, instead of handing an
    # error payload to the cells that index into the result.
    collection_info_response.raise_for_status()
    return collection_info_response.json()

In [3]:
# Fetch the collection index, then the full record of every listed collection.
response = requests.get("https://api.cellxgene.cziscience.com/dp/v1/collections")
collections = response.json()["collections"]

data = list(map(get_collection_info, (entry["id"] for entry in collections)))

In [4]:
# Collection table: one row per collection record in `data`.
collection_id = [record['id'] for record in data]
collection_dt = [record['created_at'] for record in data]
collection_name = [record['name'] for record in data]

collections_df = pd.DataFrame({
    'collection_id': collection_id,
    'collection_dt': collection_dt,
    'collection_name': collection_name,
})
# `created_at` is parsed as a count of seconds (unit='s').
collections_df['collection_dt'] = pd.to_datetime(collections_df['collection_dt'], unit='s')

In [5]:
# create dataset table
# keys_w_label: fields holding a list of entries, each of which has a 'label'.
# keys_wo_label: fields copied into the table unchanged.
keys_w_label = ['assay', 'development_stage', 'disease', 'ethnicity', 'tissue']
keys_wo_label = ['collection_id', 'id', 'name', 'sex', 'cell_count', 'created_at']
dataset_data = []

for c in data:
    for d in c['datasets']:
        # Walk the key lists in their declared order rather than through a set
        # intersection: set iteration order of strings is not stable between
        # interpreter runs, which made the column order of the table vary.
        dict_updated = {key: d[key] for key in keys_wo_label if key in d}
        for key in keys_w_label:
            if key in d:
                dict_updated[key] = [i['label'] for i in d[key]]
        # organism is a single entry, not a list of entries.
        dict_updated['organism'] = d['organism']['label']
        dataset_data.append(dict_updated)

datasets_df = pd.DataFrame(dataset_data)
datasets_df = datasets_df.rename(columns = {'id': 'dataset_id', 'name': 'dataset_name', 'created_at': 'dataset_dt'})
# `created_at` is parsed as a count of seconds (unit='s').
datasets_df['dataset_dt'] = pd.to_datetime(datasets_df['dataset_dt'], unit='s')

In [6]:
# Lookup table used to attach dataset names and creation dates to downloads.
dataset_id_to_name = datasets_df.loc[:, ['dataset_name', 'dataset_id', 'dataset_dt']]

In [7]:
# downloads data
downloads_df = pd.read_csv('download_data.csv')

# remove downloads from datasets no longer on the platform
downloads_df = downloads_df[downloads_df['filetype'].notnull()]

# keep only downloads that sent more than 1 byte
downloads_df = downloads_df[downloads_df['bytessent'] > 1]

# remove downloads from myself(ip address = 71.200.124.31)
downloads_df = downloads_df[downloads_df['remoteip'] != '71.200.124.31']

# Renumber once after all filters. drop=True keeps the old row labels from
# being added as stray 'index' / 'level_0' columns, and the fresh frame can be
# assigned to without a SettingWithCopyWarning.
downloads_df = downloads_df.reset_index(drop = True)

# process datetime 
downloads_df['download_datetime'] = pd.to_datetime(downloads_df['download_datetime'], format='%d/%b/%Y:%H:%M:%S %z')
downloads_df['download_dt'] = downloads_df['download_datetime'].dt.date

downloads_df = downloads_df[['dataset_id', 'download_datetime', 'download_dt', 'filetype', 'remoteip', 'download_agent']].reset_index(drop = True)

downloads_df['total_downloads'] = 1

# Sum only the counter column: `download_datetime` is still in the frame and a
# datetime column cannot be summed.
# NOTE(review): groupby drops rows whose key is NaN by default, so downloads
# with a missing `download_agent` would vanish here - confirm that is intended.
downloads_df = (
    downloads_df
    .groupby(['dataset_id', 'download_dt', 'filetype', 'remoteip', 'download_agent'])['total_downloads']
    .sum()
    .reset_index()
)


In [8]:
# Inner join: only downloads whose dataset_id is present in the dataset table survive.
combined = pd.merge(dataset_id_to_name, downloads_df, how='inner', on='dataset_id')
combined = combined.assign(count=1)

# Single cell download metrics

## Downloads over time


In [84]:
# One row per distinct (day, ip, dataset) combination.
downloads = (
    combined[['download_dt', 'remoteip', 'dataset_id']]
    .drop_duplicates()
    .assign(count=1)
)
downloads = downloads.assign(download_dt=pd.to_datetime(downloads['download_dt']))

##### Total number of downloads

In [85]:
# Every row of `downloads` carries count == 1, so the sum is the row count.
total_downloads = downloads['count'].sum()
print(f"{total_downloads:,}")

1,753


#####  Downloads by month

In [86]:
# `downloads_by_month` was never defined in the notebook, so this cell failed
# with a NameError on a fresh kernel. Build it from the de-duplicated
# `downloads` table (whose counts add up to the total printed above).
downloads_by_month = downloads[['download_dt', 'count']].copy()
downloads_by_month['month'] = downloads_by_month['download_dt'].dt.month
downloads_by_month['year'] = downloads_by_month['download_dt'].dt.year
# Sum only `count` (the datetime column cannot be summed) and order the result
# chronologically so data spanning several years is not sorted month-first.
(
    downloads_by_month
    .groupby(['month', 'year'])['count']
    .sum()
    .reset_index()
    .sort_values(by=['year', 'month'])
    .reset_index(drop=True)
)

Unnamed: 0,month,year,count
0,4,2021,51
1,5,2021,378
2,6,2021,560
3,7,2021,636
4,8,2021,128


In [87]:
# Daily totals, then a running total across days (groupby sorts by date).
downloads_over_time = (
    downloads
    .groupby('download_dt')[['count']]
    .sum()
    .reset_index()
)
downloads_over_time['cum_downloads'] = downloads_over_time['count'].cumsum()

cumulative_dates = downloads_over_time['download_dt'].tolist()
cumulative_totals = downloads_over_time['cum_downloads'].tolist()
fig = go.Figure(go.Scatter(x=cumulative_dates, y=cumulative_totals, mode="lines+markers"))

fig.update_layout(
    title="Number of dataset downloads over time",
    xaxis_title="date",
    yaxis_title="number of downloads",
)

fig.show()

## Most downloaded datasets

### Raw downloads

In [76]:
# Count distinct (day, ip) downloads per dataset and rank datasets by that count.
top_ds_raw = (
    combined[['dataset_id', 'dataset_name', 'download_dt', 'remoteip']]
    .drop_duplicates()
    .assign(**{'raw downloads': 1})
    .groupby(['dataset_id', 'dataset_name'])[['raw downloads']]
    .sum()
    .reset_index()
    .sort_values(by=['raw downloads'], ascending=False)
    .reset_index(drop=True)
)
top_ds_raw.head(10)


Unnamed: 0,dataset_id,dataset_name,raw downloads
0,f72958f5-7f42-4ebb-98da-445b0c6de516,Azimuth meta-analysis of 10 datasets of health...,66
1,66d15835-5dc8-4e96-b0eb-f48971cb65e8,Single cell transcriptome analysis of human pa...,65
2,9df60c57-fdf3-4e93-828e-fe9303f20438,Single cell transcriptional and chromatin acce...,60
3,21d3e683-80a4-4d9b-bc89-ebb2df513dde,Time-resolved Systems Immunology Reveals a Lat...,53
4,13a027de-ea3e-432b-9a5e-6bc7048498fc,Single cell transcriptional and chromatin acce...,47
5,b83559d1-156f-4ba9-9f6a-b165f83ef43f,Single-cell RNA-Seq Investigation of Foveal an...,36
6,b07e5164-baf6-43d2-bdba-5a249d0da879,A Single-Cell Transcriptome Atlas of the Human...,35
7,53d208b0-2cfd-4366-9866-c3c6114081bc,Tabula Sapiens - All Cells,34
8,6cda3b13-7257-45b9-ac20-0a7e6697e4f2,scRNA-seq data analysis of HUVECs treated with...,29
9,30cd5311-6c09-46c9-94f1-71fe4b91813c,Time-resolved Systems Immunology Reveals a Lat...,29


### Normalized

In [143]:
# Raw downloads per dataset: one per distinct (day, ip) pair.
top_ds_norm = (
    combined[['dataset_id', 'dataset_name', 'download_dt', 'remoteip']]
    .drop_duplicates()
    .assign(**{'raw downloads': 1})
)
# Sum only the counter column; `download_dt` and `remoteip` are not numeric
# and must not take part in the aggregation.
top_ds_norm = top_ds_norm.groupby(['dataset_id', 'dataset_name'])[['raw downloads']].sum().reset_index()
top_ds_norm = top_ds_norm.merge(datasets_df[['dataset_id', 'dataset_dt']], how = 'inner', on = 'dataset_id')

# Datasets created before 2021-04-27 are counted from that date instead.
# NOTE(review): presumably the first day covered by the download log - confirm.
tracking_start = pd.Timestamp(2021, 4, 27)
top_ds_norm['date_since'] = top_ds_norm['dataset_dt']
top_ds_norm.loc[top_ds_norm['dataset_dt'] < tracking_start, 'date_since'] = tracking_start
# Truncate to midnight but stay in datetime64, so the subtraction below yields
# a timedelta column on which `.dt.days` is always available.
top_ds_norm['date_since'] = top_ds_norm['date_since'].dt.normalize()

# NOTE(review): the reference date is "today", so `value` changes with the day
# the notebook is run.
as_of = pd.Timestamp(datetime.date.today())
top_ds_norm['days_on_platform'] = (as_of - top_ds_norm['date_since']).dt.days
# A dataset created today would have 0 days -> division by zero -> inf, which
# cannot be cast to int; count it as one day.
top_ds_norm['days_on_platform'] = top_ds_norm['days_on_platform'].clip(lower=1)
# value = downloads per day on the platform, times 100, truncated to an integer.
top_ds_norm['value'] = ((top_ds_norm['raw downloads']/top_ds_norm['days_on_platform'])*100).astype(int)

top_ds_norm = top_ds_norm[['dataset_id', 'dataset_name', 'raw downloads', 'value']].sort_values(by=['value'], ascending = False).reset_index(drop = True)
top_ds_norm[0:10]

Unnamed: 0,dataset_id,dataset_name,raw downloads,value
0,53d208b0-2cfd-4366-9866-c3c6114081bc,Tabula Sapiens - All Cells,34,94
1,f72958f5-7f42-4ebb-98da-445b0c6de516,Azimuth meta-analysis of 10 datasets of health...,66,86
2,66d15835-5dc8-4e96-b0eb-f48971cb65e8,Single cell transcriptome analysis of human pa...,65,67
3,9df60c57-fdf3-4e93-828e-fe9303f20438,Single cell transcriptional and chromatin acce...,60,50
4,21d3e683-80a4-4d9b-bc89-ebb2df513dde,Time-resolved Systems Immunology Reveals a Lat...,53,44
5,13a027de-ea3e-432b-9a5e-6bc7048498fc,Single cell transcriptional and chromatin acce...,47,39
6,5a11f879-d1ef-458a-910c-9b0bdfca5ebf,Tabula Sapiens - Endothelial,13,35
7,9dbab10c-118d-496b-966a-67f1763a6b7d,Large-scale single-cell analysis reveals criti...,27,33
8,c5d88abe-f23a-45fa-a534-788985e93dad,Tabula Sapiens - Immune,12,32
9,a68b64d8-aee3-4947-81b7-36b8fe5a44d2,Tabula Sapiens - Stromal,12,32


## Download pattern of datasets

In [199]:
def create_trace_cum_daily(dataset_id, source_df=None):
    """Return the cumulative daily download series of one dataset.

    A download is counted once per distinct (download_dt, remoteip) pair.

    Args:
        dataset_id: value matched against the 'dataset_id' column.
        source_df: frame with 'dataset_id', 'download_dt' and 'remoteip'
            columns. Defaults to the notebook-level `downloads_df`, which
            was previously the only (implicit) source.

    Returns:
        (dates, totals): two lists of equal length - the days with at least
        one download in ascending order, and the running total up to and
        including each of those days. Both are empty when the dataset has
        no rows.
    """
    if source_df is None:
        source_df = downloads_df
    per_day = (
        source_df.loc[source_df['dataset_id'] == dataset_id, ['download_dt', 'remoteip']]
        .drop_duplicates()
        .assign(count=1)
        .groupby('download_dt')['count']  # groupby sorts the days ascending
        .sum()
    )
    return (per_day.index.tolist(), per_day.cumsum().tolist())

def add_trace(fig, x_list, y_list, label):
    """Add one lines+markers series to `fig`, with `label` shown on hover.

    Args:
        fig: figure object whose `add_trace` method receives the new series.
        x_list: x values of the series.
        y_list: y values of the series.
        label: text repeated for every point and shown in the hover box.
    """
    fig.add_trace(go.Scatter(
        x=x_list,
        y=y_list,
        mode='lines+markers',
        # The tags are now balanced: the original template closed </i> twice
        # without opening it.
        hovertemplate=(
            '<i>Date</i>: %{x}'
            '<br><i>Downloads</i>: %{y}<br>'
            '<br>%{text}'
        ),
        text=[label] * len(x_list),
    ))

    
def dataset_download_trends(d_ids, d_names, title):
    """Plot one cumulative-download line per dataset and display the figure.

    Args:
        d_ids: dataset ids, each passed to `create_trace_cum_daily`.
        d_names: hover labels, indexed in parallel with `d_ids`.
        title: title of the figure.
    """
    # Compute every series first; the figure is only created afterwards.
    series = [create_trace_cum_daily(dataset_id) for dataset_id in d_ids]

    fig = go.Figure()
    for position, (dates, totals) in enumerate(series):
        add_trace(fig, dates, totals, d_names[position])

    fig.update_layout(
        title=title,
        xaxis_title="date",
        yaxis_title="number of downloads",
        showlegend=False,
    )

    fig.show()

### Raw downloads
* Actively downloaded: downloaded 25 times or more
* Moderately downloaded: downloaded 10 to 24 times
* Lightly downloaded: downloaded fewer than 10 times

#### Evaluate download groups

In [144]:
# Distribution of raw download counts, used to pick the group cut-offs.
x = top_ds_raw['raw downloads'].tolist()
fig = go.Figure()
fig.add_trace(go.Histogram(x=x))
fig.update_layout(
    xaxis_title="number of downloads",
    yaxis_title="count",
    showlegend=False,
)
fig.show()

##### Actively downloaded

In [200]:
# `top_ds_raw` is sorted by raw downloads, descending: the first ten rows are the top ten.
top10_raw = top_ds_raw.head(10)
active_raw_ids = top10_raw['dataset_id'].to_list()
active_raw_names = top10_raw['dataset_name'].to_list()

dataset_download_trends(
    active_raw_ids,
    active_raw_names,
    'Trends of top 10 actively downloaded datasets (cumulative) - raw downloads',
)


##### Moderately downloaded

In [201]:
# Moderately downloaded = 10 to 24 raw downloads. The lower bound is explicit so
# that lightly downloaded datasets cannot fill the list when fewer than ten
# datasets fall in this band (the original filter was only `< 25`).
moderate_raw_mask = top_ds_raw['raw downloads'].between(10, 24)
top10_raw = top_ds_raw[moderate_raw_mask][0:10]
active_raw_ids = top10_raw['dataset_id'].tolist()
active_raw_names = top10_raw['dataset_name'].tolist()

dataset_download_trends(active_raw_ids, active_raw_names, 'Trends of top 10 moderately downloaded datasets (cumulative) - raw downloads')


##### Lightly downloaded

In [202]:
# First ten datasets (in descending order of raw downloads) with fewer than 10 downloads.
top10_raw = top_ds_raw.loc[top_ds_raw['raw downloads'] < 10].head(10)
active_raw_ids = top10_raw['dataset_id'].to_list()
active_raw_names = top10_raw['dataset_name'].to_list()

dataset_download_trends(
    active_raw_ids,
    active_raw_names,
    'Trends of top 10 lightly downloaded datasets (cumulative) - raw downloads',
)


### Normalized downloads
* Actively downloaded: normalized value (raw downloads per day on the platform × 100) greater than 10
* Moderately downloaded: normalized value of 6 to 10
* Lightly downloaded: normalized value of 5 or less

In [192]:
#len(top_ds_norm[(top_ds_norm['value'] > 7) & (top_ds_norm['value'] < 12)]['dataset_id'])
#len(top_ds_norm[(top_ds_norm['value'] < 5)]['dataset_id'])

#### Evaluate download groups

In [182]:
# Distribution of the normalized value, used to pick the group cut-offs.
x = top_ds_norm['value'].tolist()
fig = go.Figure()
fig.add_trace(go.Histogram(x=x))
fig.update_layout(
    xaxis_title="normalized value",
    yaxis_title="count",
    showlegend=False,
)
fig.show()

##### Actively downloaded

In [203]:
# `top_ds_norm` is sorted by normalized value, descending: the first ten rows are the top ten.
top10_norm = top_ds_norm.head(10)
active_norm_ids = top10_norm['dataset_id'].to_list()
active_norm_names = top10_norm['dataset_name'].to_list()

dataset_download_trends(
    active_norm_ids,
    active_norm_names,
    'Trends of top 10 actively downloaded datasets (cumulative) - normalized',
)


##### Moderately downloaded

In [211]:
# Moderately downloaded = normalized value of 6 to 10. The lower bound is
# explicit so that lightly downloaded datasets cannot fill the list when fewer
# than ten datasets fall in this band (the original filter was only `< 11`).
moderate_norm_mask = top_ds_norm['value'].between(6, 10)
top10_norm = top_ds_norm[moderate_norm_mask][0:10]
active_norm_ids = top10_norm['dataset_id'].tolist()
active_norm_names = top10_norm['dataset_name'].tolist()

dataset_download_trends(active_norm_ids, active_norm_names, 'Trends of top 10 moderately downloaded datasets (cumulative) - normalized')


##### Lightly downloaded

In [213]:
# Lightly downloaded = normalized value of 5 or less.
top10_norm = top_ds_norm[top_ds_norm['value'] < 6][0:10]
active_norm_ids = top10_norm['dataset_id'].tolist()
active_norm_names = top10_norm['dataset_name'].tolist()

# The title used to read "moderately" (copy-paste from the previous cell).
dataset_download_trends(active_norm_ids, active_norm_names, 'Trends of top 10 lightly downloaded datasets (cumulative) - normalized')


## Most active users

#### Number of unique users

In [60]:
# Distinct remote IPs; dropna=False counts a missing IP as one value, as drop_duplicates does.
combined['remoteip'].nunique(dropna=False)

332

In [169]:
# Distinct (dataset, day) downloads per remote IP, most active IP first.
users = (
    combined[['dataset_id', 'dataset_name', 'download_dt', 'remoteip']]
    .drop_duplicates()
    .assign(count=1)
    .groupby(['remoteip'])[['count']]
    .sum()
    .reset_index()
    .sort_values(by=['count'], ascending=False)
    .reset_index(drop=True)
)
# Share of all downloads attributed to each IP, as a fraction rounded to 2 decimals.
users['total'] = users['count'].sum()
users['perc'] = (users['count'] / users['total']).round(2)
users[['remoteip', 'count', 'perc']].head(10)

Unnamed: 0,remoteip,count,perc
0,23.119.165.103,516,0.29
1,129.79.37.84,114,0.07
2,149.165.234.129,73,0.04
3,98.210.3.15,65,0.04
4,149.165.234.130,63,0.04
5,149.159.219.125,52,0.03
6,149.165.234.206,43,0.02
7,129.79.247.78,43,0.02
8,50.18.239.242,27,0.02
9,47.189.207.57,24,0.01


## Downloads by dataset characteristic (WIP)

In [34]:
# Download rows joined with the characteristics of their dataset.
# `assign` returns a new frame, so the counter column is not written onto a
# slice of `downloads_df` (which triggered a SettingWithCopyWarning).
df_char = (
    downloads_df[['dataset_id', 'download_dt', 'filetype', 'remoteip', 'download_agent']]
    .assign(count=1)
    .merge(datasets_df, how = 'inner', on = 'dataset_id')
)

In [35]:
# Number of datasets per ethnicity label (a dataset counts once for each label it lists).
ds_ethnicity = (
    datasets_df[['ethnicity']]
    .explode('ethnicity')
    .reset_index(drop=True)
    .assign(total=1)
    .groupby(['ethnicity'])
    .sum()
    .reset_index()
)

In [36]:
# Download rows per ethnicity label, joined with the number of datasets per label.
df_ethnicity = (
    df_char[['ethnicity', 'count']]
    .explode('ethnicity')
    .reset_index(drop=True)
    .groupby(['ethnicity'])
    .sum()
    .reset_index()
    .merge(ds_ethnicity, how='inner', on='ethnicity')
)
# normalized = download rows per dataset, rounded to the nearest whole number.
df_ethnicity['normalized'] = (df_ethnicity['count'] / df_ethnicity['total']).round()
df_ethnicity

Unnamed: 0,ethnicity,count,total,normalized
0,African American,178,8,22.0
1,Asian,98,8,12.0
2,Chinese,29,1,29.0
3,East Asian,93,2,46.0
4,European,452,20,23.0
5,Finnish,23,1,23.0
6,Hispanic or Latin American,225,6,38.0
7,male,83,1,83.0
8,na,457,68,7.0
9,unknown,1191,77,15.0


In [37]:
# Bar chart of the normalized value (download rows per dataset) for each ethnicity label.
fig = go.Figure([go.Bar(x=df_ethnicity['ethnicity'].tolist(), y=df_ethnicity['normalized'].tolist(), 
                        text=df_ethnicity['normalized'].tolist(), textposition='outside')])

# Title and axis labels, as on the other figures in this notebook, so the chart
# can be read on its own.
fig.update_layout(
    title="Normalized downloads by ethnicity",
    xaxis_title="ethnicity",
    yaxis_title="downloads per dataset"
)

fig.show()