In [1]:
import altair as alt
import pandas as pd
from altair_saver import save
import altair_catplot as altcat  # this is from https://github.com/justinbois/altair-catplot and not in conda

from gisaid_utils import *

In [2]:
gisaid_metadata_filename = 'metadata_2022-08-20_00-26.tsv.gz'
extract_africa_metadata(gisaid_metadata_filename)
africa_metadata = get_africa_metadata()

In [3]:
countries_df = get_countries()

In [4]:
income_groups = get_income_groups()

In [5]:
def date_greater_filter(date, cutoff):
    date_parts = date.split('-')
    date_day = int(date_parts[2]) if len(date_parts) == 3 else 0
    date_month = int(date_parts[1]) if len(date_parts) >= 2 else 0
    date_year = int(date_parts[0])
    return [date_year, date_month, date_day] >= cutoff

def date_lesser_filter(date, cutoff):
    return not date_greater_filter(date, cutoff)

days_to_submit_df_2020 = africa_metadata[africa_metadata.apply(
    lambda r:(date_lesser_filter(r['date_submitted'], [2021,1,1]) & date_greater_filter(r['date_submitted'], [2019,12,31])), axis=1)
                                   ].groupby('country').mean().reset_index().sort_values(by='days_to_submit')
days_to_submit_df_2020 = insert_income_groups(days_to_submit_df_2020, countries_df, income_groups, 'country')

days_to_submit_df_2021 = africa_metadata[africa_metadata.apply(
    lambda r:(date_lesser_filter(r['date_submitted'], [2022,1,1]) & date_greater_filter(r['date_submitted'], [2020,12,31])), axis=1)
                                   ].groupby('country').mean().reset_index().sort_values(by='days_to_submit')
days_to_submit_df_2021 = insert_income_groups(days_to_submit_df_2021, countries_df, income_groups, 'country')

days_to_submit_df_2022 = africa_metadata[africa_metadata.apply(
    lambda r:(date_lesser_filter(r['date_submitted'], [2023,1,1]) & date_greater_filter(r['date_submitted'], [2021,12,31])), axis=1)
                                   ].groupby('country').mean().reset_index().sort_values(by='days_to_submit')
days_to_submit_df_2022 = insert_income_groups(days_to_submit_df_2022, countries_df, income_groups, 'country')



In [6]:
def make_chart(df, year):
    chart = alt.Chart(df).mark_bar().encode(x=alt.X("country", title="Country"),
                                            y=alt.Y("days_to_submit", title=f"Days to submit sample to GISAID - {year}"),
                                            color=alt.Color('Income group', sort=['Low income', 'Lower middle income', 'Upper middle income', 'High income']))
    save(chart, f"images/days_to_submit_to_gisaid_{year}.png")
    save(chart, f"images/days_to_submit_to_gisaid_{year}.svg")
    return chart

In [7]:
make_chart(days_to_submit_df_2020, 2020)

In [8]:
make_chart(days_to_submit_df_2021, 2021)

In [9]:
make_chart(days_to_submit_df_2022, 2022)

In [10]:
def print_dts_stats(df, year, cutoff):
    print(f"median days to submit in {year}:", round(df.days_to_submit.median()), "standard deviation:", df.days_to_submit.median())
    print(f"number of countries submitting in under {cutoff} days:", len(df[df.days_to_submit < cutoff].sort_values(by='country')))
    print("number of countries submitting:", len(df))    

In [11]:
print_dts_stats(days_to_submit_df_2020, '2020', 100)

median days to submit in 2020: 116 standard deviation: 116.05125391849529
number of countries submitting in under 100 days: 12
number of countries submitting: 28


In [12]:
print_dts_stats(days_to_submit_df_2021, '2021', 100)

median days to submit in 2021: 105 standard deviation: 104.88906497622821
number of countries submitting in under 100 days: 23
number of countries submitting: 49


In [13]:
print_dts_stats(days_to_submit_df_2022, '2022', 100)

median days to submit in 2022: 128 standard deviation: 127.93392070484582
number of countries submitting in under 100 days: 18
number of countries submitting: 51


In [14]:
median_dts = africa_metadata.groupby('date_submitted').mean()
year_series = median_dts.reset_index().date_submitted.apply(lambda d: d.split('-')[0])
year_series.index = median_dts.index
median_dts.insert(len(median_dts.columns), 'year', year_series)
median_dts.insert(len(median_dts.columns), 'num_sequences', africa_metadata.groupby('date_submitted').count().virus)
median_dts.insert(len(median_dts.columns), 'num_countries', africa_metadata.groupby('date_submitted')['country'].nunique())
median_dts

Unnamed: 0_level_0,length,purpose_of_sequencing,days_to_submit,year,num_sequences,num_countries
date_submitted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-06,29759.000000,,8.000000,2020,1,1
2020-03-15,29621.000000,,6.000000,2020,1,1
2020-03-24,29903.000000,,17.000000,2020,1,1
2020-03-25,29653.200000,,9.000000,2020,10,1
2020-03-27,29770.500000,,8.125000,2020,8,1
...,...,...,...,...,...,...
2022-08-13,29414.549020,,111.656863,2022,102,2
2022-08-15,29639.341880,,62.051282,2022,117,2
2022-08-16,29650.915978,,228.256198,2022,726,1
2022-08-17,29532.359756,,83.993902,2022,328,6


In [15]:
alt.Chart(median_dts.reset_index().sort_values('date_submitted'), width=800).mark_point().encode(
    x=alt.X('date_submitted:T', title='Date Submitted', axis=alt.Axis(labelAngle=-90)),
    y=alt.Y('days_to_submit', title='Mean Days to submit'),
    size=alt.Size('num_sequences', title='Number of Sequences'),
    color=alt.Color('num_countries', scale=alt.Scale(scheme='blues')))

In [16]:
africa_metadata.columns

Index(['virus', 'gisaid_epi_isl', 'genbank_accession', 'date', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pango_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing', 'date_yearmon',
       'date_submitted_yearmon', 'days_to_submit'],
      dtype='object')

In [33]:
horizontal_boxplot = altcat.catplot(median_dts, height=300, width=600,
               encoding=dict(y=alt.Y('year:N', title=None),
                             x=alt.X('days_to_submit:Q', title='Days to Submit'),
                             color=alt.Color('year:N', legend=None)),
               transform='box')
save(horizontal_boxplot, 'images/horizontal_boxplot.png')
save(horizontal_boxplot, 'images/horizontal_boxplot.svg')
horizontal_boxplot

In [22]:
vert_boxplot = altcat.catplot(median_dts, height=600, width=300,
               encoding=dict(x=alt.X('year:N', title=None),
                             y=alt.Y('days_to_submit:Q', title='Days to Submit'),
                             color=alt.Color('year:N', legend=None)),
               transform='box')
save(vert_boxplot, 'images/vertical_boxplot.png')
save(vert_boxplot, 'images/vertical_boxplot.svg')
vert_boxplot

In [25]:
africa_metadata.insert(len(africa_metadata.columns), 'submitted_year', africa_metadata.date_submitted.apply(lambda d: d.split('-')[0]))

In [34]:
# use this with caution - it grows the size of the notebook considerably
# alt.data_transformers.disable_max_rows()
# horizontal_boxplot = altcat.catplot(africa_metadata, height=300, width=600,
#                encoding=dict(y=alt.Y('submitted_year:N', title=None),
#                              x=alt.X('days_to_submit:Q', title='Days to Submit'),
#                              color=alt.Color('submitted_year:N', legend=None)),
#                transform='box')
# save(horizontal_boxplot, 'images/am_horizontal_boxplot.png')
# save(horizontal_boxplot, 'images/am_horizontal_boxplot.svg')
# horizontal_boxplot