This notebook uses the [Altair](https://altair-viz.github.io/), [Pandas](https://pandas.pydata.org/) and [epiweeks](https://epiweeks.readthedocs.io/) packages, all of which can be installed using [conda](https://docs.conda.io/en/latest/).

The data in this notebook is a combination of open data (from Our World In Data and elsewhere) and metadata about genomic data submitted to [GISAID](https://www.gisaid.org/). The open data is loaded automatically but to get the GISAID data you need to log in, go to the EpiCov tab, select Downloads and download the `metadata` file in the 'Genomic epidemiology' section. You then need to edit the 4th code cell in this notebook and put the path of the metadata file there.

In [169]:
from datetime import datetime

import altair as alt
import pandas as pd
import epiweeks

In [170]:
# Country codes, names, continents.
# keep_default_na=False stops pandas from turning the literal string "NA" into
# NaN (it is a legitimate code in this table, e.g. Namibia's two-letter code
# and the North America continent code); only truly empty cells are treated
# as missing.
countries = pd.read_csv(
    "https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv",
    keep_default_na=False,
    na_values=[""],
)
countries

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0
...,...,...,...,...,...,...
257,Africa,AF,"Zambia, Republic of",ZM,ZMB,894.0
258,Oceania,OC,Disputed Territory,XX,,
259,Asia,AS,Iraq-Saudi Arabia Neutral Zone,XE,,
260,Asia,AS,United Nations Neutral Zone,XD,,


In [171]:
# Our World In Data COVID-19 info (one row per location per day).
owid_data = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")
# Derive a "YYYY-MM" key by dropping the trailing day component of the ISO
# date string. The vectorised .str accessor replaces a row-wise
# apply(axis=1), which materialised a full row object for every record just
# to split a single string.
owid_data["year_mon"] = owid_data["date"].str.rsplit("-", n=1).str[0]
owid_data

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,year_mon
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,597.029,9.59,,,37.746,0.5,64.83,0.511,,2020-02
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,597.029,9.59,,,37.746,0.5,64.83,0.511,,2020-02
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,597.029,9.59,,,37.746,0.5,64.83,0.511,,2020-02
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,597.029,9.59,,,37.746,0.5,64.83,0.511,,2020-02
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,597.029,9.59,,,37.746,0.5,64.83,0.511,,2020-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107892,ZWE,Africa,Zimbabwe,2021-08-03,112435.0,1580.0,1532.000,3676.0,41.0,56.571,...,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,,2021-08
107893,ZWE,Africa,Zimbabwe,2021-08-04,113526.0,1091.0,1422.714,3711.0,35.0,53.000,...,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,,2021-08
107894,ZWE,Africa,Zimbabwe,2021-08-05,114489.0,963.0,1261.857,3754.0,43.0,47.571,...,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,,2021-08
107895,ZWE,Africa,Zimbabwe,2021-08-06,115445.0,956.0,1136.429,3805.0,51.0,45.000,...,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571,,2021-08


In [172]:
# GISAID "Genomic epidemiology" metadata dump, indexed by strain name (the
# first column). Edit the path below to point at your own download.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns with mixed content get one consistent
# dtype. NOTE(review): the stray output under this cell looks like the tail of
# a pandas mixed-dtype warning - confirm it disappears with this option.
gisaid_metadata = pd.read_csv(
    "metadata_2021-08-06_11-33.tsv.gz",
    compression="gzip",
    delimiter="\t",
    index_col=0,
    low_memory=False,
)
gisaid_metadata

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0_level_0,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,region_exposure,country_exposure,...,pango_lineage,GISAID_clade,originating_lab,submitting_lab,authors,url,title,paper_url,date_submitted,purpose_of_sequencing
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan/IMB07956/2020,ncov,EPI_ISL_1034760,?,2020-06-06,Asia,Afghanistan,Afghanistan,,Asia,Afghanistan,...,B.1.1,G,Bundeswehr Institute of Microbiology,Bundeswehr Institute of Microbiology,Markus Antwerpen et al,https://www.epicov.org/acknowledgement/47/60/E...,?,?,2021-02-19,
Afghanistan/IMB07958/2020,ncov,EPI_ISL_1000998,?,2020-06-13,Asia,Afghanistan,Afghanistan,,Asia,Afghanistan,...,B.1.36,GH,Bundeswehr Institute of Microbiology,Bundeswehr Institute of Microbiology,Markus Antwerpen et al,https://www.epicov.org/acknowledgement/09/98/E...,?,?,2021-02-15,
Afghanistan/IMB07960/2020,ncov,EPI_ISL_1000999,?,2020-06-07,Asia,Afghanistan,Afghanistan,,Asia,Afghanistan,...,B.1.9,GH,Bundeswehr Institute of Microbiology,Bundeswehr Institute of Microbiology,Markus Antwerpen et al,https://www.epicov.org/acknowledgement/09/99/E...,?,?,2021-02-15,
Afghanistan/IMB07962/2020,ncov,EPI_ISL_1001000,?,2020-06-02,Asia,Afghanistan,Afghanistan,,Asia,Afghanistan,...,B.1,GH,Bundeswehr Institute of Microbiology,Bundeswehr Institute of Microbiology,Markus Antwerpen et al,https://www.epicov.org/acknowledgement/10/00/E...,?,?,2021-02-15,
Afghanistan/IMB07964/2020,ncov,EPI_ISL_1001001,?,2020-05-30,Asia,Afghanistan,Afghanistan,,Asia,Afghanistan,...,B.1,G,Bundeswehr Institute of Microbiology,Bundeswehr Institute of Microbiology,Markus Antwerpen et al,https://www.epicov.org/acknowledgement/10/01/E...,?,?,2021-02-15,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tiger/USA/TN-20-031353-002/2020,ncov,EPI_ISL_2928445,?,2020-10-19,North America,USA,Tennessee,,North America,USA,...,B.1.2,GH,Cummings School of Veterinary Medicine at Tuft...,"Diagnostic Virology Laboratory, National Veter...",Kerrie M. Franzen et al,https://www.epicov.org/acknowledgement/84/45/E...,?,?,2021-07-13,
tiger/USA/TN-20-031498-003/2020,ncov,EPI_ISL_2928446,?,2020-10-27,North America,USA,Tennessee,,North America,USA,...,B.1.2,GH,Kord Animal Health Diagnostic Laboratory,"Diagnostic Virology Laboratory, National Veter...",Kerrie M. Franzen et al,https://www.epicov.org/acknowledgement/84/46/E...,?,?,2021-07-13,
tiger/USA/TX-21-002026-001/2021,ncov,EPI_ISL_2928449,?,2021-01-16,North America,USA,Texas,,North America,USA,...,B.1.234,G,Texas Veterinary Medical Diagnostic Laboratory,"Diagnostic Virology Laboratory, National Veter...",Kerrie M. Franzen et al,https://www.epicov.org/acknowledgement/84/49/E...,?,?,2021-07-13,
tiger/USA/VA-21-010728-001/2021,ncov,EPI_ISL_2928453,?,2021-04-09,North America,USA,Virginia,,North America,USA,...,B.1.1.7,GRY,Cornell Diagnostic Laboratory,"Diagnostic Virology Laboratory, National Veter...",Kerrie M. Franzen et al,https://www.epicov.org/acknowledgement/84/53/E...,?,?,2021-07-13,


In [173]:
# Restrict to samples from Asia. .copy() yields an independent frame so the
# column assignments further down never write into a slice of gisaid_metadata.
asia_metadata = gisaid_metadata[gisaid_metadata.region == "Asia"].copy()

# Keep only rows that are usable for time-based analysis:
#  * the collection date has all three parts (YYYY-MM-DD, i.e. two "-");
#    missing dates compare False here instead of raising
#  * the date contains no "XX" placeholder - such dates are not reliable
#  * a Nextstrain clade is assigned - rows without one are typically poor quality
has_full_date = asia_metadata.date.str.count("-").eq(2)
has_placeholder = asia_metadata.date.str.contains("XX", regex=False, na=False)
has_clade = asia_metadata.Nextstrain_clade.notna()
asia_metadata = asia_metadata[has_full_date & ~has_placeholder & has_clade].copy()

# add date year / month fields ("YYYY-MM": everything before the last "-")
asia_metadata["date_yearmon"] = asia_metadata.date.str.rsplit("-", n=1).str[0]
asia_metadata["date_submitted_yearmon"] = asia_metadata.date_submitted.str.rsplit(
    "-", n=1
).str[0]

# calculate the number of whole days between sample collection and submission
collected_on = pd.to_datetime(asia_metadata.date, format="%Y-%m-%d")
submitted_on = pd.to_datetime(asia_metadata.date_submitted, format="%Y-%m-%d")
asia_metadata["days_to_submit"] = (submitted_on - collected_on).dt.days

In [174]:
# make a dict mapping GISAID country name to ISO 2 letter code
# A GISAID name is mapped only when it is a substring of exactly one entry in
# the reference table; names with no match or several matches are left out of
# the dict.
name_to_two_letter_code = {}
for country_name in asia_metadata.country.dropna().unique():
    # regex=False: match the name literally, so characters such as "(" or "."
    # in a country name are not interpreted as a regular expression.
    # na=False: reference rows without a name never match.
    country_info = countries[
        countries.Country_Name.str.contains(country_name, regex=False, na=False)
    ]
    if len(country_info) == 1:
        name_to_two_letter_code[country_name] = country_info.iloc[
            0
        ].Two_Letter_Country_Code

In [175]:
# Stacked bar chart: number of Asian samples per country, with one colour
# segment per submitting lab. Countries are ordered by descending bar height.
alt.Chart(
    data=asia_metadata.groupby(by=["country", "submitting_lab"])
    .count()
    .reset_index()
    .sort_values(by="virus", ascending=False)
).mark_bar().encode(
    alt.X("country", sort="-y", title="Country"),
    alt.Y("sum(virus)", title="Number of Samples"),
    alt.Color("submitting_lab", sort="-y"),
    tooltip=["submitting_lab", "sum(virus)"],
)

In [176]:
# Bar chart: total number of Asian samples per country.
# After groupby("country").count() every remaining column holds a row count,
# so "submitting_lab" is no longer a lab name; the colour and tooltip
# encodings that referred to it were showing counts under a misleading label.
# The tooltip now reports the country and its sample count instead.
alt.Chart(
    asia_metadata.groupby(["country"])
    .count()
    .reset_index()
    .sort_values("virus", ascending=False)
).mark_bar().encode(
    x=alt.X("country", title="Country", sort="-y"),
    y=alt.Y("sum(virus)", title="Number of Samples"),
    tooltip=["country", "sum(virus)"],
)

In [177]:
# Indonesian subset of the cleaned Asian metadata. .copy() makes it an
# independent frame, so columns added to it later in the notebook never write
# into a slice of asia_metadata.
indonesia_metadata = asia_metadata[asia_metadata.country == 'Indonesia'].copy()

In [178]:
# How many labs to show, ranked by number of submitted samples.
top_num = 20
# Bar chart of the top_num Indonesian labs by sample count.
alt.Chart(
    data=indonesia_metadata.groupby(by="submitting_lab")
    .count()
    .reset_index()
    .sort_values(by="virus", ascending=False)
    .iloc[:top_num],
    title='Top {} labs in Indonesia submitting to GISAID'.format(top_num),
).mark_bar().encode(
    alt.X("submitting_lab", sort="-y", title="Laboratory"),
    alt.Y("sum(virus)", title="Number of Samples"),
    # labelLimit widens the legend so long lab names are not truncated.
    alt.Color("submitting_lab", sort="-y", legend=alt.Legend(labelLimit=400)),
    tooltip=["submitting_lab", "sum(virus)"],
)

In [179]:
# Stacked bar chart: Indonesian samples per collection month, one colour
# segment per submitting lab.
# After .count() the "country" column holds a row count rather than a country
# name, so the previous tooltip="country" showed a number under a misleading
# label; the tooltip now names the lab and gives its sample count.
alt.Chart(
    indonesia_metadata.groupby(["date_yearmon", "submitting_lab"]).count().reset_index(),
    width=600,
).mark_bar().encode(
    x=alt.X("date_yearmon:O", title="Sample Date"),
    y=alt.Y("virus", title="Number of Samples"),
    color=alt.Color("submitting_lab", sort="-y", legend=alt.Legend(labelLimit=400)),
    tooltip=["submitting_lab", "virus"],
)

In [180]:
# Stacked bar chart: Indonesian samples per submission month, one colour
# segment per submitting lab.
# Fixes: the y channel is now built with alt.Y (it was alt.X), and the tooltip
# shows the lab and its sample count instead of the aggregated "country"
# column, which holds a row count after .count().
alt.Chart(
    indonesia_metadata.groupby(["date_submitted_yearmon", "submitting_lab"])
    .count()
    .reset_index(),
    width=600,
).mark_bar().encode(
    x=alt.X("date_submitted_yearmon:O", title="Submission Date"),
    y=alt.Y("virus", title="Number of Samples"),
    color=alt.Color("submitting_lab", sort="-y", legend=alt.Legend(labelLimit=400)),
    tooltip=["submitting_lab", "virus"],
)

In [181]:
# Bar chart of the mean collection-to-submission delay (in days) per
# Indonesian submitting lab.
alt.Chart(
    data=indonesia_metadata.groupby("submitting_lab")["days_to_submit"]
    .mean()
    .reset_index()
).mark_bar().encode(
    alt.X("submitting_lab", title="Submitting Lab"),
    alt.Y("days_to_submit", title="Average Days to submit sample to GISAID"),
)

In [182]:
# Inspect the available columns, including the derived date_yearmon,
# date_submitted_yearmon and days_to_submit fields added during cleaning.
indonesia_metadata.columns

Index(['virus', 'gisaid_epi_isl', 'genbank_accession', 'date', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pango_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing', 'date_yearmon',
       'date_submitted_yearmon', 'days_to_submit'],
      dtype='object')

In [183]:
# Names of the five Indonesian labs with the most submitted samples.
top_5_labs = (
    indonesia_metadata.groupby(by="submitting_lab")
    .count()
    .sort_values(by="virus", ascending=False)
    .iloc[:5]
    .reset_index()["submitting_lab"]
    .tolist()
)
# Violin-style plot: one mirrored density of days_to_submit per lab, laid out
# side by side as facet columns.
alt.Chart(
    data=indonesia_metadata[indonesia_metadata.submitting_lab.isin(top_5_labs)],
    title='Time to submit to GISAID for top 5 labs in Indonesia',
).transform_density(
    density="days_to_submit",
    groupby=["submitting_lab"],
    # Density is only evaluated between 0 and 400 days.
    extent=[0, 400],
    as_=["days_to_submit", "density"],
).mark_area(
    orient="horizontal"
).encode(
    alt.X(
        "density:Q",
        # stack="center" mirrors the area around the axis to give the violin shape.
        stack="center",
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
    ),
    alt.Y("days_to_submit:Q", title="Days to submit to GISAID"),
    alt.Column(
        "submitting_lab:N",
        header=alt.Header(
            title="Submitting Lab",
            titleOrient="bottom",
            labelOrient="bottom",
            labelAngle=-90,
            labelPadding=200,
        ),
    ),
    color="submitting_lab:N",
).properties(
    width=100
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

In [184]:
# Indonesian sample counts per collection month and Pango lineage.
gisaid_data = (
    indonesia_metadata.groupby(by=["date_yearmon", "pango_lineage"])
    .count()
    .reset_index()
)
# Mean of the daily new_cases figure per month for Indonesia (OWID).
idn_owid_data = (
    owid_data.loc[owid_data.iso_code.eq("IDN")]
    .groupby("year_mon")["new_cases"]
    .mean()
    .reset_index()
)
# Stacked bars: sequenced samples per month, coloured by lineage.
gisaid = (
    alt.Chart(data=gisaid_data, title="SARS-CoV-2 surveillance: Indonesia", width=600)
    .mark_bar()
    .encode(
        alt.X("date_yearmon", title="Year / Month"),
        alt.Y("sum(virus)", title="Samples by Date of Sample"),
        alt.Color("pango_lineage", sort="-y"),
        tooltip=["pango_lineage", "virus"],
    )
)
# Red line: average new cases per day in each month.
owid = (
    alt.Chart(data=idn_owid_data)
    .mark_line(interpolate="monotone", stroke="red")
    .encode(
        x=alt.X("year_mon", title="Year / Month"),
        y=alt.Y("new_cases", title="New Cases / Day"),
    )
)
# Overlay both layers; each keeps its own y scale because the units differ.
(gisaid + owid).resolve_scale(y="independent")

In [185]:
# Indonesian sample counts per submission month and Pango lineage.
gisaid_data = (
    indonesia_metadata.groupby(by=["date_submitted_yearmon", "pango_lineage"])
    .count()
    .reset_index()
)
# Mean of the daily new_cases figure per month for Indonesia (OWID).
idn_owid_data = (
    owid_data.loc[owid_data.iso_code.eq("IDN")]
    .groupby("year_mon")["new_cases"]
    .mean()
    .reset_index()
)
# Stacked bars: samples per submission month, coloured by lineage.
gisaid = (
    alt.Chart(data=gisaid_data, title="SARS-CoV-2 surveillance: Indonesia", width=600)
    .mark_bar()
    .encode(
        alt.X("date_submitted_yearmon", title="Year / Month"),
        alt.Y("sum(virus)", title="Samples by Date of Submission"),
        alt.Color("pango_lineage", sort="-y"),
        tooltip=["pango_lineage", "virus"],
    )
)
# Red line: average new cases per day in each month.
owid = (
    alt.Chart(data=idn_owid_data)
    .mark_line(interpolate="monotone", stroke="red")
    .encode(
        x=alt.X("year_mon", title="Year / Month"),
        y=alt.Y("new_cases", title="New Cases / Day"),
    )
)
# Overlay both layers; each keeps its own y scale because the units differ.
(gisaid + owid).resolve_scale(y="independent")

In [186]:
# Inspect the columns of the raw GISAID metadata as loaded from the TSV file
# (none of the derived date fields are present on this frame).
gisaid_metadata.columns

Index(['virus', 'gisaid_epi_isl', 'genbank_accession', 'date', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pango_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing'],
      dtype='object')

In [187]:
# An example of filtering the metadata.
#
# Three conditions are combined with "&" (all must hold):
# 1. Geographic: "division" is "North Sumatra"
# 2. Lineage: pango_lineage is B.1.617.2 (aka VOC Delta)
# 3. Age: age < 18. This one is a bit more complicated because some entries in
#    the "age" column contain things like '?'. pandas' to_numeric with
#    errors='coerce' forces the column to numeric form; anything it cannot
#    convert becomes NaN, and NaN never satisfies the "< 18" test.

indonesia_metadata.loc[
    indonesia_metadata.division.eq('North Sumatra')
    & indonesia_metadata.pango_lineage.eq('B.1.617.2')
    & pd.to_numeric(indonesia_metadata.age, errors='coerce').lt(18)
]

Unnamed: 0_level_0,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,region_exposure,country_exposure,...,submitting_lab,authors,url,title,paper_url,date_submitted,purpose_of_sequencing,date_yearmon,date_submitted_yearmon,days_to_submit
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Indonesia/SU-GSILab-621778/2021,ncov,EPI_ISL_3208086,?,2021-07-04,Asia,Indonesia,North Sumatra,Medan,Asia,Indonesia,...,Genomik Solidaritas Indonesia Laboratorium,Reinhart Gabriel et al,https://www.epicov.org/acknowledgement/80/86/E...,?,?,2021-08-03,,2021-07,2021-08,30
Indonesia/SU-GSILab-621793/2021,ncov,EPI_ISL_3208058,?,2021-07-05,Asia,Indonesia,North Sumatra,Medan,Asia,Indonesia,...,Genomik Solidaritas Indonesia Laboratorium,Vania Gavrila Wikasa et al,https://www.epicov.org/acknowledgement/80/58/E...,?,?,2021-08-03,,2021-07,2021-08,29
Indonesia/SU-GSILab-621794/2021,ncov,EPI_ISL_3208063,?,2021-06-25,Asia,Indonesia,North Sumatra,Medan,Asia,Indonesia,...,Genomik Solidaritas Indonesia Laboratorium,Vania Gavrila Wikasa et al,https://www.epicov.org/acknowledgement/80/63/E...,?,?,2021-08-03,,2021-06,2021-08,39
Indonesia/SU-GSILab-621806/2021,ncov,EPI_ISL_3208059,?,2021-07-11,Asia,Indonesia,North Sumatra,Medan,Asia,Indonesia,...,Genomik Solidaritas Indonesia Laboratorium,Vania Gavrila Wikasa et al,https://www.epicov.org/acknowledgement/80/59/E...,?,?,2021-08-03,,2021-07,2021-08,23


In [188]:
# Collection date and lineage of every sample originating from Universitas
# Sumatra Utara, ordered by collection date.
indonesia_metadata.loc[
    indonesia_metadata.originating_lab.eq('Universitas Sumatra Utara'),
    ['date', 'pango_lineage'],
].sort_values(by='date')

Unnamed: 0_level_0,date,pango_lineage
strain,Unnamed: 1_level_1,Unnamed: 2_level_1
Indonesia/SU-GSILab-621807/2021,2021-05-14,B.1.466.2
Indonesia/SU-GSILab-621775/2021,2021-05-17,B.1.466.2
Indonesia/SU-GSILab-621777/2021,2021-05-19,B.1.466.2
Indonesia/SU-GSILab-621774/2021,2021-06-02,B.1.466.2
Indonesia/SU-GSILab-621796/2021,2021-06-02,B.1.466.2
Indonesia/SU-GSILab-621809/2021,2021-06-03,B.1.466.2
Indonesia/SU-GSILab-621791/2021,2021-06-04,B.1.466.2
Indonesia/SU-GSILab-621798/2021,2021-06-04,B.1.466.2
Indonesia/SU-GSILab-621780/2021,2021-06-04,B.1.466.2
Indonesia/SU-GSILab-621797/2021,2021-06-17,B.1.466.2


In [189]:
def _epiweek_for(s):
    """Return the epiweeks.Week containing the 'YYYY-MM-DD' date string *s*.

    Uses the default week system of epiweeks.Week.fromdate.
    Raises ValueError if *s* is not a valid 'YYYY-MM-DD' date.
    """
    return epiweeks.Week.fromdate(datetime.strptime(s, '%Y-%m-%d'))


def to_epiyear(s):
    """Return the epidemiological year of the 'YYYY-MM-DD' date string *s*."""
    return _epiweek_for(s).weektuple()[0]


def to_epiweek(s):
    """Return the epidemiological week number of the 'YYYY-MM-DD' date string *s*."""
    return _epiweek_for(s).weektuple()[1]


def to_epistartdate(s):
    """Return the first day of the epiweek containing *s*, formatted like 'Jan-03'.

    The label carries no year, so weeks from different years can share a label.
    """
    return _epiweek_for(s).startdate().strftime('%b-%d')

In [191]:
# (Re)compute the epidemiological year, week and week-start label from each
# sample's collection date and append them as the last three columns.
# errors='ignore' drops any copies left over from a previous run of this cell,
# so the cell can be re-run safely; it replaces the try/except KeyError loop.
indonesia_metadata = indonesia_metadata.drop(
    columns=['epiweek', 'epiyear', 'epistartdate'], errors='ignore'
).assign(
    epiyear=lambda df: df.date.apply(to_epiyear),
    epiweek=lambda df: df.date.apply(to_epiweek),
    epistartdate=lambda df: df.date.apply(to_epistartdate),
)


In [192]:
# Week-start labels (e.g. 'Jan-03') of all samples collected after epiyear 2020.
start_dates = indonesia_metadata[indonesia_metadata.epiyear > 2020].epistartdate.unique().tolist()


def sd_to_epiweek(s, year=2021):
    """Return the epiweek number for a week-start label such as 'Jan-03'.

    The label has no year, and strptime would silently assume 1900 - giving
    week numbers for the wrong calendar and failing on 'Feb-29'. The year is
    therefore explicit; it defaults to 2021, the first epiyear kept by the
    filter above.
    """
    epiweek = epiweeks.Week.fromdate(
        datetime.strptime('{}-{}'.format(year, s), '%Y-%b-%d')
    )
    return epiweek.weektuple()[1]


# Chronological order of the labels, taken from the (epiyear, epiweek) columns
# already computed on the frame rather than re-parsing the year-less labels.
# If a label occurs in more than one year, its earliest occurrence wins.
sorted_starts = (
    indonesia_metadata.loc[
        indonesia_metadata.epiyear > 2020, ['epiyear', 'epiweek', 'epistartdate']
    ]
    .drop_duplicates()
    .sort_values(['epiyear', 'epiweek'])
    .epistartdate.drop_duplicates()
    .tolist()
)

In [193]:
# Display the ordered week-start labels; this list is passed as the x-axis
# sort order in the epistartdate charts below.
sorted_starts

['Jan-03',
 'Jan-10',
 'Jan-17',
 'Jan-24',
 'Jan-31',
 'Feb-07',
 'Feb-14',
 'Feb-21',
 'Feb-28',
 'Mar-07',
 'Mar-14',
 'Mar-21',
 'Mar-28',
 'Apr-04',
 'Apr-11',
 'Apr-18',
 'Apr-25',
 'May-02',
 'May-09',
 'May-16',
 'May-23',
 'May-30',
 'Jun-06',
 'Jun-13',
 'Jun-20',
 'Jun-27',
 'Jul-04',
 'Jul-11',
 'Jul-18',
 'Jul-25']

In [194]:
# Weekly sample counts by lineage for samples originating from Universitas
# Sumatra Utara; bars are ordered on the x axis using sorted_starts.
alt.Chart(
    data=indonesia_metadata.loc[
        indonesia_metadata.originating_lab.eq('Universitas Sumatra Utara')
    ]
    .groupby(by=['epiweek', 'epistartdate', 'pango_lineage'])
    .count()
    .reset_index(),
    width=400,
).mark_bar(size=20).encode(
    alt.X('epistartdate', sort=sorted_starts, title="Start Day of Week when Sample was collected"),
    alt.Y('sum(virus)', title='Sample Count'),
    alt.Color('pango_lineage'),
    tooltip=['pango_lineage'],
)

In [195]:
# Weekly sample counts by lineage for all Indonesian samples collected after
# epiyear 2020; bars are ordered on the x axis using sorted_starts.
alt.Chart(
    data=indonesia_metadata.loc[indonesia_metadata.epiyear.gt(2020)]
    .groupby(by=['epiweek', 'epistartdate', 'pango_lineage'])
    .count()
    .reset_index(),
    width=800,
).mark_bar(size=20).encode(
    alt.X('epistartdate', sort=sorted_starts, title="Start Day of Week when Sample was collected"),
    alt.Y('sum(virus)', title='Sample Count'),
    alt.Color('pango_lineage'),
    tooltip=['pango_lineage'],
)

In [230]:
# Weekly sample counts by lineage for Indonesian samples collected after
# epiyear 2020, plotted against the numeric epiweek.
# The x axis previously carried sort=sorted_starts, a list of 'Jan-03'-style
# labels copied from the epistartdate charts; none of those values occur in
# the integer epiweek field, so the sort has been removed.
alt.Chart(
    indonesia_metadata[indonesia_metadata.epiyear > 2020]
    .groupby(['epiweek', 'epistartdate', 'pango_lineage']).count().reset_index(),width=800).mark_bar(size=20).encode(
    x=alt.X('epiweek', title="Epiweek when Sample was collected"),
    y=alt.Y('sum(virus)', title='Sample Count'),
    color=alt.Color('pango_lineage', sort='-y'),
    tooltip=['pango_lineage']
)

In [196]:
# Non-null value count of every column, per epiweek, for samples originating
# from Universitas Sumatra Utara.
indonesia_metadata.loc[
    indonesia_metadata.originating_lab.eq('Universitas Sumatra Utara')
].groupby('epiweek').count()

Unnamed: 0_level_0,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,region_exposure,country_exposure,...,url,title,paper_url,date_submitted,purpose_of_sequencing,date_yearmon,date_submitted_yearmon,days_to_submit,epiyear,epistartdate
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1
20,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,0,2,2,2,2,2
22,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,0,6,6,6,6,6
24,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1
25,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,0,4,4,4,4,4
27,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,0,3,3,3,3,3
28,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1


In [197]:
# Epiweek of each individual sample originating from Universitas Sumatra Utara.
indonesia_metadata.loc[
    indonesia_metadata.originating_lab.eq('Universitas Sumatra Utara'), 'epiweek'
]

strain
Indonesia/SU-GSILab-621774/2021    22
Indonesia/SU-GSILab-621775/2021    20
Indonesia/SU-GSILab-621777/2021    20
Indonesia/SU-GSILab-621778/2021    27
Indonesia/SU-GSILab-621780/2021    22
Indonesia/SU-GSILab-621791/2021    22
Indonesia/SU-GSILab-621793/2021    27
Indonesia/SU-GSILab-621794/2021    25
Indonesia/SU-GSILab-621796/2021    22
Indonesia/SU-GSILab-621797/2021    24
Indonesia/SU-GSILab-621798/2021    22
Indonesia/SU-GSILab-621799/2021    25
Indonesia/SU-GSILab-621803/2021    27
Indonesia/SU-GSILab-621805/2021    25
Indonesia/SU-GSILab-621806/2021    28
Indonesia/SU-GSILab-621807/2021    19
Indonesia/SU-GSILab-621808/2021    25
Indonesia/SU-GSILab-621809/2021    22
Name: epiweek, dtype: int64

In [198]:
# Export the Indonesian metadata, including the derived columns, to an Excel
# workbook in the current working directory.
indonesia_metadata.to_excel(excel_writer='indonesia_metadata.xlsx')

In [217]:
# Number of distinct originating labs with samples collected after epiweek 20
# of 2021. dropna=False counts a missing lab as one value, matching
# len(.unique()).
indonesia_metadata.loc[
    indonesia_metadata.epiweek.gt(20) & indonesia_metadata.epiyear.eq(2021),
    'originating_lab',
].nunique(dropna=False)

230

In [222]:
# Stacked bars: samples per epiweek (after week 20 of 2021), one colour
# segment per division.
alt.Chart(
    data=indonesia_metadata.loc[
        indonesia_metadata.epiweek.gt(20) & indonesia_metadata.epiyear.eq(2021)
    ]
    .groupby(by=['division', 'epiweek'])
    .count()
    .reset_index()
).mark_bar(size=10).encode(
    alt.X('epiweek'),
    alt.Y('sum(virus)'),
    alt.Color('division'),
    tooltip=['division'],
)

In [228]:
# Samples collected after epiweek 20 of 2021.
recent_data = indonesia_metadata[(indonesia_metadata.epiweek > 20) & (indonesia_metadata.epiyear == 2021)]
# One chart per division. Rows with a missing division are skipped: NaN never
# equals itself, so such an entry would only produce an empty chart with a
# non-string title.
divs = recent_data.division.dropna().unique().tolist()
charts = []
for division in divs:
    # Weekly sample counts for this division, stacked by Pango lineage.
    chart = alt.Chart(recent_data[recent_data.division == division].groupby(['epiweek','pango_lineage']).count().reset_index(), title=division).mark_bar(width=20).encode(
        x=alt.X('epiweek'),
        y=alt.Y('sum(virus)'),
        color=alt.Color('pango_lineage'),
        tooltip=['pango_lineage']
    )
    charts.append(chart)
# Stack the per-division charts vertically.
alt.vconcat(*charts)

In [199]:
# Acknowledge every lab whose submissions are used in this analysis
# (distinct submitting_lab values, in order of first appearance).
GISAID_contributors = list(indonesia_metadata["submitting_lab"].unique())
print(
    "The data in this analysis was submitted to GISAID by:",
    ', \n'.join(GISAID_contributors),
)

The data in this analysis was submitted to GISAID by: National Institute of Health Research and Development, 
Eijkman Institute for Molecular Biology, National Agency for Research and Innovation; Molecular Biology Laboratory, Faculty Medicine and Health Sciences, Warmadewa University, 
Eijkman Institute for Molecular Biology, National Research and Innovation Agency; Faculty Medicine and Health Sciences, Warmadewa University, 
Diagnostic and Research Center of Infectious Diseases, Medical Faculty, Andalas University, 
Biosafety Level-3 Laboratory, Indonesian Institute of Sciences (LIPI), 
Eijkman Institute for Molecular Biology, Ministry of Research and Technology/National Agency for Research and Innovation, 
Eijkman Institute for Molecular Biology, National Agency for Research and Innovation, 
Eijkman Institute for Molecular Biology, National Research and Innovation Agency, 
Medical Research Center, Faculty of Medicine, Syarif Hidayatullah State Islamic University Jakarta, 
Clinical Mi