In [1]:
from datetime import datetime
import gzip
import tarfile 

import altair as alt
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
# Country codes, names, continents.
# keep_default_na=False stops pandas from turning the literal string "NA" into a
# missing value: "NA" is both Namibia's ISO two-letter code and the continent
# code for North America. Only genuinely empty cells are read as NaN.
countries = pd.read_csv(
    "https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv",
    keep_default_na=False,
    na_values=[""],
)

In [3]:
# World Bank income classification: the first four columns and first 218 data
# rows of the 'List of economies' sheet, with every missing cell replaced by
# 'Unknown'.
# NOTE(review): 218 is presumably the number of economy rows in this release of
# the workbook - confirm when the file is refreshed.
country_income_classification = pd.read_excel(
    'https://databank.worldbank.org/data/download/site-content/CLASS.xlsx',
    sheet_name='List of economies',
    usecols=range(4),
    nrows=218,
).fillna('Unknown')

# Short code for each income group label (including the 'Unknown' filler).
class_codes = {
    'High income': 'HIC',
    'Low income': 'LIC',
    'Lower middle income': 'LMIC',
    'Upper middle income': 'UMIC',
    'Unknown': 'UNK',
}

# A label missing from class_codes raises KeyError rather than passing silently.
country_income_classification['income_code'] = [
    class_codes[income_group]
    for income_group in country_income_classification['Income group']
]


In [4]:
# Stream the compressed metadata dump once, copying the header row plus every
# record whose sixth tab-separated field (index 5, the `region` column of the
# loaded table) equals 'Africa' into metadata.tsv.
first_line_seen = False
total_genomes = 0  # data records read; the header row is not counted
# Text mode with explicit UTF-8 keeps non-ASCII names intact regardless of the
# platform's default encoding; newline='\n' splits and writes lines unchanged.
with gzip.open('metadata_2022-03-24_22-57.tsv.gz', 'rt', encoding='utf-8', newline='\n') as infile:
    with open('metadata.tsv', 'w', encoding='utf-8', newline='\n') as outfile:
        for line_str in infile:
            if not first_line_seen:
                # The first line is the column header: always keep it, never count it.
                outfile.write(line_str)
                first_line_seen = True
                continue
            total_genomes += 1
            fields = line_str.split('\t')
            # Guard against truncated or blank lines that have no region field.
            if len(fields) > 5 and fields[5] == 'Africa':
                outfile.write(line_str)

In [5]:
def handle_two_part_dates(date):
    """Complete a two-part ``YYYY-MM`` date by pinning it to the first of the month.

    A date with exactly two ``-``-separated parts gets a zero-padded ``01`` day
    appended, so the result has the same ``YYYY-MM-DD`` layout (and the same
    lexicographic ordering) as full dates. Dates with any other number of parts
    are returned unchanged.

    Parameters:
        date: date string such as ``"2021-08"`` or ``"2021-08-15"``.

    Returns:
        The date string, with ``-01`` appended when only two parts were given.
    """
    parts = date.split("-")
    if len(parts) == 2:
        # Zero-padded so the value stays ISO-formatted like the rest of the column.
        parts.append("01")
    return "-".join(parts)
# Load the extracted tab-separated file, using its first column as the index,
# then complete any two-part dates before further processing.
africa_metadata = pd.read_csv('metadata.tsv', sep='\t', index_col=0)
africa_metadata['date'] = africa_metadata['date'].map(handle_two_part_dates)
africa_metadata

Unnamed: 0_level_0,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,region_exposure,country_exposure,...,pango_lineage,GISAID_clade,originating_lab,submitting_lab,authors,url,title,paper_url,date_submitted,purpose_of_sequencing
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Algeria/00003/2021,ncov,EPI_ISL_5052208,?,2021-08-01,Africa,Algeria,Ouargla,,Africa,Algeria,...,B.1.617.2,GK,"NIC, Viral Respiratory Unit",Virology Departement,Fayez Khardine et al,?,?,?,2021-10-11,
Algeria/00018/2021,ncov,EPI_ISL_5052207,?,2021-07-30,Africa,Algeria,Touggourt,,Africa,Algeria,...,AY.20,GK,"NIC, Viral Respiratory Unit",Virology Departement,Fayez Khardine et al,?,?,?,2021-10-11,
Algeria/01064/2021,ncov,EPI_ISL_5052202,?,2021-06-15,Africa,Algeria,Sidi Bel Abbes,,Africa,Algeria,...,B.1.1.7,GR,"NIC, Viral Respiratory Unit",Virology Departement,Fayez Khardine et al,?,?,?,2021-10-11,
Algeria/02615/2021,ncov,EPI_ISL_5052210,?,2021-07-11,Africa,Algeria,Sidi Bel Abbes,,Africa,Algeria,...,AY.100,GK,"NIC, Viral Respiratory Unit",Virology Departement,Fayez Khardine et al,?,?,?,2021-10-11,
Algeria/03594/2021,ncov,EPI_ISL_5052209,?,2021-08-14,Africa,Algeria,Biskra,,Africa,Algeria,...,AY.125,G,"NIC, Viral Respiratory Unit",Virology Departement,Fayez Khardine et al,?,?,?,2021-10-11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
senegal/SN-IRMBO-003411/2021,ncov,EPI_ISL_6274669,?,2021-10-21,Africa,Senegal,Dakar,Diamniadio,Africa,Senegal,...,AY.25,GK,Iressef Genomics lab,IRESSEF,Souleymane MBOUP et al,?,?,?,2021-11-15,
senegal/SN-IRVAC-015/2021,ncov,EPI_ISL_6274754,?,2021-07-16,Africa,Senegal,Dakar,Diamniadio,Africa,Senegal,...,,G,Iressef Genomics lab,IRESSEF,Souleymane MBOUP et al,?,?,?,2021-11-15,
senegal/SN-IRVAC-140-CON/2021,ncov,EPI_ISL_6274744,?,2021-07-18,Africa,Senegal,Dakar,Diamniadio,Africa,Senegal,...,AY.33,GK,Iressef Genomics lab,IRESSEF,Souleymane MBOUP et al,?,?,?,2021-11-15,
senegal/SN-IRVAC-157-CON/2021,ncov,EPI_ISL_6274738,?,2021-07-18,Africa,Senegal,Dakar,Diamniadio,Africa,Senegal,...,AY.34,GK,Iressef Genomics lab,IRESSEF,Souleymane MBOUP et al,?,?,?,2021-11-15,


In [6]:
# Ensure that the date is valid before any further processing.
def is_date_valid(date):
    """Report whether ``date`` parses as a ``%Y-%m-%d`` calendar date.

    Returns True when ``datetime.strptime`` accepts the string, and False when
    it raises ValueError (wrong layout or an impossible date).
    """
    try:
        datetime.strptime(date, "%Y-%m-%d")
    except ValueError:
        parsed_ok = False
    else:
        parsed_ok = True
    return parsed_ok
    
# Keep only the rows whose collection date passes the validity check.
has_valid_date = africa_metadata["date"].map(is_date_valid)
africa_metadata = africa_metadata.loc[has_valid_date]

In [7]:
# Only retain things with good dates: a date with exactly two "-" separators
# has all three components. Done with a vectorized string count instead of a
# Python-level pass over every row.
has_three_part_date = africa_metadata["date"].str.count("-") == 2
africa_metadata = africa_metadata[has_three_part_date]
# Drop things without a Nextstrain clade - these are typically poor quality.
africa_metadata = africa_metadata[africa_metadata["Nextstrain_clade"].notna()]

# Fix up submitting lab names: collapse spelling variants (including typos as
# they appear in the source data) onto one canonical name per lab.

ceri_name = 'CERI, Centre for Epidemic Response and Innovation'
nhls_uct_name = 'NHLS/UCT'
tygerberg_name = 'Division of Medical Virology, Stellenbosch University and NHLS Tygerberg Hospital'
acegid_name = "ACEGID, African Centre of Excellence for Genomics of Infectious Diseases, Redeemer’s University, Ede"

# Exact-match renames, applied in this order. Each key is one literal variant.
# The four CERI variants are separate entries: previously a missing comma
# silently concatenated two of them, so neither was ever matched.
lab_renames = {
    "KRISP, KZn Research Innovation and Sequencing Platform": "KRISP, KZN Research Innovation and Sequencing Platform",
    'CERI, Centre for Epidemic Response and Innvoation, Stellenbosch University and KRISP, KZN Research Innovation and Sequencing Platform, UKZN.': ceri_name,
    'CERI, Centre for Epidemic Response and Innovation, Stellenbosch Univeristy & KRISP, KZN Research Innovation and Sequencing Platform': ceri_name,
    'CERI, Centre for Epidemic Response and Innovation, Stellenbosch University and KRISP, KZN Research Innovation and Sequencing Platform, UKZN.': ceri_name,
    'CERI, Centre for Epidemic Response and Innovation, Stellenbosch University and CERI-KRISP, KZN Research Innovation and Sequencing Platform': ceri_name,
    'National Health Laboratory Service/University of Cape Town (National Health Laboratory Service/University of Cape Town (NHLS/UCT))': nhls_uct_name,
    'National Health Laboratory Service/University of Cape Town (NHLS/UCT)': nhls_uct_name,
    'National Health Laboratory Service/UCT': nhls_uct_name,
    'National Health Laboratory Service (NHLS), Tygerberg': tygerberg_name,
    'Division of Medical Virology, Stellenbosch University and National Health Laboratory Service (NHLS)': tygerberg_name,
    'Division of Medical Virology, National Health Laboratory Service (NHLS), Tygerberg Hospital / Stellenbosch University': tygerberg_name,
    'Stellenbosch University and NHLS': tygerberg_name,
    'National Health Laboratory Services, Virology': tygerberg_name,
    'ZARV, Department Mdeical Virology, University of Pretoria': 'ZARV, Department Medical Virology, University of Pretoria',
    "Where sequence data have been generated and submitted to GISAID": "MRC/UVRI & LSHTM Uganda Research Unit",
    "KEMRI-Wellcome Trust Research Programme,Kilifi": "KEMRI-Wellcome Trust Research Programme/KEMRI-CGMR-C Kilifi",
    "Redeemer's University, ACEGID": acegid_name,
}
for variant_name, canonical_name in lab_renames.items():
    africa_metadata.loc[
        africa_metadata["submitting_lab"] == variant_name, "submitting_lab"
    ] = canonical_name

# Country-scoped renames. These run after the exact renames above, so the
# "National..." prefix rule only sees names that were not already normalised.
# na=False makes rows with a missing lab name simply not match.
is_nigeria = africa_metadata["country"] == "Nigeria"
africa_metadata.loc[
    is_nigeria
    & africa_metadata["submitting_lab"].str.startswith(
        "African Centre of Excellence for Genomics of Infectious Diseases", na=False
    ),
    "submitting_lab",
] = acegid_name
africa_metadata.loc[
    is_nigeria & africa_metadata["submitting_lab"].str.startswith("National", na=False),
    "submitting_lab",
] = "NCDC, National Reference Laboratory, Nigeria Centre for Disease Control, Gaduwa, Abuja, Nigeria"
africa_metadata.loc[
    (africa_metadata["country"] == "Democratic Republic of the Congo")
    & (
        africa_metadata["submitting_lab"]
        == "Pathogen Sequencing Lab, National Institute for Biomedical Research (INRB)"
    ),
    "submitting_lab",
] = "INRB, Pathogen Sequencing Lab, National Institute for Biomedical Research"

# Add date year / month fields: each value is the date string with its last
# "-"-separated component removed (e.g. "2021-08-14" -> "2021-08").
# Uses the vectorized .str accessors rather than a Python call per row.
africa_metadata["date_yearmon"] = (
    africa_metadata["date"].str.split("-").str[:-1].str.join("-")
)
africa_metadata["date_submitted_yearmon"] = (
    africa_metadata["date_submitted"].str.split("-").str[:-1].str.join("-")
)

# Calculate the number of whole days between sample collection (`date`) and
# sample submission (`date_submitted`); both are parsed as %Y-%m-%d.
africa_metadata["days_to_submit"] = [
    (
        datetime.strptime(submitted_on, "%Y-%m-%d")
        - datetime.strptime(collected_on, "%Y-%m-%d")
    ).days
    for collected_on, submitted_on in zip(
        africa_metadata["date"], africa_metadata["date_submitted"]
    )
]

In [8]:
# Metadata country names that need a hand-assigned ISO three-letter code.
# These explicit entries take precedence over any automatic match.
country_lookup = {
    'Democratic Republic of the Congo': 'COD',
    'Cabo Verde': 'CPV',
    "Côte d'Ivoire": 'CIV',
    'Eswatini': 'SWZ',
    'Republic of the Congo': 'COG',
    'Union of the Comoros': 'COM',
}


def _match_country_rows(country_name):
    """Return the rows of `countries` that best match `country_name`.

    Matching goes from most to least specific and stops at the first tier with
    any hit: exact name, name followed by a comma (the table has entries shaped
    like "Zambia, Republic of"), name as a prefix, then plain substring. A bare
    substring test alone is ambiguous - "Guinea" is contained in
    "Equatorial Guinea" and "Niger" in "Nigeria" - so taking the first substring
    hit could return the wrong country. All comparisons are literal, not regex.

    Returns an empty frame when nothing matches.
    """
    names = countries.Country_Name
    tiers = (
        names == country_name,
        names.str.startswith(country_name + ",", na=False),
        names.str.startswith(country_name, na=False),
        names.str.contains(country_name, regex=False, na=False),
    )
    for tier_mask in tiers:
        if tier_mask.any():
            return countries[tier_mask]
    return countries.iloc[0:0]


for country_name in africa_metadata.country.unique():
    if country_name in country_lookup:
        continue
    matched_rows = _match_country_rows(country_name)
    if len(matched_rows) != 0:
        country_lookup[country_name] = str(matched_rows.Three_Letter_Country_Code.iloc[0])
    else:
        # Report names that need a manual entry; the lookup below raises KeyError for them.
        print(country_name)
africa_metadata['country_iso_3'] = africa_metadata.country.apply(lambda c: country_lookup[c])

# Attach each row's income group by looking up its ISO three-letter code in the
# classification table; codes with no entry yield NaN.
# NOTE(review): assumes each Code appears only once in the table - confirm.
income_group_by_code = country_income_classification.set_index('Code')['Income group']
africa_metadata['income_group'] = africa_metadata['country_iso_3'].map(income_group_by_code)

In [9]:
# Make a dict mapping GISAID country name to ISO 2 letter code.
# Start from hand-assigned entries; an unambiguous match in `countries`
# (below) overwrites or adds to them.
name_to_two_letter_code = {
    "Union of the Comoros": "KM",
    "Republic of the Congo": "CG",
    "Côte d'Ivoire": "CI",
    "Democratic Republic of the Congo": "CD",
    "Eswatini": "SZ",
    "Guinea": "GN",
}
for country_name in africa_metadata.country.unique():
    # Literal (non-regex) substring match against the reference table.
    country_info = countries[
        countries.Country_Name.str.contains(country_name, regex=False, na=False)
    ]
    if len(country_info) > 1:
        # The name is also part of other countries' names (e.g. "Niger" inside
        # "Nigeria"). Narrow to the exact name or the "<name>, ..." form the
        # table uses, so the country is not silently skipped.
        narrowed_info = country_info[
            (country_info.Country_Name == country_name)
            | country_info.Country_Name.str.startswith(country_name + ",")
        ]
        if len(narrowed_info) == 1:
            country_info = narrowed_info
    if len(country_info) == 1:
        name_to_two_letter_code[country_name] = country_info.iloc[
            0
        ].Two_Letter_Country_Code
# Displays the match for the last country processed by the loop.
country_info

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
257,Africa,AF,"Zambia, Republic of",ZM,ZMB,894.0


In [10]:
# Stacked bar chart of sample counts per country, coloured by submitting lab.
# .count() gives the non-null count of every column per group; the `virus`
# column is the one used as the sample count.
samples_per_lab = (
    africa_metadata.groupby(["country", "submitting_lab"])
    .count()
    .reset_index()
    .sort_values("virus", ascending=False)
)
alt.Chart(samples_per_lab).mark_bar().encode(
    x=alt.X("country", title="Country", sort="-y"),
    y=alt.Y("sum(virus)", title="Number of Samples"),
    color=alt.Color("submitting_lab", sort="-y"),
    tooltip=["submitting_lab", "sum(virus)"],
)

In [11]:
# Bar chart of sample counts per collection month, coloured by country.
samples_by_collection_month = (
    africa_metadata.groupby(["date_yearmon", "country"]).count().reset_index()
)
alt.Chart(samples_by_collection_month, width=600).mark_bar().encode(
    x=alt.X("date_yearmon:O", title="Sample Date"),
    y=alt.Y("virus", title="Number of Samples"),
    color=alt.Color("country", sort="-y"),
    tooltip="country",
)

In [12]:
# Bar chart of sample counts per submission month, coloured by country.
samples_by_submission_month = (
    africa_metadata.groupby(["date_submitted_yearmon", "country"])
    .count()
    .reset_index()
)
alt.Chart(samples_by_submission_month, width=600).mark_bar().encode(
    x=alt.X("date_submitted_yearmon:O", title="Submission Date"),
    # The y channel must be built with alt.Y (it was mistakenly alt.X).
    y=alt.Y("virus", title="Number of Samples"),
    color=alt.Color("country", sort="-y"),
    tooltip="country",
)

In [13]:
# Per-country summary in one grouped pass: the mean collection-to-submission
# delay, plus the country's income group. The income group is derived from the
# country, so it is the same for every row of a group and "first" picks it up
# without rescanning the full frame once per country.
# Resulting columns: country, days_to_submit, income_category.
data = (
    africa_metadata.groupby("country")
    .agg(
        days_to_submit=("days_to_submit", "mean"),
        income_category=("income_group", "first"),
    )
    .reset_index()
)

alt.Chart(
    data
).mark_bar().encode(
    x=alt.X("country", title="Country"),
    y=alt.Y("days_to_submit", title="Days to submit sample to GISAID"),
    color='income_category',
)

In [14]:
# Same per-country delay chart, with the x axis ordered by the y value.
data_by_delay = data.sort_values('days_to_submit')
alt.Chart(data_by_delay).mark_bar().encode(
    x=alt.X("country", title="Country", sort='y'),
    y=alt.Y("days_to_submit", title="Days to submit sample to GISAID"),
    color='income_category',
)


In [15]:
def date_greater_filter(date, cutoff):
    """Return True when ``date`` is on or after ``cutoff``.

    ``date`` is a ``-``-separated string (year, optionally month and day) and
    ``cutoff`` is a ``[year, month, day]`` list. A missing month counts as 0;
    the day counts as 0 unless the string has exactly three parts. The two
    ``[year, month, day]`` lists are compared element by element.
    """
    pieces = date.split('-')
    piece_count = len(pieces)
    day = int(pieces[2]) if piece_count == 3 else 0
    month = int(pieces[1]) if piece_count >= 2 else 0
    year = int(pieces[0])
    as_ymd = [year, month, day]
    return as_ymd >= cutoff

def date_lesser_filter(date, cutoff):
    """Return True when ``date`` is strictly before ``cutoff``.

    This is the exact negation of ``date_greater_filter(date, cutoff)``.
    """
    on_or_after_cutoff = date_greater_filter(date, cutoff)
    return not on_or_after_cutoff

# Keep only genomes submitted before 1 March 2022, then plot the number of
# submissions per submission month as a smoothed line.
submitted_before_cutoff = africa_metadata['date_submitted'].map(
    lambda submitted: date_lesser_filter(submitted, [2022,3,1])
)
data = africa_metadata[submitted_before_cutoff]
genomes_per_month = data.groupby('date_submitted_yearmon').count().reset_index()
alt.Chart(
    genomes_per_month,
    title="Number of Genomes Per Month submitted to GISAID from Africa",
    width=600,
).mark_line(interpolate="monotone").encode(
    alt.X("date_submitted_yearmon", title="Year / Month"),
    alt.Y('virus', title="Number of Genomes"),
)

## Work in Progress

Everything below here is a work in progress

In [16]:
# Take the last 5000 rows after sorting by submission date and plot each one
# as a point: collection date against country, coloured by Nextstrain clade.
data = africa_metadata.sort_values(by='date_submitted').iloc[-5000:]
clade_points = alt.Chart(data).mark_point().encode(
    x='date',
    y='country',
    color='Nextstrain_clade',
)
clade_points.configure_view(step=10)

In [17]:
# Display the last 5000 rows after sorting by submission date.
africa_metadata.sort_values(by='date_submitted').iloc[-5000:]

Unnamed: 0_level_0,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,location,region_exposure,country_exposure,...,url,title,paper_url,date_submitted,purpose_of_sequencing,date_yearmon,date_submitted_yearmon,days_to_submit,country_iso_3,income_group
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SouthAfrica/NHLS-UCT-GS-AZ37/2022,ncov,EPI_ISL_10815251,?,2022-01-25,Africa,South Africa,Western Cape Province,,Africa,South Africa,...,?,?,?,2022-03-08,,2022-01,2022-03,42,ZAF,Upper middle income
SouthAfrica/NHLS-UCT-LA-Z883/2022,ncov,EPI_ISL_10815330,?,2022-02-09,Africa,South Africa,Western Cape Province,,Africa,South Africa,...,?,?,?,2022-03-08,,2022-02,2022-03,27,ZAF,Upper middle income
SouthAfrica/NHLS-UCT-GS-BD04/2022,ncov,EPI_ISL_10815309,?,2022-02-08,Africa,South Africa,Western Cape Province,,Africa,South Africa,...,?,?,?,2022-03-08,,2022-02,2022-03,28,ZAF,Upper middle income
SouthAfrica/NHLS-UCT-GS-BD06/2022,ncov,EPI_ISL_10815310,?,2022-02-08,Africa,South Africa,Western Cape Province,,Africa,South Africa,...,?,?,?,2022-03-08,,2022-02,2022-03,28,ZAF,Upper middle income
SouthAfrica/NHLS-UCT-GS-BD07/2022,ncov,EPI_ISL_10815311,?,2022-02-08,Africa,South Africa,Western Cape Province,,Africa,South Africa,...,?,?,?,2022-03-08,,2022-02,2022-03,28,ZAF,Upper middle income
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SouthAfrica/1069036/2022,ncov,EPI_ISL_11325914,?,2022-02-21,Africa,South Africa,Limpopo,,Africa,South Africa,...,?,?,?,2022-03-23,,2022-02,2022-03,30,ZAF,Upper middle income
SouthAfrica/1068663/2022,ncov,EPI_ISL_11325913,?,2022-02-23,Africa,South Africa,Limpopo,,Africa,South Africa,...,?,?,?,2022-03-23,,2022-02,2022-03,28,ZAF,Upper middle income
SouthAfrica/1068658/2022,ncov,EPI_ISL_11325912,?,2022-02-17,Africa,South Africa,Limpopo,,Africa,South Africa,...,?,?,?,2022-03-23,,2022-02,2022-03,34,ZAF,Upper middle income
SouthAfrica/1063016/2022,ncov,EPI_ISL_11325904,?,2022-01-11,Africa,South Africa,Limpopo,,Africa,South Africa,...,?,?,?,2022-03-23,,2022-01,2022-03,71,ZAF,Upper middle income


In [18]:
# List the column labels of `data`, the frame assigned in an earlier cell.
data.columns

Index(['virus', 'gisaid_epi_isl', 'genbank_accession', 'date', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pango_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing', 'date_yearmon',
       'date_submitted_yearmon', 'days_to_submit', 'country_iso_3',
       'income_group'],
      dtype='object')