In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# read dataset using pandas
df = pd.read_csv("ncbi_datasets.csv", low_memory=False)

In [3]:
df.head()

Unnamed: 0,Nucleotide Accession,Species Taxonomy Id,Species Name,Virus Genus,Virus Family,Isolate Name,Nucleotide Length,Sequence Type,Nuc Completeness,Geo Location,US State,Host Name,Host Taxonomy ID,Collection Date,BioProject,BioSample
0,NC_045512.2,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,Wuhan-Hu-1,29903,RefSeq,complete,Asia; China,,,9606,2019-12,PRJNA485481,
1,OM487257.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-220129247957/2021,29815,GenBank,complete,North America; USA,,,9606,2021-12-23,PRJNA614995,SAMN25559487
2,OM403304.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/VM20001061-2/2020,29870,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,
3,OM403303.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/Original/2020,29851,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,
4,OM401120.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-211112790633/2021,29712,GenBank,complete,North America; USA,,,9606,2021-09-25,PRJNA614995,SAMN23427251


In [4]:
df.columns

Index(['Nucleotide Accession', 'Species Taxonomy Id', 'Species Name',
       'Virus Genus', 'Virus Family', 'Isolate Name', 'Nucleotide Length',
       'Sequence Type', 'Nuc Completeness', 'Geo Location', 'US State',
       'Host Name', 'Host Taxonomy ID', 'Collection Date', 'BioProject',
       'BioSample'],
      dtype='object')

In [5]:
# replace spaces in column names with '_' for easy indexing
df.columns = [col.lower().replace(' ','_') for col in df.columns]
df.columns

Index(['nucleotide_accession', 'species_taxonomy_id', 'species_name',
       'virus_genus', 'virus_family', 'isolate_name', 'nucleotide_length',
       'sequence_type', 'nuc_completeness', 'geo_location', 'us_state',
       'host_name', 'host_taxonomy_id', 'collection_date', 'bioproject',
       'biosample'],
      dtype='object')

In [6]:
df.head(5)

Unnamed: 0,nucleotide_accession,species_taxonomy_id,species_name,virus_genus,virus_family,isolate_name,nucleotide_length,sequence_type,nuc_completeness,geo_location,us_state,host_name,host_taxonomy_id,collection_date,bioproject,biosample
0,NC_045512.2,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,Wuhan-Hu-1,29903,RefSeq,complete,Asia; China,,,9606,2019-12,PRJNA485481,
1,OM487257.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-220129247957/2021,29815,GenBank,complete,North America; USA,,,9606,2021-12-23,PRJNA614995,SAMN25559487
2,OM403304.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/VM20001061-2/2020,29870,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,
3,OM403303.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/Original/2020,29851,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,
4,OM401120.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-211112790633/2021,29712,GenBank,complete,North America; USA,,,9606,2021-09-25,PRJNA614995,SAMN23427251


In [7]:
# create a continent column to show the continent of a particular sample datapoint. 
# This is extracted from the 'geo_location' column. where we used the pandas 'str'
# method to enable the use of string functions like replace.
df['continent'] = df['geo_location'].str.replace(';.+', '', regex=True)

In [8]:
df['country'] = df['geo_location'].str.replace('.+; ', '', regex=True)

In [9]:
df.head()

Unnamed: 0,nucleotide_accession,species_taxonomy_id,species_name,virus_genus,virus_family,isolate_name,nucleotide_length,sequence_type,nuc_completeness,geo_location,us_state,host_name,host_taxonomy_id,collection_date,bioproject,biosample,continent,country
0,NC_045512.2,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,Wuhan-Hu-1,29903,RefSeq,complete,Asia; China,,,9606,2019-12,PRJNA485481,,Asia,China
1,OM487257.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-220129247957/2021,29815,GenBank,complete,North America; USA,,,9606,2021-12-23,PRJNA614995,SAMN25559487,North America,USA
2,OM403304.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/VM20001061-2/2020,29870,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,,Asia,Hong Kong
3,OM403303.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/HKG/Original/2020,29851,GenBank,complete,Asia; Hong Kong,,,9606,2020-01-23,,,Asia,Hong Kong
4,OM401120.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/USA/UT-UPHL-211112790633/2021,29712,GenBank,complete,North America; USA,,,9606,2021-09-25,PRJNA614995,SAMN23427251,North America,USA


In [10]:
africa_genome = df.loc[(df["continent"]=='Africa')]
africa_genome

Unnamed: 0,nucleotide_accession,species_taxonomy_id,species_name,virus_genus,virus_family,isolate_name,nucleotide_length,sequence_type,nuc_completeness,geo_location,us_state,host_name,host_taxonomy_id,collection_date,bioproject,biosample,continent,country
36,OL840814.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/ETH/S13/2021,29817,GenBank,complete,Africa; Ethiopia: Addis Ababa,,,9606,2021-02-01,,,Africa,Ethiopia: Addis Ababa
51,OL601931.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/MLI/MAL0093/2021,29777,GenBank,complete,Africa; Mali: Bamako,,,9606,2021-01-29,,,Africa,Mali: Bamako
55,OK449157.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/KEN/K104846/2020,29782,GenBank,complete,Africa; Kenya,,,9606,2020-11-23,,,Africa,Kenya
100,MZ287363.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/UGA/MAKCHScov5/2020,29820,GenBank,complete,Africa; Uganda,,,9606,2020-09-30,,,Africa,Uganda
101,MZ287350.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/UGA/MAKCHScov6/2020,29814,GenBank,complete,Africa; Uganda,,,9606,2020-09-30,,,Africa,Uganda
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842297,MZ149978.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/SLE/nmimr-SLE-SARS-CoV-2-SLSE...,29751,GenBank,complete,Africa; Sierra Leone,,,9606,2021-01-27,,,Africa,Sierra Leone
842729,MW598432.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-NTRA-153...,29903,GenBank,complete,Africa; Ghana,,,9606,2020-12-27,,,Africa,Ghana
842730,MW598415.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-TRA-188/...,29903,GenBank,complete,Africa; Ghana,,,9606,2021-01-12,,,Africa,Ghana
842731,MW598412.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-TRA-182/...,29903,GenBank,complete,Africa; Ghana,,,9606,2021-01-12,,,Africa,Ghana


In [11]:
gh_genome = africa_genome.loc[(africa_genome["country"]=='Ghana')]
gh_genome

Unnamed: 0,nucleotide_accession,species_taxonomy_id,species_name,virus_genus,virus_family,isolate_name,nucleotide_length,sequence_type,nuc_completeness,geo_location,us_state,host_name,host_taxonomy_id,collection_date,bioproject,biosample,continent,country
125,MW523409.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/WACCBIP_nCoV_GS28/2020,29782,GenBank,complete,Africa; Ghana,,,9606,2020-07-21,,,Africa,Ghana
302,MT890243.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/84438_S50/2020,29863,GenBank,complete,Africa; Ghana,,,9606,2020-05-24,,,Africa,Ghana
303,MT890238.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/81216_S36/2020,29828,GenBank,complete,Africa; Ghana,,,9606,2020-05-20,,,Africa,Ghana
2526,MT890230.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/35563_S37/2020,29823,GenBank,complete,Africa; Ghana,,,9606,2020-05-24,,,Africa,Ghana
2532,MW598425.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-NTRA-148...,29903,GenBank,complete,Africa; Ghana,,,9606,2020-12-23,,,Africa,Ghana
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824271,MW598407.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-TRA-140/...,29903,GenBank,complete,Africa; Ghana,,,9606,2021-01-10,,,Africa,Ghana
842729,MW598432.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-NTRA-153...,29903,GenBank,complete,Africa; Ghana,,,9606,2020-12-27,,,Africa,Ghana
842730,MW598415.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-TRA-188/...,29903,GenBank,complete,Africa; Ghana,,,9606,2021-01-12,,,Africa,Ghana
842731,MW598412.1,2697049,Severe acute respiratory syndrome coronavirus 2,Betacoronavirus,Coronaviridae,SARS-CoV-2/human/GHA/nmimr-SARS-CoV-2-TRA-182/...,29903,GenBank,complete,Africa; Ghana,,,9606,2021-01-12,,,Africa,Ghana


In [15]:
africa_genome.country.value_counts().head()

Egypt           974
Djibouti        269
Nigeria: Oyo    232
Kenya           197
Ghana           185
Name: country, dtype: int64

In [None]:
# from Bio.PDB import *