In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio import SeqIO
import os

In [None]:
# Folder holding the FASTA files and metadata.csv. Keep the trailing slash:
# other cells build file paths from this value by plain string concatenation.
DATASET_PATH = "Covid India Dataset/"
# os.listdir returns entries in arbitrary order, so sort them to get the same
# listing on every run and every machine.
sorted(os.listdir(DATASET_PATH))

In [43]:
# Load every record from each *.fasta file in the dataset folder.
# File names are sorted because os.listdir order is arbitrary; sorting keeps
# the order of the loaded records the same from run to run.
fasta_sequences = []
for fasta in sorted(os.listdir(DATASET_PATH)):
    if not fasta.endswith(".fasta"):
        continue
    # os.path.join does not depend on DATASET_PATH ending in a path separator.
    fasta_path = os.path.join(DATASET_PATH, fasta)
    # SeqIO.parse yields the file's records one by one; collect all of them.
    fasta_sequences.extend(SeqIO.parse(fasta_path, "fasta"))



In [44]:
# For each loaded record, show its identifier, the (truncated) repr of its
# sequence and its length, followed by a blank separator line.
for record in fasta_sequences:
    print(record.id, repr(record.seq), len(record), "", sep="\n")

OQ852573
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...CCT')
29711

OQ852572
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...AAA')
29730

OQ852571
Seq('ATTACTGGCTTATACCCAACAAACCAACCAACTTTTGATCTCTTGTAGATCTGT...TAT')
29730

OQ852570
Seq('CCCAGGTAACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACG...CAA')
29687

OQ852569
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...ACG')
29699

OQ852568
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...AAT')
29729

OQ852567
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...CAA')
29719

OQ852566
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...ATC')
29757

OQ852565
Seq('TAACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTT...ATT')
29714

OQ852564
Seq('ATTAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTTGATCTCTTGTA...ATT')
29738

OQ852563
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...AAT')
29729

OQ852562
Seq('AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTA...GAA')

In [31]:
# Read the metadata table from the dataset folder and preview its first rows.
metadata_csv = DATASET_PATH + "metadata.csv"
metadata = pd.read_csv(metadata_csv)
metadata.head()

Unnamed: 0,Virus Strain Name,Accession ID,Data Source,Related ID,Lineage,Nuc.Completeness,Sequence Length,Sequence Quality,Quality Assessment,Host,Sample Collection Date,Location,Originating Lab,Submission Date,Submitting Lab,Create Time,Last Update Time
0,SARS-CoV-2/human/IND/E/2022,OR357670.1,GenBank,,BA.2.75,Complete,29714,0,141/0/-1/-1/-1,Homo sapiens,04-01-2022,India / Uttar Pradesh / Noida,,28-07-2023,,Fri Aug 25 16:41:55 CST 2023,Fri Aug 25 16:41:55 CST 2023
1,SARS-CoV-2/human/IND/C/2022,OR357669.1,GenBank,,BA.2,Complete,29677,1,"13/0/4/72/10447~10449(3-2-0.67,SNP:10447; SNP:...",Homo sapiens,04-01-2022,India / Uttar Pradesh / Noida,,28-07-2023,,Fri Aug 25 16:41:55 CST 2023,Fri Aug 25 16:55:12 CST 2023
2,SARS-CoV-2/human/IND/B/2022,OR357668.1,GenBank,,BA.2,Complete,29712,1,"13/0/4/71/10447~10449(3-2-0.67,SNP:10447; SNP:...",Homo sapiens,04-01-2022,India / Uttar Pradesh / Noida,,28-07-2023,,Fri Aug 25 16:41:55 CST 2023,Fri Aug 25 16:55:12 CST 2023
3,SARS-CoV-2/human/IND/A/2022,OR357667.1,GenBank,,BA.2,Complete,29716,0,34/0/-1/-1/-1,Homo sapiens,04-01-2022,India / Uttar Pradesh / Noida,,28-07-2023,,Fri Aug 25 16:41:55 CST 2023,Fri Aug 25 16:41:55 CST 2023
4,SARS-CoV-2/human/IND/W/2022,OR357654.1,GenBank,,BA.5.1.30,Complete,29762,1,"2/0/5/74/10447~10449(3-2-0.67,SNP:10447; SNP:1...",Homo sapiens,20-07-2022,India / Uttar Pradesh / Noida,,28-07-2023,,Fri Aug 25 16:41:55 CST 2023,Fri Aug 25 16:55:12 CST 2023


In [32]:
# Keep only the columns needed downstream: accession ID, related ID and location.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise pandas' SettingWithCopyWarning.
metadata = metadata[["Accession ID", "Related ID", "Location"]].copy()
metadata.head()

Unnamed: 0,Accession ID,Related ID,Location
0,OR357670.1,,India / Uttar Pradesh / Noida
1,OR357669.1,,India / Uttar Pradesh / Noida
2,OR357668.1,,India / Uttar Pradesh / Noida
3,OR357667.1,,India / Uttar Pradesh / Noida
4,OR357654.1,,India / Uttar Pradesh / Noida


In [33]:
def _clean_identifier(value):
    """Return ``value`` as text, or "" if it is missing or starts with "EPI_ISL_"."""
    # str(NaN) is the literal text "nan", which would look like a real ID and
    # would stop the later "use the related ID when the accession ID is empty"
    # step from ever firing. Map missing values to "" explicitly instead.
    if pd.isna(value):
        return ""
    text = str(value)
    return "" if text.startswith("EPI_ISL_") else text


# Blank out missing and EPI_ISL_-prefixed identifiers in both ID columns.
for id_column in ("Accession ID", "Related ID"):
    metadata[id_column] = metadata[id_column].apply(_clean_identifier)

In [35]:
# Wherever the accession ID is an empty string, fall back to the related ID;
# every other row keeps its accession ID unchanged.
accession_is_blank = metadata["Accession ID"] == ""
metadata["Accession ID"] = metadata["Accession ID"].mask(accession_is_blank, metadata["Related ID"])

In [37]:
# Drop the related ID column; only the accession ID and location are kept.
# .copy() detaches the result from the wider frame, so the later column
# assignment on it does not trigger pandas' SettingWithCopyWarning.
metadata = metadata[["Accession ID", "Location"]].copy()
metadata.head()

Unnamed: 0,Accession ID,Location
0,OR357670.1,India / Uttar Pradesh / Noida
1,OR357669.1,India / Uttar Pradesh / Noida
2,OR357668.1,India / Uttar Pradesh / Noida
3,OR357667.1,India / Uttar Pradesh / Noida
4,OR357654.1,India / Uttar Pradesh / Noida


In [38]:
# Strip a trailing ".<digits>" version suffix (".1", ".2", ".10", ...) from each
# accession ID. Removing only ".1" would leave any other version in place, and
# such IDs would then silently fail to match in the merge on the ID column
# (the FASTA record IDs printed in this notebook carry no version suffix).
metadata["Accession ID"] = (
    metadata["Accession ID"].astype(str).str.replace(r"\.\d+$", "", regex=True)
)

In [40]:
# Call the key column "ID" so it has the same name as the ID column of the
# FASTA table it is merged with.
metadata = metadata.rename({"Accession ID": "ID"}, axis="columns")

In [41]:
# Preview the cleaned metadata (first five rows).
metadata.head(n=5)

Unnamed: 0,ID,Location
0,OR357670,India / Uttar Pradesh / Noida
1,OR357669,India / Uttar Pradesh / Noida
2,OR357668,India / Uttar Pradesh / Noida
3,OR357667,India / Uttar Pradesh / Noida
4,OR357654,India / Uttar Pradesh / Noida


In [47]:
# Build a two-column table (ID, Sequence) with one row per loaded FASTA record,
# storing both the record id and the sequence as plain strings.
fasta_rows = [(str(record.id), str(record.seq)) for record in fasta_sequences]
fasta_seq = pd.DataFrame(fasta_rows, columns=["ID", "Sequence"])
fasta_seq.head()

Unnamed: 0,ID,Sequence
0,OQ852573,AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAAC...
1,OQ852572,AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAAC...
2,OQ852571,ATTACTGGCTTATACCCAACAAACCAACCAACTTTTGATCTCTTGT...
3,OQ852570,CCCAGGTAACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTC...
4,OQ852569,AACAAACCAACCAACTTTTGATCTCTTGTAGATCTGTTCTCTAAAC...


In [48]:
# Join locations to sequences on the shared "ID" column. This is an inner join
# (the pandas default), so only IDs present in both tables are kept.
merged = metadata.merge(fasta_seq, how="inner", on="ID")
merged.head()

Unnamed: 0,ID,Location,Sequence
0,MW881790,India,GGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTT...
1,MZ401494,India,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
2,MZ702725,India,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
3,MZ557829,India,ACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACT...
4,ON052753,India,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...


In [49]:
# Write the merged table to merged.csv inside the dataset folder, leaving out
# the DataFrame index.
merged_csv = DATASET_PATH + "merged.csv"
merged.to_csv(merged_csv, index=False)