In [None]:
import pandas as pd
from Bio import SeqIO
import os

In [None]:
# Directory holding the FASTA files and metadata.csv. The trailing slash matters:
# other cells build file paths by plain string concatenation with this value.
DATASET_PATH = "Covid India Dataset/"
# os.listdir() returns entries in arbitrary order; sort so the listing shown here
# is the same on every run and on every machine.
sorted(os.listdir(DATASET_PATH))

In [None]:
# Parse every *.fasta file in the dataset directory into one flat list of records.
# os.listdir() yields names in arbitrary order, so they are sorted to make the
# record order (and anything derived from it) reproducible between runs.
fasta_sequences = []
for fasta in sorted(os.listdir(DATASET_PATH)):
    if fasta.endswith(".fasta"):
        # os.path.join gives the same path as concatenation today and keeps working
        # if DATASET_PATH is ever written without its trailing slash.
        fasta_sequences.extend(SeqIO.parse(os.path.join(DATASET_PATH, fasta), "fasta"))



In [None]:
# Print a short summary of every parsed record: its ID, the repr of its sequence
# and its length, each on its own line, followed by a blank separator line.
for seq_record in fasta_sequences:
    print(seq_record.id, repr(seq_record.seq), len(seq_record), "", sep="\n")

In [None]:
# Load the metadata table stored in the dataset directory and preview its first rows.
metadata_csv = DATASET_PATH + "metadata.csv"
metadata = pd.read_csv(metadata_csv)
metadata.head()

In [None]:
# Keep only the columns used downstream: accession ID, related ID and location.
metadata = metadata.loc[:, ["Accession ID", "Related ID", "Location"]]
metadata.head()

In [None]:
def _blank_unusable_id(value):
    """Normalise one ID cell to a string, blanking values that cannot be used.

    Returns "" when the cell is missing (NaN/None) or when its text starts with
    "EPI_ISL_"; otherwise returns the cell converted with str().

    Missing cells are blanked explicitly because str(NaN) is the literal "nan",
    which would not be recognised as empty by the related-ID fallback that follows.
    """
    if pd.isna(value):
        return ""
    text = str(value)
    return "" if text.startswith("EPI_ISL_") else text


# Apply the same normalisation to both ID columns.
for id_column in ("Accession ID", "Related ID"):
    metadata[id_column] = metadata[id_column].map(_blank_unusable_id)

In [None]:
# Wherever the accession ID is the empty string, fall back to the related ID.
# Series.mask does the replacement column-wise instead of calling a Python lambda
# once per row, and it still yields a Series when the frame has no rows.
accession_is_blank = metadata["Accession ID"] == ""
metadata["Accession ID"] = metadata["Accession ID"].mask(accession_is_blank, metadata["Related ID"])

In [None]:
# The related ID has been folded into the accession ID, so keep only the
# accession ID and location columns.
metadata = metadata.loc[:, ["Accession ID", "Location"]]
metadata.head()

In [None]:
def _strip_dot_one(accession):
    """Drop a trailing ".1" from an accession ID; otherwise return it as a string.

    Only the literal suffix ".1" is removed. IDs ending in any other suffix
    (for example ".2") are returned unchanged.
    """
    text = str(accession)
    if text.endswith(".1"):
        return accession[:-2]
    return text


metadata["Accession ID"] = metadata["Accession ID"].apply(_strip_dot_one)

In [None]:
# Rename the "Accession ID" column to "ID".
metadata = metadata.rename({"Accession ID": "ID"}, axis="columns")

In [None]:
# Preview the first five rows of the metadata table.
metadata.head(n=5)

In [None]:
# Build a two-column table (ID, Sequence) with one row per parsed FASTA record,
# storing both the record ID and the sequence as plain strings.
fasta_seq = pd.DataFrame(
    [(str(seq_record.id), str(seq_record.seq)) for seq_record in fasta_sequences],
    columns=["ID", "Sequence"],
)
fasta_seq.head()

In [None]:
# Join metadata to sequences on the shared "ID" column. This is an inner join, so
# rows whose ID appears in only one of the two tables are dropped.
# NOTE(review): metadata IDs had a trailing ".1" stripped in an earlier cell;
# confirm the FASTA record IDs are unversioned too, otherwise those rows will not match.
merged = metadata.merge(fasta_seq, on="ID", how="inner")
merged.head()

In [None]:
# Write the joined table into the dataset directory; the row index is not written.
merged_csv = DATASET_PATH + "merged.csv"
merged.to_csv(merged_csv, index=False)