In [1]:
def read_fasta(fasta_file):
    """Open a FASTA file and return a lazy, single-pass iterator over its records.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file. The file is decoded as ISO-8859-1.

    Returns
    -------
    generator
        Yields whatever ``SeqIO.parse(handle, 'fasta')`` yields, in file order.
        Call ``read_fasta`` again to iterate a second time.

    Raises
    ------
    OSError
        If the file cannot be opened. This is raised at call time, not on the
        first iteration.
    """
    from Bio import SeqIO

    # Open eagerly so that a wrong path fails right here rather than later,
    # inside whatever loop ends up consuming the records.
    handle = open(fasta_file, encoding="iso-8859-1")

    def _records():
        # Previously the parser was returned over a handle that nothing ever
        # closed. Letting the generator own the handle closes it as soon as
        # the records are exhausted (or the started generator is closed).
        with handle:
            yield from SeqIO.parse(handle, 'fasta')

    return _records()

In [4]:
import pandas as pd

# Metadata table, one row per sample; the file is tab-separated, which is
# the default separator of read_table.
df = pd.read_table("metadata/metadata_filtered.tsv")

In [29]:
omic_acc = 'Former VOC Omicron GRA (B.1.1.529+BA.*) first detected in Botswana/Hong Kong/South Africa'
# Exclusive lower bound for 'Collection date'.
# NOTE(review): the comment in this cell used to say 2023-06-01 while the code
# compared against 2023-05-20; the executed value is kept here - confirm which
# cutoff was intended.
omic_min_collection_date = '2023-05-20'

# rows of the selected variant that carry an accession id
df_omic = df[(df['Variant'] == omic_acc) & df['Accession ID'].notnull()]

collection_date = df_omic['Collection date']
# keep only complete yyyy-mm-dd dates: fullmatch (unlike match, which is only
# anchored at the start) also rejects values with trailing characters, and
# na=False drops rows whose date is missing
has_full_date = collection_date.str.fullmatch(r'\d{4}-\d{2}-\d{2}', na=False)
# yyyy-mm-dd strings sort chronologically, so a string comparison is enough
is_recent = collection_date > omic_min_collection_date

df_omic_new = df_omic[has_full_date & is_recent].reset_index(drop=True)
df_omic_new

Unnamed: 0,Virus name,Accession ID,Variant,Pango lineage,Clade,Collection date,Submission date,Location,Host
0,hCoV-19/USA/CA-HLX-STM-2G38E65FT/2023,EPI_ISL_18260366,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,XBC.1.6.3,GRA,2023-08-23,2023-09-14,North America / USA / California,Human
1,hCoV-19/USA/CA-HLX-STM-F2E2UZDKH/2023,EPI_ISL_18260880,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1.1,GRA,2023-08-21,2023-09-14,North America / USA / California,Human
2,hCoV-19/Spain/AS-HUCA-232339160/2023,EPI_ISL_17949299,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,BA.2,GRA,2023-06-16,2023-06-29,Europe / Spain / Asturias,Human
3,hCoV-19/Japan/PG-586620/2023,EPI_ISL_18888903,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1,GRA,2023-10-13,2024-02-14,Asia / Japan / Wakayama,Human
4,hCoV-19/Japan/PG-593636/2023,EPI_ISL_18888929,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,XBC.1.3,GRA,2023-12-14,2024-02-14,Asia / Japan / Wakayama,Human
...,...,...,...,...,...,...,...,...,...
3637,hCoV-19/Israel/ICH-741198878/2023,EPI_ISL_18255299,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1.1,GRA,2023-09-04,2023-09-13,Asia / Israel,Human
3638,hCoV-19/Netherlands/ZH-EMC-8206/2023,EPI_ISL_18255778,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1.1,GRA,2023-09-02,2023-09-13,Europe / Netherlands / Zuid-Holland,Human
3639,hCoV-19/USA/TX-HMH-M-132614/2023,EPI_ISL_18256159,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1.1,GRA,2023-08-26,2023-09-13,North America / USA / Texas / Houston,Human
3640,hCoV-19/USA/TX-HMH-M-132276/2023,EPI_ISL_18256302,Former VOC Omicron GRA (B.1.1.529+BA.*) first ...,HW.1.1,GRA,2023-08-23,2023-09-13,North America / USA / Texas / Houston,Human


# VARIANT SELECTION (Omicron) Takes around 7 minutes

In [21]:
# read_fasta hands back the object produced by SeqIO.parse without looping over
# it, so the records are only consumed by whichever cell iterates this name.
# NOTE(review): presumably this is a single-pass iterator - re-run this cell
# before iterating fseq_acc_filtered a second time (confirm against the
# Bio.SeqIO.parse documentation).
fseq_acc_filtered = read_fasta('spikeprot0508/spikeprot0508_acc_filtered.fasta')

In [25]:
# select the sequences with accession ids in the list
omic_new_accs = df_omic_new['Accession ID'].tolist()
# Membership is tested once per FASTA record, so look ids up in a set:
# `in` on the list rescans every accession id for each record, which is what
# made this loop slow on a multi-million-record file.
omic_new_acc_set = set(omic_new_accs)
fseq_omic = []

# show progress
from tqdm import tqdm
for record in tqdm(fseq_acc_filtered):
    if record.id in omic_new_acc_set:
        fseq_omic.append(record)


11445408it [06:45, 28204.75it/s]


In [26]:
# number of FASTA records kept by the accession-id selection
print(len(fseq_omic))

1903


In [30]:
# write the spike protein sequences for a specific variant to a fasta file
# (two lines per record: '>' + id, then the whole sequence on a single line)
#
# The encoding is pinned to ISO-8859-1 because get_unique_sequences reads this
# file back with that encoding; relying on the platform default for writing
# could make the round trip lossy or fail on non-ASCII characters.
with open('spikeprot0508/spikeprot0508_acc_filtered_omicron_new.fasta', 'w', encoding="iso-8859-1") as f:
    for record in fseq_omic:
        f.write('>' + record.id + '\n' + str(record.seq) + '\n')

In [31]:
# get unique sequences for a specific variant
def get_unique_sequences(fasta_file):
    """Collect the distinct sequences of a FASTA file.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file. The file is decoded as ISO-8859-1.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(unique_sequences, unique_ids)``: each distinct sequence, in order
        of first appearance, and at the same position the id of the first
        record that carried it.
    """
    from Bio import SeqIO

    # A dict keeps insertion order and gives constant-time "seen before?"
    # checks; scanning a growing list for every record is quadratic.
    first_id_by_sequence = {}
    # `with` closes the input file once parsing is done.
    with open(fasta_file, encoding="iso-8859-1") as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            # setdefault leaves the id of the first occurrence untouched
            first_id_by_sequence.setdefault(str(record.seq), record.id)
    return list(first_id_by_sequence), list(first_id_by_sequence.values())

unique_sequences, unique_ids = get_unique_sequences('spikeprot0508/spikeprot0508_acc_filtered_omicron_new.fasta')

print(len(unique_sequences))

# write the unique sequences to a fasta file
# (ISO-8859-1 is pinned because write_to_csv reads this file back with that
# encoding; the platform default is not guaranteed to round-trip.)
with open('spikeprot0508/spikeprot0508_acc_filtered_omicron_new_unique.fasta', 'w', encoding="iso-8859-1") as f:
    # unique_ids[i] belongs to unique_sequences[i], so walk the two in step
    for seq_id, seq in zip(unique_ids, unique_sequences):
        f.write('>' + seq_id + '\n')
        f.write(seq + '\n')


548


In [1]:
# write to a csv file
import pandas as pd
from Bio import SeqIO

def write_to_csv(fasta_file, csv_file):
    """Convert a FASTA file into a two-column CSV file.

    The output starts with the header ``accession_id,sequence`` followed by
    one ``<record id>,<sequence>`` line per FASTA record. Fields are written
    verbatim, without CSV quoting.

    Parameters
    ----------
    fasta_file : str or path-like
        Input FASTA file, decoded as ISO-8859-1.
    csv_file : str or path-like
        Output path; an existing file is overwritten.
    """
    # Open the input first, so a missing FASTA file no longer leaves a
    # header-only CSV behind, and close both files deterministically (the
    # input handle used to be left open).
    # UTF-8 is pinned for the output because it is read back with
    # pd.read_csv, which decodes UTF-8 by default.
    with open(fasta_file, encoding="iso-8859-1") as fasta_handle, \
            open(csv_file, 'w', encoding="utf-8") as f:
        f.write('accession_id,sequence\n')
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            f.write(record.id + ',' + str(record.seq) + '\n')

# Export the de-duplicated FASTA records as an (accession_id, sequence) table.
unique_fasta_path = 'spikeprot0508/spikeprot0508_acc_filtered_omicron_new_unique.fasta'
unique_csv_path = 'spikeprot0508/unique_Omicron_548.csv'
write_to_csv(unique_fasta_path, unique_csv_path)

# Load the table back from disk and display it.
unique_Omicron_548 = pd.read_csv(unique_csv_path)
unique_Omicron_548


Unnamed: 0,accession_id,sequence
0,EPI_ISL_17793872,MFVFLVLLPLVSSQCVNLITRTQLSPAYTNSFTRGVYYPDKVFRSS...
1,EPI_ISL_17796983,MFVFLVLLPLVSSQCVNFRTGTQLPPAYTNSFTRGVYYPDKVFRSS...
2,EPI_ISL_17803121,MFVFLVLLPLVSSQCVNLITRTQLSPAYTNSFTRGVYYPDKVFRSS...
3,EPI_ISL_17803165,MFVFLVLLPLVSSQCVNLITRTQLSPAYTNSFTRGVYYPDKVFRSS...
4,EPI_ISL_17803190,MFVFLVLLPLVSSQCVNLITRTQSYTNSFTRGVYYPDKVFRSSVLH...
...,...,...
543,EPI_ISL_19084465,MFVFLVLLPLVSSQCVNLITTTQXXXAYTNSFTRGVYYPDKVFRSS...
544,EPI_ISL_19084464,MFVFLVLLPLVSSQCVNLITTTQXXXXYTNSFTRGVYYPDKVFRSS...
545,EPI_ISL_19084596,MFVFLVLLPLVSSQCVNLITRTQLSPAYTNSFTRGVYYPDKVFRSS...
546,EPI_ISL_18229220,MFVFLVLLPLVSSQCVNLITRTQLSPAYTNSFTRGVYYPDKVFRSS...
