### Input data

1. Download 'CoV-AbDab_080224.csv' from this [link](https://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_080224.csv). 
2. Use the 'covid_variants.fasta' file from this directory (originally downloaded from [here](https://viralzone.expasy.org/9556)).

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO

# Load the CoV-AbDab export; the CSV must be in the working directory.
full_df = pd.read_csv('CoV-AbDab_080224.csv')

# Keep only rows flagged as antibodies ('Ab') whose heavy AND light chain
# entries are not the 'ND' placeholder.  The masks are combined on full_df so
# that every filter is applied (previously the light-chain filter restarted
# from full_df and silently discarded the heavy-chain filter).
has_heavy_chain = full_df['VHorVHH'] != 'ND'
has_light_chain = full_df['VL'] != 'ND'
is_antibody = full_df['Ab or Nb'] == 'Ab'
ab_df = full_df[has_heavy_chain & has_light_chain & is_antibody]

# Restrict to the columns used downstream.
ab_df = ab_df[['Neutralising Vs', 'Not Neutralising Vs', 'VHorVHH', 'VL', 'Origin', 'Protein + Epitope', 'Name']]

# Missing cells become empty strings so the later str.split(';') calls work.
ab_df = ab_df.fillna('')

# Renumber rows from 0; the previous row labels are kept in an 'index' column.
ab_df.reset_index(inplace=True)

In [74]:
# 'Origin' values that are retained when filtering the antibody table.
sources_to_keep = [
    'B-cells; SARS-CoV2_WT Convalescent Patient (Unvaccinated)',
    'B-cells; SARS-CoV1 Human Patient; SARS-CoV2 Vaccinee',
    'B-cells; SARS-CoV2_WT Convalescent Patients',
    'B-cells; SARS-CoV2_WT Vaccinee (BBIBP-CoV)',
    'B-cells; SARS-CoV2_WT Vaccinee',
    'B-cells; SARS-CoV2_WT Human Patient',
    'B-cells; Unvaccinated SARS-CoV2_WT Human Patient',
    'B-cells; SARS-CoV2_Gamma Human Patient',
    'B-cells; SARS-CoV1 Human Patient',
    'B-cells (SARS-CoV2_Beta Human Patient)',
]

# 'Protein + Epitope' values that are retained.  Both the "S; RBD" and the
# "S: RBD" spellings are listed on purpose.
binding_to_keep = [
    "S; RBD",
    "S: RBD",
    "S; RBD/NTD",
]

In [75]:
# Keep antibodies whose origin AND epitope annotation are both whitelisted.
origin_is_kept = ab_df['Origin'].isin(sources_to_keep)
epitope_is_kept = ab_df['Protein + Epitope'].isin(binding_to_keep)
source_df = ab_df[origin_is_kept & epitope_is_kept]

In [77]:
# Column accumulators: one entry per (antibody, antigen) pair.
heavy_chains, light_chains, antigens = [], [], []
target, names, origins = [], [], []

# Each antibody row lists its antigens as ';'-separated strings in two
# columns.  Neutralised antigens are labelled 1, non-neutralised ones 0;
# for a given antibody the neutralised entries are emitted first.
labelled_columns = (('Neutralising Vs', 1), ('Not Neutralising Vs', 0))

for _, antibody in source_df.iterrows():
    for column, label in labelled_columns:
        for antigen in antibody[column].split(';'):
            # Splitting an empty cell yields [''], which carries no antigen.
            if antigen == '':
                continue
            names.append(antibody['Name'])
            origins.append(antibody['Origin'])
            heavy_chains.append(antibody['VHorVHH'])
            light_chains.append(antibody['VL'])
            antigens.append(antigen)
            target.append(label)

# Assemble the long-format table of antibody/antigen interactions.
interaction_df = pd.DataFrame(
    {
        'names': names,
        'origins': origins,
        'heavy': heavy_chains,
        'light': light_chains,
        'antigens': antigens,
        'target': target,
    }
)

In [78]:
# Canonical variant group -> every antigen spelling that is folded into it
# (includes "(weak)" annotations and misspellings listed verbatim).
grouping_dict = {
    'SARS-CoV2_WT': [
        'SARS-CoV2_WT',
        'SARS-CoV2_WT (weak)',
        'SARS-CoV2_WT and SARS-CoV1',
        'SARS-CoV2_WT, SARS-CoV2_Delta',
        'SARS-CoV2_WT_Delta (weak)',
        'SARS-CoV2_WT (weak) , SARS-CoV2_Delta (weak)',
    ],
    'SARS-CoV2_Alpha': [
        'SARS-CoV2_Alpha',
        'SARS-CoV2_Alpha (weak)',
    ],
    'SARS-CoV2_Beta': [
        'SARS-CoV2_Beta',
        'SARS-CoV2_Beta (weak)',
        'SRAS-CoV2_Beta',
    ],
    'SARS-CoV2_Delta': [
        'SARS-CoV2_Delta',
        'SARS-CoV2_Delta (weak)',
    ],
    'SARS-CoV2_Epsilon': [
        'SARS-CoV2_Epsilon',
        'SARS-CoV2_Epsilon (weak)',
    ],
    'SARS-CoV2_Gamma': [
        'SARS_CoV2_Gamma',
        'SARS-CoV2_Gamma',
        'SARS-CoV2_Gamma (weak)',
    ],
    'SARS-CoV2_Eta': [
        'SARS-CoV2_Eta',
        'SARS-CoV2_Eta (weak)',
    ],
    'SARS-CoV2_Iota': [
        'SARS-CoV2_Iota',
        'SARS-CoV2_Iota (weak)',
    ],
    'SARS-CoV2_Lambda': [
        'SARS-CoV2_Lambda',
        'SARS-CoV2_Lambda (weak)',
    ],
    'SARS-CoV2_Kappa': [
        'SARS-CoV2_Kappa',
        'SARS-CoV2_Kappa (weak)',
    ],
    'SARS-CoV2_Omicron-BA1': [
        'SARS-CoV2_Omicron-BA1',
        'SARS-CoV2_Omicron-BA1 (weak)',
        'SARS-CoV2_Omicron_BA1',
        'SARS-CoV2_Omicron_BA1.1',
        'SARS-CoV2_Omicron-BA1.1 (weak)',
    ],
    'SARS-CoV2_Omicron-BA2': [
        'SARS-CoV2_Omicron-BA2',
        'SARS-CoV2_Omicron-BA2 (weak)',
        'SARS_COV2_Omicron-BA2',
        'SARS-CoV2_Omicron-BA2.12.1',
        'SARS-CoV2_Omicron-BA2.12.1 (weak)',
        'SARS-CoV2_Omicron-BA2.75',
        'SARS-CoV2_Omicron-BA2.38',
        'SARS-CoV2_Omicron-BA2.38 (weak)',
        'SARS-CoV2_Omicron-BA2.75.1',
        'SARS-CoV2_Omicron-BA2.75.5',
        'SARS-CoV2_Omicron-BA2.75.5 (weak)',
    ],
    'SARS-CoV2_Omicron-BA4': [
        'SARS-CoV2_Omicron-BA4',
        'SARS-CoV2_Omicron-BA4 (weak)',
        'SARS-CoV2_Omicron-BA4/BA',
        'SARS-CoV2_Omicron-BA4.6',
        'SARS-CoV2_Omicron-BA4.6 (weak)',
        'SARS-CoV2_Omicron-BA4.7',
        'SARS-CoV2_Omicron-BA4.7 (weak)',
    ],
    'SARS-CoV2_Omicron-BA5': [
        'SARS-CoV2_Omicron-BA5',
        'SARS-CoV2_Omicron-BA5 (weak)',
        'SARS-CoV2_Omicron-BA5.9',
        'SARS-CoV2_Omicron-BA5.9 (weak)',
    ],
    'SARS-CoV2_Omicron-XBB': [
        'SARS-CoV2_Omicron-XBB',
    ],
}

# Groups whose interactions go to the test split rather than the train split.
test_groups = [
    'SARS-CoV2_Omicron-BA1',
    'SARS-CoV2_Omicron-BA2',
    'SARS-CoV2_Omicron-BA4',
    'SARS-CoV2_Omicron-BA5',
]

# Inverse lookup: raw antigen spelling -> canonical group.  If a spelling were
# listed under two groups, the group appearing later in grouping_dict wins.
reversed_dict = {
    alias: canonical
    for canonical, aliases in grouping_dict.items()
    for alias in aliases
}

In [79]:
def _canonical_group(antigen):
    """Return the canonical group for *antigen*, or ``pd.NA`` when it is not in ``reversed_dict``."""
    return reversed_dict.get(antigen, pd.NA)


# Tag every interaction with its canonical variant group (pd.NA if unmapped).
interaction_df['groups'] = interaction_df['antigens'].apply(_canonical_group)

In [80]:
# Discard every row that has a missing value in any column.
interaction_df = interaction_df.dropna(axis=0, how='any')

In [81]:
# Read every FASTA record from the variants file into memory.
with open("covid_variants.fasta") as fasta_handle:
    records = list(SeqIO.parse(fasta_handle, "fasta"))

In [82]:
# Map each FASTA record id to its sequence as a plain string; a later record
# with a repeated id overwrites the earlier one.
variant_dict = {}
for fasta_record in records:
    variant_dict[fasta_record.id] = str(fasta_record.seq)

In [83]:
def _sequence_for_group(group):
    """Return the sequence stored in ``variant_dict`` for *group*; raises ``KeyError`` if absent."""
    return variant_dict[group]


# Attach the sequence belonging to each interaction's variant group.
interaction_df['covid_seq'] = interaction_df['groups'].apply(_sequence_for_group)

In [84]:
# Keep only the first row for each (antibody name, variant group) pair; later
# rows for the same pair are dropped regardless of their target label.
is_repeated_pair = interaction_df.duplicated(subset=['names', 'groups'], keep='first')
interaction_df = interaction_df[~is_repeated_pair]

In [85]:
# Split by variant group: rows whose group is in test_groups form the test
# set, all remaining rows form the training set.
belongs_to_test = interaction_df['groups'].isin(test_groups)
train_interaction_df = interaction_df[~belongs_to_test]
test_interaction_df = interaction_df[belongs_to_test]

In [86]:
# Write the training split to disk; the DataFrame index is written as well.
train_interaction_df.to_csv('processed_data_train.csv', index=True)

In [87]:
# Write the test split to disk; the DataFrame index is written as well.
test_interaction_df.to_csv('processed_data_test.csv', index=True)