<a href="https://colab.research.google.com/github/rvanasa/deep-antibody/blob/master/antibody_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Workspace setup

# Install the Python packages this notebook needs into the Colab runtime.
!pip install -q biopython pdb-tools
# Fetch the DSSP 2.0.4 binary into /usr/local/bin and mark it executable.
# -nc asks wget not to re-download ("no-clobber") when the target is already there.
!wget -nc ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-2.0.4-linux-amd64 -O /usr/local/bin/dssp && chmod +x /usr/local/bin/dssp

from IPython.display import clear_output, display
# Hide the installer/download chatter printed above.
clear_output()

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import Bio
import Bio.PDB
from Bio.PDB import DSSP

# Number of residues taken on EACH side of a residue when cutting a sequence
# window (slices below are index - contact_buffer : index + contact_buffer + 1).
contact_buffer = 4
# Full window length: the centre residue plus contact_buffer on either side.
contact_window_size = contact_buffer * 2 + 1

# Three-letter residue names. The first 20 entries line up position-by-position
# with `oneletters`; the trailing '???' entry has no one-letter counterpart.
amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', '???']
oneletters = 'ARNDCEQGHILKMFPSTWYV'
# Presumably the DSSP secondary-structure codes; not referenced by any cell
# visible in this notebook -- TODO confirm whether it is still needed.
ssletters = 'HBEGITS'

# Shared PDB parser; QUIET=True is passed to Bio.PDB.PDBParser.
parser = Bio.PDB.PDBParser(QUIET=True)

def parse(ident, cache_dir=None):
  """Load a PDB structure with the shared `parser`, downloading it first if needed.

  Args:
    ident: Either a path ending in '.pdb' (parsed as given), or a bare
      identifier that is resolved to '<cache_dir><ident>.pdb'.
    cache_dir: Optional directory holding cached '.pdb' files. A trailing '/'
      is normalised; a falsy value means "no prefix" (current directory).

  Returns:
    Whatever `parser.get_structure(ident, filename)` returns.
  """
  # Normalise to '' or 'some/dir/' so it can be used as a plain string prefix.
  cache_dir = cache_dir.rstrip('/') + '/' if cache_dir else ''
  filename = ident if ident.endswith('.pdb') else f'{cache_dir}{ident}.pdb'
  # Only identifiers without any '.' are ever downloaded; explicit file paths
  # are expected to exist already.
  if '.' not in ident and not os.path.exists(filename):
    # IPython shell escapes: {ident} and {cache_dir} are interpolated from the
    # local variables above, so their names must not change.
    !wget -nc https://files.rcsb.org/download/{ident}.pdb
    # NOTE(review): when cache_dir is '' this expands to `mv <ident>.pdb` with
    # no destination, which `mv` rejects; the download stays in the working
    # directory, which is also where `filename` points in that case.
    !mv {ident}.pdb {cache_dir}
  return parser.get_structure(ident, filename)


def run_dssp(filename, cache_dir=None):
  """Run DSSP on every model of a structure and return one row per residue.

  Args:
    filename: Passed to `parse` to load the structure and to `DSSP` as the
      input file.
    cache_dir: Forwarded unchanged to `parse`.

  Returns:
    A DataFrame (per-model frames concatenated) with a leading 'Model' column,
    the chain 'Key', the residue 'Index' within the trimmed chain, and the
    DSSP properties listed in `column_names`.
  """
  column_names = [
      'Key', 'Index', 'Residue',
      'SS', 'ASA', 'Phi', 'Psi',
      'NH->O_1_relidx', 'NH->O_1_energy',
      'O->NH_1_relidx', 'O->NH_1_energy',
      'NH->O_2_relidx', 'NH->O_2_energy',
      'O->NH_2_relidx', 'O->NH_2_energy']

  loaded = parse(filename, cache_dir)
  per_model_frames = []
  for model in loaded.get_models():
    # Map (chain id, residue id) to the residue's position inside the trimmed
    # chain, i.e. the same indexing the sequence-based cells use.
    positions = {}
    for chain in model.get_chains():
      for position, residue in enumerate(trim_residues(chain.get_residues())):
        positions[chain.id, residue.id] = position

    properties = DSSP(model, filename).property_dict
    records = []
    for (chain_id, residue_id), values in properties.items():
      # values[0] is dropped; the remaining entries fill 'Residue' onwards.
      records.append((chain_id, positions[chain_id, residue_id], *values[1:]))

    frame = pd.DataFrame(records, columns=column_names)
    frame.insert(0, 'Model', model.id)
    per_model_frames.append(frame)
  return pd.concat(per_model_frames)


def create_seq(rs):
  """Convert three-letter residue names into a one-letter sequence string.

  Args:
    rs: Iterable of residue names (e.g. 'ALA', 'GLY').

  Returns:
    A string with one character per input name. Names without a one-letter
    code become 'X'.
  """
  # zip() stops at the shorter input, so only the 20 names that actually have a
  # code are mapped. The trailing '???' entry of `amino_acids` therefore falls
  # through to 'X' instead of indexing past the end of `oneletters`.
  codes = dict(zip(amino_acids, oneletters))
  return ''.join(codes.get(r, 'X') for r in rs)


def trim_residues(rs):
  """Drop waters, then strip leading/trailing residues not in `amino_acids`.

  Args:
    rs: Iterable of objects exposing a `resname` attribute.

  Returns:
    A new list. Residues named 'HOH' are removed everywhere; other residues
    whose name is not in `amino_acids` are removed only from the two ends, so
    interior ones keep their position.
  """
  kept = [r for r in rs if r.resname != 'HOH']
  # Move two cursors inwards instead of re-slicing the list on every step,
  # which copied the whole list once per trimmed residue.
  first, last = 0, len(kept)
  while first < last and kept[first].resname not in amino_acids:
    first += 1
  while last > first and kept[last - 1].resname not in amino_acids:
    last -= 1
  return kept[first:last]


def cmd(command):
  """Run a shell command, or every command in a (possibly nested) iterable.

  Args:
    command: A command string, or an iterable whose items are themselves
      valid `cmd` arguments.

  Raises:
    Exception: If `os.system` reports a non-zero status for a command.
  """
  if isinstance(command, str):
    status = os.system(command)
    if status:
      raise Exception(f'Non-zero exit code in command: $ {command}')
    return

  # Not a string: treat it as a collection and run its items in order.
  for item in command:
    cmd(item)

In [None]:
#@title SAbDab download

# Download the SAbDab summary table and the structure archive, then unpack it.
# -nc (wget) and -n (unzip) both avoid overwriting files that already exist.
!wget -nc -O sab_summary_all.tsv http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/summary/all/
!wget -nc -O sab_all.zip http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/archive/all/
!unzip -q -n sab_all.zip
# Remove the macOS metadata folder if the archive contained one.
!rm -rf __MACOSX
clear_output()

# Directory used as `cache_dir` for parse() by the cells below.
pdb_dirname = './all_structures/raw'
# Sorted directory listing; not referenced by any cell visible in this notebook.
pdb_files = sorted(os.listdir(pdb_dirname))

# Summary table; later cells read its pdb, model, Hchain, Lchain, antigen_chain
# and resolution columns.
dfm = pd.read_csv('sab_summary_all.tsv', sep='\t')
# File name in the same '<pdb>.pdb' form used for the 'File' columns downstream.
dfm['file'] = dfm.pdb + '.pdb'

In [None]:
#@title Chain parsing

# One record per (file, model, chain): the chain's residue names with waters
# removed, later also rendered as a compact one-letter string.
data = []

all_pdb_names = dfm.pdb.sort_values().unique()

for position, pdb_name in enumerate(all_pdb_names):
  # Lightweight progress indicator: refresh the output every 500 structures.
  if position % 500 == 0:
    clear_output()
    print(pdb_name)

  pdb_file = f'{pdb_name}.pdb'
  for model in parse(pdb_name, pdb_dirname).get_models():
    data.extend([
        {
            'File': pdb_file,
            'Model': model.id,
            'Key': chain.id,
            'Sequence': ','.join(r.resname for r in chain if r.resname != 'HOH'),
        }
        for chain in list(model.get_chains())
    ])

clear_output()
df = pd.DataFrame(data).sort_values(['File', 'Model', 'Key'])
df['Compact'] = [create_seq(sequence.split(',')) for sequence in df.Sequence]

# Same key columns in both files; only the sequence representation differs.
for seq_column, target in (('Sequence', 'docked_seqs.csv'), ('Compact', 'docked_oneletters.csv')):
  df[['File', 'Model', 'Key', seq_column]].to_csv(target, index=False)
df

Unnamed: 0,File,Model,Key,Sequence,Compact
1,12e8.pdb,0,H,"GLU,VAL,GLN,LEU,GLN,GLN,SER,GLY,ALA,GLU,VAL,VA...",EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...
0,12e8.pdb,0,L,"ASP,ILE,VAL,MET,THR,GLN,SER,GLN,LYS,PHE,MET,SE...",DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...
2,12e8.pdb,0,M,"ASP,ILE,VAL,MET,THR,GLN,SER,GLN,LYS,PHE,MET,SE...",DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...
3,12e8.pdb,0,P,"GLU,VAL,GLN,LEU,GLN,GLN,SER,GLY,ALA,GLU,VAL,VA...",EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...
5,15c8.pdb,0,H,"GLU,VAL,GLN,LEU,GLN,GLN,SER,GLY,ALA,GLU,LEU,VA...",EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...
...,...,...,...,...,...
20039,7fab.pdb,0,L,"ALA,SER,VAL,LEU,THR,GLN,PRO,PRO,SER,VAL,SER,GL...",ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...
20041,8fab.pdb,0,A,"GLU,LEU,THR,GLN,PRO,PRO,SER,VAL,SER,VAL,SER,PR...",ELTQPPSVSVSPGQTARITCSANALPNQYAYWYQQKPGRAPVMVIY...
20042,8fab.pdb,0,B,"ALA,VAL,LYS,LEU,VAL,GLN,ALA,GLY,GLY,GLY,VAL,VA...",AVKLVQAGGGVVQPGRSLRLSCIASGFTFSNYGMHWVRQAPGKGLE...
20043,8fab.pdb,0,C,"GLU,LEU,THR,GLN,PRO,PRO,SER,VAL,SER,VAL,SER,PR...",ELTQPPSVSVSPGQTARITCSANALPNQYAYWYQQKPGRAPVMVIY...


In [None]:
#@title Chain preprocessing

# Pair every antibody (heavy/light chain) in the SAbDab summary with each of its
# antigen chains and attach the one-letter sequences parsed earlier.

dfc = pd.read_csv('docked_oneletters.csv')
# Drop incomplete rows BEFORE the string filter: `.str.contains` yields NaN for
# missing values and `~` cannot negate those.
dfc = dfc.dropna().assign(Compact=lambda d: d.Compact.str.strip('X'))
# After trimming the ends, any remaining 'X' is an interior unknown residue.
dfc = dfc[~dfc.Compact.str.contains('X')]

data = []
for i, row in dfm.iterrows():
  if pd.isnull(row.antigen_chain):
    continue

  # Sequences of every usable chain in this file/model, indexed by chain id.
  dfc_rel = dfc[(dfc.File == row.file) & (dfc.Model == row.model)].set_index('Key')

  # Antigen chains are '|'-separated in the summary; blank entries become '-'.
  achains = [ak.strip() or '-' for ak in row.antigen_chain.split('|')]

  lk = row.Lchain if pd.notnull(row.Lchain) else '-'
  hk = row.Hchain if pd.notnull(row.Hchain) else '-'

  # Skip entries whose heavy and light keys coincide; this also covers the
  # case where both are missing ('-' == '-').
  if hk == lk:
    continue

  L = dfc_rel.loc[lk].Compact if lk in dfc_rel.index else None
  H = dfc_rel.loc[hk].Compact if hk in dfc_rel.index else None

  for ak in achains:
    # An "antigen" that is really one of the antibody chains counts as absent.
    if hk == ak or lk == ak:
      ak = '-'

    A = dfc_rel.loc[ak].Compact if ak in dfc_rel.index else None

    data.append(dict(
        File=row.file,
        Model=row.model,
        LKey=lk,
        HKey=hk,
        AKey=ak,
        Light=L or '-',
        Heavy=H or '-',
        Antigen=A or '-',
        Resolution=row.resolution,
    ))

clear_output()
df = pd.DataFrame(data)
# Non-numeric resolution entries become NaN and are tolerated below.
df.Resolution = pd.to_numeric(df.Resolution, 'coerce')
# An ordered list (not a set) of every column except Resolution.
required_columns = [column for column in df.columns if column != 'Resolution']
df = df.dropna(subset=required_columns)
df = df.sort_values('Resolution')
df = df.sort_values(['File', 'Model'])
df.to_csv('docked_preprocessed.csv', index=False)
# drop_duplicates keeps the first row of each (Light, Heavy, Antigen) triple in
# the order established by the sorts above.
df = df.drop_duplicates(['Light', 'Heavy', 'Antigen'])
df.to_csv('docked_preprocessed_unique.csv', index=False)
df

Unnamed: 0,File,Model,LKey,HKey,AKey,Light,Heavy,Antigen,Resolution
5416,1a14.pdb,0,L,H,N,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,2.50
1565,1a2y.pdb,0,A,B,C,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,KVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRN...,1.50
2585,1a3l.pdb,0,L,H,-,DIVLTQAAFSNPVTLGASASISCRSSKSLLNSNGIIHMYWYLQKPG...,EVQLEESGPELVRPGTSVKISCKASGYTFTNYWLGWVKQRPGHGFE...,-,1.95
3818,1a3r.pdb,0,L,H,P,DIVMTQSPSSLTVTTGEKVTMTCKSSQSLLNSRTQKNYLTWYQQKP...,VQLQQSGAELVRPGASVKLSCTTSGFNIKDIYIHWVKQRPEQGLEW...,VKAETRLNPDLQPTE,2.10
5363,1a4k.pdb,0,A,B,-,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,-,2.40
...,...,...,...,...,...,...,...,...,...
3773,7bu8.pdb,0,L,H,A,DIQLTQSPSSLSASVGDRVTFTCQASQDIRKYLNWYQQKPGKAPKL...,EVQLVQSGPDVEKPGASVKVSCKASGYTFTSNYIHWVRQAPGQGLE...,IRCIGVSNRDFVEGMSGGTWVDVVLEHGGCVTVMAQDKPTVDIELV...,3.80
3298,7bue.pdb,0,I,G,B,DIQLTQSPSSLSASVGDRVTFTCQASQDIRKYLNWYQQKPGKAPKL...,EVQLVQSGPDVEKPGASVKVSCKASGYTFTSNYIHWVRQAPGQGLE...,MRCIGISNRDFVEGVSGGSWVDIVLEHGSCVTTMAKNKPTLDFELI...,7.80
5337,7buf.pdb,0,L,H,A,DIQLTQSPSSLSASVGDRVTFTCQASQDIRKYLNWYQQKPGKAPKL...,EVQLVQSGPDVEKPGASVKVSCKASGYTFTSNYIHWVRQAPGQGLE...,MRCIGISNRDFVEGVSGGSWVDIVLEHGSCVTTMAKNKPTLDFELI...,6.10
2469,7bz5.pdb,0,L,H,A,DDIVMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPK...,DEVQLVESGGGLVQPGGSLRLSCAASGFIVSSNYMSWVRQAPGKGL...,NLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKC...,1.84


In [None]:
#@title Contact point calculation

import numpy.linalg as lin

# For every (heavy, light, antigen) row, record each antibody/antigen residue
# pair whose closest atoms lie within `contact_dist_threshold`.

df = pd.read_csv('docked_preprocessed.csv')

# Slack added (twice, once per residue) to the centroid pre-filter so that no
# pair with close atoms is discarded -- presumably an upper bound on how far a
# residue's atoms sit from its centroid.
residue_contact_margin = 5.5 # Empirical

# Maximum atom-to-atom distance for two residues to count as in contact.
contact_dist_threshold = 5

data = []
for file, dff in df.groupby('File'):

  structure = parse(file.replace('.pdb', ''), pdb_dirname)
  models = {model.id: model for model in structure.get_models()}

  for i, row in dff.iterrows():
    hk, lk, ak = row.HKey, row.LKey, row.AKey

    model = models[row.Model]
    chain_map = {chain.id: chain for chain in model.get_chains()}

    print('>>', file, model.id, hk, lk, ak, list(chain_map))

    # All three keys must differ pairwise. A chained `hk != lk != ak` would
    # never compare hk with ak. A missing antigen ('-') is allowed as long as
    # the two antibody keys differ.
    assert len({hk, lk, ak}) == 3 or (hk != lk and ak == '-')

    H = chain_map.get(hk)
    L = chain_map.get(lk)
    A = chain_map.get(ak)

    ca = A
    if ca is None:
      continue

    a_res = trim_residues(ca)
    if not a_res:
      continue

    # Per-residue atom coordinates and per-residue centroids.
    a_coords = [np.array([a.coord for a in r]) for r in a_res]
    a_centers = np.array([a.mean(axis=0) for a in a_coords])

    ct = 0
    for btype, cb in (('H', H), ('L', L)):
      if cb is None:
        continue

      b_res = trim_residues(cb)
      if not b_res:
        # Nothing left after trimming; an empty centroid array would also
        # break the broadcast below.
        continue

      b_coords = [np.array([a.coord for a in r]) for r in b_res]
      b_centers = np.array([a.mean(axis=0) for a in b_coords])

      # Cheap pre-filter on centroid distance, then the exact minimum
      # atom-to-atom distance only for the surviving residue pairs.
      norms = lin.norm(a_centers[:, None] - b_centers, axis=2)
      locs = np.argwhere(norms <= contact_dist_threshold + residue_contact_margin * 2)
      for an, bn in locs:
        min_dist = np.min(lin.norm(a_coords[an][:, None] - b_coords[bn], axis=2))
        if min_dist <= contact_dist_threshold:
          data.append({
              'File': file,
              'Model': model.id,
              'BType': btype,
              'BKey': cb.id,
              'BIndex': bn,
              'BResidue': b_res[bn].resname,
              'AKey': ca.id,
              'AIndex': an,
              'AResidue': a_res[an].resname,
              'Distance': min_dist,
          })
          ct += 1

      print(model.id, ct)

clear_output()
dfx = pd.DataFrame(data)
dfx.round(4).to_csv('chain_contacts.csv', index=False)
dfx

Unnamed: 0,File,Model,BType,BKey,BIndex,BResidue,AKey,AIndex,AResidue,Distance
0,1a14.pdb,0,H,H,104,TYR,N,248,ASN,3.398412
1,1a14.pdb,0,H,H,105,ASP,N,248,ASN,3.977821
2,1a14.pdb,0,H,H,102,TYR,N,284,ILE,3.440603
3,1a14.pdb,0,H,H,56,ASP,N,285,SER,4.831976
4,1a14.pdb,0,H,H,104,TYR,N,285,SER,4.436266
...,...,...,...,...,...,...,...,...,...,...
253783,7c01.pdb,0,L,L,91,TYR,A,170,VAL,4.942960
253784,7c01.pdb,0,L,L,91,TYR,A,171,GLY,4.022808
253785,7c01.pdb,0,L,L,29,SER,A,172,TYR,3.443813
253786,7c01.pdb,0,L,L,31,TYR,A,172,TYR,2.392346


In [None]:
#@title Contact point preprocessing

# Attach the chain sequences to every contact and cut a sequence window around
# the contacting residue on both the antibody and the antigen side.

dfp = pd.read_csv('docked_preprocessed.csv')
dfx = pd.read_csv('chain_contacts.csv')

df = dfx.merge(dfp, on=['File', 'Model', 'AKey'])

# The merge key does not include the antibody chain, so a contact is joined to
# EVERY antibody pair that shares its antigen chain. Keep only the rows whose
# pair really contains the contacting chain; otherwise BWindow would be cut
# from another antibody's sequence. Compared as strings so that a chain id
# parsed as a number still matches.
pair_key = df.HKey.where(df.BType == 'H', df.LKey)
df = df[df.BKey.astype(str) == pair_key.astype(str)].copy()

# Windows that would start before position 0 never come out at full length
# (the negative start wraps around), and the refinement step keeps only
# full-length windows.
df['BWindow'] = df.apply(
    lambda x: (x.Heavy if x.BType == 'H' else x.Light)[x.BIndex - contact_buffer:x.BIndex + contact_buffer + 1],
    axis=1)
df['AWindow'] = df.apply(
    lambda x: x.Antigen[x.AIndex - contact_buffer:x.AIndex + contact_buffer + 1],
    axis=1)

# The full sequences are no longer needed once the windows are extracted.
dfs = df.drop(columns=['Antigen', 'Light', 'Heavy'])
dfs = dfs.dropna()
dfs.to_csv('contacts_preprocessed.csv', index=False)
dfs

Unnamed: 0,File,Model,BType,BKey,BIndex,BResidue,AKey,AIndex,AResidue,Distance,LKey,HKey,Resolution,BWindow,AWindow
0,1a14.pdb,0,H,H,104,TYR,N,248,ASN,3.3984,L,H,2.50,GSYRYDGGF,NPRPNDPTV
1,1a14.pdb,0,H,H,105,ASP,N,248,ASN,3.9778,L,H,2.50,SYRYDGGFD,NPRPNDPTV
2,1a14.pdb,0,H,H,102,TYR,N,284,ILE,3.4406,L,H,2.50,SGGSYRYDG,LGRTISIAS
3,1a14.pdb,0,H,H,56,ASP,N,285,SER,4.8320,L,H,2.50,PGNGDTSYN,GRTISIASR
4,1a14.pdb,0,H,H,104,TYR,N,285,SER,4.4363,L,H,2.50,GSYRYDGGF,GRTISIASR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281839,7c01.pdb,0,L,L,91,TYR,A,170,VAL,4.9430,L,H,2.88,CQQSYSTPP,PTNGVGYQP
281840,7c01.pdb,0,L,L,91,TYR,A,171,GLY,4.0228,L,H,2.88,CQQSYSTPP,TNGVGYQPY
281841,7c01.pdb,0,L,L,29,SER,A,172,TYR,3.4438,L,H,2.88,SQSISRYLN,NGVGYQPYR
281842,7c01.pdb,0,L,L,31,TYR,A,172,TYR,2.3923,L,H,2.88,SISRYLNWY,NGVGYQPYR


In [None]:
#@title Contact point refinement

# Keep only contacts whose two windows are full length and free of unknown
# ('X') residues.
df = pd.read_csv('contacts_preprocessed.csv')

b_is_full = df.BWindow.str.len() == contact_window_size
a_is_full = df.AWindow.str.len() == contact_window_size
df = df[b_is_full & a_is_full]

has_unknown = df.BWindow.str.contains('X') | df.AWindow.str.contains('X')
df = df[~has_unknown]

df.to_csv('contacts_filtered.csv', index=False)
df

Unnamed: 0,File,Model,BType,BKey,BIndex,BResidue,AKey,AIndex,AResidue,Distance,LKey,HKey,Resolution,BWindow,AWindow
0,1a14.pdb,0,H,H,104,TYR,N,248,ASN,3.3984,L,H,2.50,GSYRYDGGF,NPRPNDPTV
1,1a14.pdb,0,H,H,105,ASP,N,248,ASN,3.9778,L,H,2.50,SYRYDGGFD,NPRPNDPTV
2,1a14.pdb,0,H,H,102,TYR,N,284,ILE,3.4406,L,H,2.50,SGGSYRYDG,LGRTISIAS
3,1a14.pdb,0,H,H,56,ASP,N,285,SER,4.8320,L,H,2.50,PGNGDTSYN,GRTISIASR
4,1a14.pdb,0,H,H,104,TYR,N,285,SER,4.4363,L,H,2.50,GSYRYDGGF,GRTISIASR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274663,7c01.pdb,0,L,L,91,TYR,A,170,VAL,4.9430,L,H,2.88,CQQSYSTPP,PTNGVGYQP
274664,7c01.pdb,0,L,L,91,TYR,A,171,GLY,4.0228,L,H,2.88,CQQSYSTPP,TNGVGYQPY
274665,7c01.pdb,0,L,L,29,SER,A,172,TYR,3.4438,L,H,2.88,SQSISRYLN,NGVGYQPYR
274666,7c01.pdb,0,L,L,31,TYR,A,172,TYR,2.3923,L,H,2.88,SISRYLNWY,NGVGYQPYR


In [None]:
#@title Secondary structure calculation

# Run DSSP on every structure that survived preprocessing and stack the
# per-residue results into one table.

df = pd.read_csv('docked_preprocessed.csv')

dssp_frames = []   # one frame per successfully processed file
failed_files = []  # (file, error) pairs, reported after the loop
files = sorted(set(df.File))
for i, file in enumerate(files):
  if (i + 1) % 1000 == 0:
    clear_output()
  print(file)
  try:
    dfs = run_dssp(f'{pdb_dirname}/{file}', pdb_dirname)
  except Exception as e:
    # Best effort: one bad structure must not abort the whole run, but it is
    # remembered so the failure is still visible once the log is cleared.
    print(e)
    failed_files.append((file, e))
    continue
  dfs.insert(0, 'File', file)
  dssp_frames.append(dfs)

clear_output()
if failed_files:
  print(f'DSSP failed for {len(failed_files)} of {len(files)} files')

# A single concat replaces DataFrame.append (removed in pandas 2.0), which
# also copied the accumulated frame on every iteration. Row labels are kept
# as they were, exactly like repeated append did.
p_dfs = pd.concat(dssp_frames)
p_dfs.to_csv('dssp_residues.csv', index=False)
p_dfs

Unnamed: 0,File,Model,Key,Index,Residue,SS,ASA,Phi,Psi,NH->O_1_relidx,NH->O_1_energy,O->NH_1_relidx,O->NH_1_energy,NH->O_2_relidx,NH->O_2_energy,O->NH_2_relidx,O->NH_2_energy
0,1a14.pdb,0,N,0,R,-,0.677419,360.0,-133.1,0,0.0,2,-0.4,0,0.0,106,-0.2
1,1a14.pdb,0,N,1,D,-,0.754601,-137.6,137.7,104,-0.3,153,-0.1,105,-0.0,152,-0.1
2,1a14.pdb,0,N,2,F,-,0.137056,-36.5,126.6,-2,-0.4,2,-0.3,1,-0.1,103,-0.3
3,1a14.pdb,0,N,3,N,-,0.191083,-83.1,142.4,101,-2.0,150,-1.6,149,-0.0,2,-0.5
4,1a14.pdb,0,N,4,N,-,0.324841,-120.8,123.9,-2,-0.3,2,-1.7,148,-0.2,3,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1251,7c01.pdb,0,D,210,F,E,0.035533,-144.4,167.0,-17,-2.0,-17,-2.1,-2,-0.3,2,-0.5
1252,7c01.pdb,0,D,211,N,E,0.503185,-118.1,128.3,-2,-0.3,3,-2.1,-19,-0.2,-19,-0.2
1253,7c01.pdb,0,D,212,R,T,0.205645,-61.1,125.3,-21,-2.5,-20,-0.0,-2,-0.5,-24,-0.0
1254,7c01.pdb,0,D,213,G,T,0.857143,93.8,6.5,-2,-0.1,-1,-0.3,0,0.0,-22,-0.0


In [None]:
#@title Primary / secondary structure coalescence

# Combine each chain's one-letter sequence with its per-residue DSSP values,
# stored as '|'-joined strings aligned with the sequence positions.

dfs = pd.read_csv('dssp_residues.csv')

dfd = pd.read_csv('docked_oneletters.csv')
dfd['Compact'] = dfd.Compact.str.strip('X')
has_unknown = dfd.Compact.str.contains('X')
dfd = dfd[~has_unknown]


def _rounded_text(value):
  # Numeric DSSP values are written with at most six decimals.
  return str(round(value, 6))


nskipped = 0
data = []
for (file, mid, key), group in dfs.groupby(['File', 'Model', 'Key']):
  by_position = group.set_index('Index')

  is_same_chain = (dfd.File == file) & (dfd.Model == mid) & (dfd.Key == key)
  candidates = dfd[is_same_chain]
  if len(candidates) != 1:
    print(f'{len(candidates)} candidates for {key} in ({file} {mid})')
    nskipped += 1
    continue

  seq = candidates.iloc[0].Compact.strip('X')
  positions = range(len(seq))

  def _joined(column, render, missing):
    # One entry per sequence position; positions DSSP did not report get the
    # `missing` placeholder.
    return '|'.join(
        render(by_position.loc[p, column]) if p in by_position.index else missing
        for p in positions)

  data.append({
      'File': file,
      'Model': mid,
      'Key': key,
      'Compact': seq,
      'SS': _joined('SS', lambda value: value, '-'),
      'ASA': _joined('ASA', _rounded_text, ''),
      'Phi': _joined('Phi', _rounded_text, ''),
      'Psi': _joined('Psi', _rounded_text, ''),
  })

clear_output()
print('Skipped:', nskipped)
df = pd.DataFrame(data)
df.to_csv('docked_secondary.csv', index=False)
df

Skipped: 17


Unnamed: 0,File,Model,Key,Compact,SS,ASA,Phi,Psi
0,1a14.pdb,0,H,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,-|-|E|E|E|E|-|-|-|E|E|-|-|T|T|-|-|-|E|E|E|E|E|...,0.853535|0.15493|0.419192|0.067073|0.40404|0.1...,360.0|-85.3|-148.2|-125.6|-79.4|-83.4|-85.2|-9...,179.7|146.0|144.9|88.3|99.0|173.8|-173.8|169.8...
1,1a14.pdb,0,L,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,-|-|-|E|E|E|E|-|S|B|-|-|-|-|T|T|-|-|E|E|E|E|E|...,0.95092|0.023669|0.654639|0.042683|0.352113|0....,360.0|-81.9|-82.2|-104.5|-101.4|-76.1|-64.4|75...,-38.4|105.2|133.6|116.1|92.6|91.7|145.6|87.3|-...
2,1a14.pdb,0,N,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,-|-|-|-|-|-|-|S|-|B|-|-|-|-|E|E|E|E|E|E|E|-|-|...,0.677419|0.754601|0.137056|0.191083|0.324841|0...,360.0|-137.6|-36.5|-83.1|-120.8|-65.9|-103.0|-...,-133.1|137.7|126.6|142.4|123.9|88.3|-16.5|159....
3,1a2y.pdb,0,A,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,-|-|-|E|E|E|E|-|S|E|E|E|E|-|T|T|-|-|E|E|E|E|E|...,0.785276|0.201183|0.598592|0.079268|0.697183|0...,360.0|-84.7|-99.2|-111.0|-109.2|-103.7|-148.6|...,159.1|116.6|146.1|129.4|110.0|134.5|148.3|169....
4,1a2y.pdb,0,B,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,-|-|E|E|E|E|E|-|-|S|E|E|-|T|T|S|-|E|E|E|E|E|E|...,0.873737|0.204225|0.479798|0.079268|0.606061|0...,360.0|-76.6|-136.9|-138.1|-142.3|-90.6|-139.8|...,120.9|124.8|147.3|136.7|126.7|149.2|125.1|-179...
...,...,...,...,...,...,...,...,...
13600,7c01.pdb,0,B,TNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFK...,-|-|-|-|-|H|H|H|H|H|T|-|S|S|-|-|B|G|G|G|-|E|E|...,1.0|0.796178|0.682927|0.007407|0.375|0.086294|...,360.0|-63.1|-101.2|-65.6|-78.7|-74.9|-63.2|-70...,74.6|123.9|139.4|141.7|67.4|-25.5|-24.3|-32.6|...
13601,7c01.pdb,0,C,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,-|-|E|E|E|E|E|-|-|-|E|E|-|T|T|-|-|E|E|E|E|E|E|...,0.865979|0.15493|0.525253|0.006098|0.598592|0....,360.0|-127.8|-150.6|-114.5|-119.0|-114.8|-151....,159.3|138.6|136.4|121.1|128.4|151.0|-179.2|-13...
13602,7c01.pdb,0,D,DIVMTQSPSSLSASVGDRVTITCRASQSISRYLNWYQQKPGKAPKL...,-|-|-|E|E|E|E|-|S|E|E|E|E|E|T|T|-|-|E|E|E|E|E|...,0.889571|0.076923|0.591549|0.047872|0.478873|0...,360.0|-81.2|-122.0|-111.0|-117.1|-98.4|-143.4|...,131.2|141.0|134.7|124.7|119.5|143.4|136.7|172....
13603,7c01.pdb,0,H,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,-|-|E|E|E|E|E|-|-|-|E|E|-|T|T|-|-|E|E|E|E|E|E|...,0.804124|0.126761|0.520202|0.036585|0.485915|0...,360.0|-123.4|-142.7|-127.0|-118.5|-112.8|-148....,170.8|138.5|136.9|118.4|129.5|150.9|-177.4|-11...


In [None]:
#@title ANARCI installation

# Install the hmmer system package with apt.
!apt install hmmer
# Unpack the ANARCI sources. No cell in this notebook downloads the archive,
# so anarci-1.3.tar.gz must already be in the working directory.
!tar xkf anarci-1.3.tar.gz
# Install with python2, the interpreter the numbering cell runs under;
# cmd() raises if the install exits non-zero.
cmd('cd anarci-1.3/ && python2 setup.py install')
clear_output()

In [None]:
#@title Sequence numbering

%%python2

from IPython.display import clear_output, display
import json
import pandas as pd

import anarci as an

# Number every antibody chain with ANARCI and save the per-residue numbering,
# the alignment details and the hit tables as CSV files.

# Settings shared by every run_anarci call below.
options = dict(
    scheme='martin',
    assign_germline=True,
    allowed_species='human')

# Chain type -> value passed as run_anarci's `allow` on the first attempt.
type_map = dict(H='heavy', L='light')

df = pd.read_csv('docked_preprocessed.csv')

adfr = []  # per-residue numbering frames
adfa = []  # alignment frames
adft = []  # hit-table frames

# docked_preprocessed.csv has one row per antigen chain, so the same antibody
# chain appears once for every antigen it is paired with. Number each chain
# only once; otherwise its residues are written several times and the later
# per-chain grouping sees a concatenation of identical copies.
numbered_chains = set()

for i, row in df.iterrows():
  filename = row.File
  mid = row.Model
  for key, seq, btype in [(row.HKey, row.Heavy, 'H'), (row.LKey, row.Light, 'L')]:
    chain_id = (filename, mid, key, btype)
    if chain_id in numbered_chains:
      continue
    numbered_chains.add(chain_id)

    seqs = [(key, seq)]

    # Try the expected chain type first, then 'ig', then run_anarci's default
    # `allow`, stopping as soon as the sequence gets a numbering.
    results = list(zip(*an.run_anarci(seqs, allow=type_map[btype], **options)))
    if any(not ns for _, ns, _, _ in results):
      results = list(zip(*an.run_anarci(seqs, allow='ig', **options)))
    if any(not ns for _, ns, _, _ in results):
      results = list(zip(*an.run_anarci(seqs, **options)))

    for (key, seq), numbers, alignments, hit_tables in results:

      if numbers:
        # One row per numbered position; a blank insertion code becomes '-'.
        dfr = pd.DataFrame(dict(
            Res=res,
            Num=num,
            Sub=sub.strip() or '-',
        ) for i, (residues, start, end) in enumerate(numbers) for (num, sub), res in residues)
        dfr.insert(0, 'File', filename)
        dfr.insert(1, 'Model', mid)
        dfr.insert(2, 'Key', key)
        dfr.insert(3, 'Type', btype)
        adfr.append(dfr)

      if alignments:
        dfa = pd.DataFrame(alignments)
        dfa.insert(0, 'File', filename)
        dfa.insert(1, 'Model', mid)
        adfa.append(dfa)

      if hit_tables:
        # The first entry of the hit table is used as the header row.
        dft = pd.DataFrame(hit_tables[1:], columns=hit_tables[0])
        dft.insert(0, 'File', filename)
        dft.insert(1, 'Model', mid)
        dft.insert(2, 'Key', key)
        dft.insert(3, 'Type', btype)
        adft.append(dft)

pd.concat(adfr).to_csv('anarci_residues.csv', index=False)
pd.concat(adfa).to_csv('anarci_alignments.csv', index=False)
pd.concat(adft).to_csv('anarci_hit_tables.csv', index=False)
clear_output()
print('Completed.')

In [None]:
#@title CDR extraction

def cdr_range(df, btype, start, end):
  """Build a row mask for one chain type and an inclusive position range.

  Args:
    df: Frame with 'Type' and 'Num' columns.
    btype: Chain type to match against 'Type'.
    start: Lowest 'Num' included.
    end: Highest 'Num' included.

  Returns:
    Boolean Series, True where Type equals `btype` and start <= Num <= end.
  """
  is_type = df.Type.eq(btype)
  not_before_start = df.Num.ge(start)
  not_after_end = df.Num.le(end)
  return is_type & not_before_start & not_after_end

# Label every numbered residue with the CDR it belongs to ('-' for none).

df = pd.read_csv('anarci_residues.csv')
df['CDR'] = '-'

# Inclusive (first, last) position of each CDR in the numbering produced by the
# previous cell. The label's first letter is the chain type it applies to.
# NOTE(review): these bounds look like the "Contact" CDR definition -- confirm.
cdr_bounds = {
    'L1': (30, 36),
    'L2': (46, 55),
    'L3': (89, 96),
    'H1': (30, 35),
    'H2': (47, 58),
    'H3': (93, 101),
}

for label, (first, last) in cdr_bounds.items():
  df.loc[cdr_range(df, label[0], first, last), 'CDR'] = label

df.to_csv('cdr_martin_residues.csv', index=False)
df

Unnamed: 0,File,Key,Model,Num,Res,Sub,Type,CDR
0,1a14.pdb,H,0,1.0,Q,-,H,-
1,1a14.pdb,H,0,2.0,V,-,H,-
2,1a14.pdb,H,0,3.0,Q,-,H,-
3,1a14.pdb,H,0,4.0,L,-,H,-
4,1a14.pdb,H,0,5.0,Q,-,H,-
...,...,...,...,...,...,...,...,...
1174823,7c01.pdb,L,0,103.0,K,-,L,-
1174824,7c01.pdb,L,0,104.0,L,-,L,-
1174825,7c01.pdb,L,0,105.0,E,-,L,-
1174826,7c01.pdb,L,0,106.0,I,-,L,-


In [None]:
#@title CDR preprocessing

# Collapse the per-residue CDR labels into one row per chain holding, for each
# of the three CDRs, its sequence and its first/last position in the chain's
# numbered residue list.

dfr = pd.read_csv('cdr_martin_residues.csv')

nskipped = 0
data = []
for (file, model, key, btype), dff in dfr.groupby(['File', 'Model', 'Key', 'Type']):

  # Positions below are 0-based offsets into this chain's numbered residues.
  dff = dff.reset_index(drop=True)

  regions = {bnum: dff[dff.CDR == f'{btype}{bnum}'] for bnum in (1, 2, 3)}

  # A chain whose numbering does not cover all three CDRs has no first/last
  # position to report; skip it instead of failing on an empty selection.
  missing = [f'{btype}{bnum}' for bnum, dffc in regions.items() if dffc.empty]
  if missing:
    print(f'No residues for {missing} in {key} ({file} {model})')
    nskipped += 1
    continue

  item = dict(
      File=file,
      Model=model,
      Key=key,
      Type=btype,
  )
  for bnum, dffc in regions.items():
    item[f'Seq{bnum}'] = ''.join(dffc.Res)
    item[f'Start{bnum}'] = dffc.index[0]
    item[f'EndInc{bnum}'] = dffc.index[-1]  # inclusive end
  data.append(item)

if nskipped:
  print('Skipped:', nskipped)

df = pd.DataFrame(data)
df.to_csv('cdr_preprocessed.csv', index=False)
df

Unnamed: 0,File,Model,Key,Type,Seq1,Start1,EndInc1,Seq2,Start2,EndInc2,Seq3,Start3,EndInc3
0,1a14.pdb,0,H,H,TNYNMY,29,34,WIGIFYPGNGDTS,46,58,ARSGGSYRYDGGFD,96,109
1,1a14.pdb,0,L,L,SNYLNWY,29,35,LLIYYTSNLH,45,54,QQDFTLPF,88,95
2,1a2y.pdb,0,A,L,HNYLAWY,29,35,LLVYYTTTLA,45,54,QHFWSTPR,88,95
3,1a2y.pdb,0,B,H,TGYGVN,29,34,WLGMIWGDGNTD,46,57,ARERDYRLD,95,103
4,1a3l.pdb,0,H,H,TNYWLG,29,34,WIGDIYPGGVYTT,46,58,ARAGGYYTGGD,96,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9321,7bz5.pdb,0,L,L,SSYLAWY,29,35,LLIYAASTLQ,45,54,QQLNSYPPY,88,96
9322,7c01.pdb,0,C,H,SSNYMS,29,34,WVSVIYSGGSTF,46,57,ARVLPMYGDYLD,95,106
9323,7c01.pdb,0,D,L,SRYLNWY,29,35,LLIYAASSLQ,45,54,QQSYSTPPEY,88,97
9324,7c01.pdb,0,H,H,SSNYMS,29,34,WVSVIYSGGSTF,46,57,ARVLPMYGDYLD,95,106


In [None]:
#@title CDR flattening

# Reshape the wide per-chain table (Seq1..3, Start1..3, EndInc1..3) into one
# row per (chain, CDR region).

dfp = pd.read_csv('cdr_preprocessed.csv')

chain_columns = ['File', 'Model', 'Key', 'Type']

adff = []
for bnum in (1, 2, 3):
  # e.g. {'Seq2': 'Seq', 'Start2': 'Start', 'EndInc2': 'EndInc'}
  unnumbered = {f'{field}{bnum}': field for field in ('Seq', 'Start', 'EndInc')}
  dff = dfp[chain_columns + list(unnumbered)].rename(columns=unnumbered)
  dff.insert(4, 'Region', bnum)
  adff.append(dff)

df = pd.concat(adff).sort_values(['File', 'Model', 'Type', 'Region'])
df.to_csv('cdr_flattened.csv', index=False)
df

Unnamed: 0,File,Model,Key,Type,Region,Seq,Start,EndInc
0,1a14.pdb,0,H,H,1,TNYNMY,29,34
0,1a14.pdb,0,H,H,2,WIGIFYPGNGDTS,46,58
0,1a14.pdb,0,H,H,3,ARSGGSYRYDGGFD,96,109
1,1a14.pdb,0,L,L,1,SNYLNWY,29,35
1,1a14.pdb,0,L,L,2,LLIYYTSNLH,45,54
...,...,...,...,...,...,...,...,...
9325,7c01.pdb,0,L,L,1,SRYLNWY,29,35
9323,7c01.pdb,0,D,L,2,LLIYAASSLQ,45,54
9325,7c01.pdb,0,L,L,2,LLIYAASSLQ,45,54
9323,7c01.pdb,0,D,L,3,QQSYSTPPEY,88,97


In [None]:
#@title CDR contact point refinement

# Keep only contacts whose antibody residue lies inside one of the chain's
# CDRs, and record which CDR and its boundaries.

dfc = pd.read_csv('contacts_filtered.csv')
dfr = pd.read_csv('cdr_preprocessed.csv')

cdr_by_chain = dfr.rename(columns={'Key': 'BKey', 'Type': 'BType'})
df = dfc.merge(cdr_by_chain, on=['File', 'Model', 'BKey', 'BType'])

# 0 means "not inside any CDR" until a region below claims the row.
for column in ('CDR', 'CDR Start', 'CDR EndInc'):
  df[column] = 0

consumed_columns = []
for bnum in (1, 2, 3):
  start_column, end_column = f'Start{bnum}', f'EndInc{bnum}'
  inside = (df.BIndex >= df[start_column]) & (df.BIndex <= df[end_column])
  df.loc[inside, 'CDR'] = bnum
  df.loc[inside, 'CDR Start'] = df[start_column]
  df.loc[inside, 'CDR EndInc'] = df[end_column]
  consumed_columns += [f'Seq{bnum}', start_column, end_column]

# The per-region columns are only needed to compute the three columns above.
df = df.drop(columns=consumed_columns)

df = df[df.CDR != 0]
df.to_csv('contacts_cdr_filtered.csv', index=False)
df

Unnamed: 0,File,Model,BType,BKey,BIndex,BResidue,AKey,AIndex,AResidue,Distance,LKey,HKey,Resolution,BWindow,AWindow,CDR,CDR Start,CDR EndInc
0,1a14.pdb,0,H,H,104,TYR,N,248,ASN,3.3984,L,H,2.50,GSYRYDGGF,NPRPNDPTV,3,96,109
1,1a14.pdb,0,H,H,105,ASP,N,248,ASN,3.9778,L,H,2.50,SYRYDGGFD,NPRPNDPTV,3,96,109
2,1a14.pdb,0,H,H,102,TYR,N,284,ILE,3.4406,L,H,2.50,SGGSYRYDG,LGRTISIAS,3,96,109
3,1a14.pdb,0,H,H,56,ASP,N,285,SER,4.8320,L,H,2.50,PGNGDTSYN,GRTISIASR,2,46,58
4,1a14.pdb,0,H,H,104,TYR,N,285,SER,4.4363,L,H,2.50,GSYRYDGGF,GRTISIASR,3,96,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233723,7c01.pdb,0,L,L,91,TYR,A,170,VAL,4.9430,L,H,2.88,CQQSYSTPP,PTNGVGYQP,3,88,97
233724,7c01.pdb,0,L,L,91,TYR,A,171,GLY,4.0228,L,H,2.88,CQQSYSTPP,TNGVGYQPY,3,88,97
233725,7c01.pdb,0,L,L,29,SER,A,172,TYR,3.4438,L,H,2.88,SQSISRYLN,NGVGYQPYR,1,29,35
233726,7c01.pdb,0,L,L,31,TYR,A,172,TYR,2.3923,L,H,2.88,SISRYLNWY,NGVGYQPYR,1,29,35


In [None]:
#@title Feature assembly

# For every CDR contact, add the secondary-structure and solvent-accessibility
# windows of both residues next to their sequence windows.

dfs = pd.read_csv('docked_secondary.csv')

dfc = pd.read_csv('contacts_cdr_filtered.csv')


def _joined_window(values, center):
  # Same slice as the sequence windows, re-joined with the '|' separator.
  return '|'.join(values[center - contact_buffer:center + contact_buffer + 1])


data = []
for (file, model, bkey, akey), dffc in dfc.groupby(['File', 'Model', 'BKey', 'AKey']):
  print(file, len(dffc))

  in_model = (dfs.File == file) & (dfs.Model == model)
  sec_rel = dfs[in_model].set_index('Key')

  if not (bkey in sec_rel.index and akey in sec_rel.index):
    print('Missing secondary structure')
    continue

  # Split the '|'-joined per-residue strings once per chain pair.
  bsec = sec_rel.loc[bkey]
  b_ss, b_asa = bsec.SS.split('|'), bsec.ASA.split('|')

  asec = sec_rel.loc[akey]
  a_ss, a_asa = asec.SS.split('|'), asec.ASA.split('|')

  for _, contact in dffc.iterrows():
    bi = contact.BIndex
    ai = contact.AIndex

    data.append({
        'File': file,
        'Model': model,
        'Type': contact.BType,
        'CDR': contact.CDR,
        'CDR_S': contact['CDR Start'],
        'CDR_EI': contact['CDR EndInc'],
        'BKey': bkey,
        'BIndex': bi,
        'BWindow': contact.BWindow,
        'BSec': _joined_window(b_ss, bi),
        'BSol': _joined_window(b_asa, bi),
        'AKey': akey,
        'AIndex': ai,
        'AWindow': contact.AWindow,
        'ASec': _joined_window(a_ss, ai),
        'ASol': _joined_window(a_asa, ai),
        'Distance': contact.Distance,
    })

clear_output()
df = pd.DataFrame(data)
has_full_windows = (df.BWindow.str.len() == contact_window_size) & (df.AWindow.str.len() == contact_window_size)
df = df[has_full_windows].dropna().drop_duplicates(['BWindow', 'AWindow'])
df.to_csv('features_contacts.csv', index=False)
df

Unnamed: 0,File,Model,Type,CDR,CDR_S,CDR_EI,BKey,BIndex,BWindow,BSec,BSol,AKey,AIndex,AWindow,ASec,ASol,Distance
0,1a14.pdb,0,H,3,96,109,H,104,GSYRYDGGF,T|T|T|S|S|-|S|-|E,0.142857|0.253846|0.36036|0.455645|0.031532|0....,N,248,NPRPNDPTV,S|S|-|-|-|-|-|S|S,0.019108|0.183824|0.020161|0.0|0.076433|0.1840...,3.3984
1,1a14.pdb,0,H,3,96,109,H,105,SYRYDGGFD,T|T|S|S|-|S|-|E|E,0.253846|0.36036|0.455645|0.031532|0.202454|0....,N,248,NPRPNDPTV,S|S|-|-|-|-|-|S|S,0.019108|0.183824|0.020161|0.0|0.076433|0.1840...,3.9778
2,1a14.pdb,0,H,3,96,109,H,102,SGGSYRYDG,E|T|T|T|T|S|S|-|S,0.0|0.345238|0.142857|0.253846|0.36036|0.45564...,N,284,LGRTISIAS,E|E|E|-|S|-|S|S|S,0.006098|0.0|0.092742|0.042254|0.142012|0.1076...,3.4406
3,1a14.pdb,0,H,2,46,58,H,56,PGNGDTSYN,T|T|T|T|E|E|E|E|-,0.036765|0.22619|0.203822|0.369048|0.030675|0....,N,285,GRTISIASR,E|E|-|S|-|S|S|S|S,0.0|0.092742|0.042254|0.142012|0.107692|0.0591...,4.8320
4,1a14.pdb,0,H,3,96,109,H,104,GSYRYDGGF,T|T|T|S|S|-|S|-|E,0.142857|0.253846|0.36036|0.455645|0.031532|0....,N,285,GRTISIASR,E|E|-|S|-|S|S|S|S,0.0|0.092742|0.042254|0.142012|0.107692|0.0591...,4.4363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192811,7c01.pdb,0,L,1,29,35,D,31,SISRYLNWY,-|-|T|T|-|E|E|E|E,0.238462|0.0|0.061538|0.149194|0.054054|0.0|0....,B,172,NGVGYQPYR,S|-|G|G|G|S|E|E|E,0.038217|0.285714|0.56338|0.166667|0.018018|0....,2.3143
192812,7c01.pdb,0,L,3,88,97,D,91,CQQSYSTPP,E|E|E|-|-|-|S|S|-,0.0|0.0|0.0|0.069231|0.045045|0.138462|0.21126...,B,172,NGVGYQPYR,S|-|G|G|G|S|E|E|E,0.038217|0.285714|0.56338|0.166667|0.018018|0....,3.4287
192813,7c01.pdb,0,H,2,46,57,H,57,GGSTFYADS,S|S|-|E|E|E|-|T|T,0.214286|0.404762|0.092308|0.359155|0.137056|0...,A,75,GDEVRQIAP,G|G|G|G|G|G|S|S|T,0.214286|0.325153|0.041237|0.084507|0.387097|0...,4.7304
192831,7c01.pdb,0,H,3,95,106,H,102,LPMYGDYLD,E|-|S|S|S|-|E|E|E,0.036585|0.0|0.154255|0.382883|0.059524|0.0797...,A,120,YNYLYRLFR,-|-|-|E|E|E|-|-|-,0.400901|0.707006|0.076577|0.22561|0.045045|0....,4.9752


In [None]:
#@title CDR window collection

# Emit one sequence / secondary-structure / solvent-accessibility window for
# every residue position inside every CDR.

dfc = pd.read_csv('cdr_flattened.csv')

dfd = pd.read_csv('docked_secondary.csv')
# Chains shorter than one window cannot yield any full window. Uses the shared
# constant so the cut-off follows contact_buffer.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

dfcd = dfc.merge(dfd, on=['File', 'Model', 'Key'])

data = []
for i, row in dfcd.iterrows():
  if i % 100 == 0:
    print(row.File)

  # Split the '|'-joined per-residue strings once per chain, not per residue.
  ss_values = row.SS.split('|')
  asa_values = row.ASA.split('|')

  for index in range(row.Start, row.EndInc + 1):
    window_slice = slice(index - contact_buffer, index + contact_buffer + 1)

    data.append(dict(
        File=row.File,
        Model=row.Model,
        Type=row.Type,
        CDR=row.Region,
        CDR_S=row.Start,
        CDR_EI=row.EndInc,
        BKey=row.Key,
        BIndex=index,
        BWindow=row.Compact[window_slice],
        BSec='|'.join(ss_values[window_slice]),
        BSol='|'.join(asa_values[window_slice]),
    ))

clear_output()
df = pd.DataFrame(data)
# Windows cut short by either end of the chain are discarded.
df = df[df.BWindow.str.len() == contact_window_size]
df = df.dropna()
df.to_csv('windows_cdr.csv', index=False)
df

Unnamed: 0,File,Model,Type,CDR,CDR_S,CDR_EI,BKey,BIndex,BWindow,BSec,BSol
0,1a14.pdb,0,H,1,29,34,H,29,GYTFTNYNM,S|S|-|G|G|G|S|-|E,0.642857|0.144144|0.570423|0.025381|0.338028|0...
1,1a14.pdb,0,H,1,29,34,H,30,YTFTNYNMY,S|-|G|G|G|S|-|E|E,0.144144|0.570423|0.025381|0.338028|0.464968|0...
2,1a14.pdb,0,H,1,29,34,H,31,TFTNYNMYW,-|G|G|G|S|-|E|E|E,0.570423|0.025381|0.338028|0.464968|0.247748|0...
3,1a14.pdb,0,H,1,29,34,H,32,FTNYNMYWV,G|G|G|S|-|E|E|E|E,0.025381|0.338028|0.464968|0.247748|0.012739|0...
4,1a14.pdb,0,H,1,29,34,H,33,TNYNMYWVK,G|G|S|-|E|E|E|E|E,0.338028|0.464968|0.247748|0.012739|0.010638|0...
...,...,...,...,...,...,...,...,...,...,...,...
510942,7c01.pdb,0,L,3,88,97,L,93,QSYSTPPEY,E|-|-|-|S|S|-|-|-,0.0|0.092308|0.022523|0.115385|0.232394|0.4117...
510943,7c01.pdb,0,L,3,88,97,L,94,SYSTPPEYT,-|-|-|S|S|-|-|-|-,0.092308|0.022523|0.115385|0.232394|0.411765|0...
510944,7c01.pdb,0,L,3,88,97,L,95,YSTPPEYTF,-|-|S|S|-|-|-|-|B,0.022523|0.115385|0.232394|0.411765|0.169118|0...
510945,7c01.pdb,0,L,3,88,97,L,96,STPPEYTFG,-|S|S|-|-|-|-|B|-,0.115385|0.232394|0.411765|0.169118|0.304124|0...


In [None]:
#@title Antigen window collection

# Emit every full-length window of every chain that is NOT listed as an
# antibody heavy/light chain in the SAbDab summary.

dfd = pd.read_csv('docked_secondary.csv')
# Chains shorter than one window cannot yield any full window.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

# Collect all antibody chains once and filter in a single pass, instead of
# re-filtering the whole frame for every summary row.
antibody_chains = set()
for file, model, hk, lk in dfm[['file', 'model', 'Hchain', 'Lchain']].values:
  for chain_key in (hk, lk):
    # Missing heavy/light entries identify no chain.
    if pd.notnull(chain_key):
      antibody_chains.add((file, model, chain_key))

is_antibody = pd.Series(
    [chain in antibody_chains for chain in zip(dfd.File, dfd.Model, dfd.Key)],
    index=dfd.index, dtype=bool)
dfd = dfd[~is_antibody]

data = []
for i, row in dfd.iterrows():
  if i % 100 == 0:
    print(row.File)

  # Split the '|'-joined per-residue strings once per chain, not per residue.
  ss_values = row.SS.split('|')
  asa_values = row.ASA.split('|')

  # Only centres with contact_buffer residues on both sides are visited.
  for index in range(contact_buffer, len(row.Compact) - contact_buffer):
    window_slice = slice(index - contact_buffer, index + contact_buffer + 1)

    data.append(dict(
        File=row.File,
        Model=row.Model,
        AKey=row.Key,
        AIndex=index,
        AWindow=row.Compact[window_slice],
        ASec='|'.join(ss_values[window_slice]),
        ASol='|'.join(asa_values[window_slice]),
    ))

clear_output()
df = pd.DataFrame(data)
df = df[df.AWindow.str.len() == contact_window_size]
df = df.dropna()
df.to_csv('windows_ag.csv', index=False)
df

Unnamed: 0,File,Model,AKey,AIndex,AWindow,ASec,ASol
0,1a14.pdb,0,N,4,RDFNNLTKG,-|-|-|-|-|-|-|S|-,0.677419|0.754601|0.137056|0.191083|0.324841|0...
1,1a14.pdb,0,N,5,DFNNLTKGL,-|-|-|-|-|-|S|-|B,0.754601|0.137056|0.191083|0.324841|0.012195|0...
2,1a14.pdb,0,N,6,FNNLTKGLC,-|-|-|-|-|S|-|B|-,0.137056|0.191083|0.324841|0.012195|0.732394|0...
3,1a14.pdb,0,N,7,NNLTKGLCT,-|-|-|-|S|-|B|-|-,0.191083|0.324841|0.012195|0.732394|0.258537|0...
4,1a14.pdb,0,N,8,NLTKGLCTI,-|-|-|S|-|B|-|-|-,0.324841|0.012195|0.732394|0.258537|0.107143|0...
...,...,...,...,...,...,...,...
1149835,7c01.pdb,0,B,186,FELLHAPAT,E|E|-|-|S|S|-|-|-,0.091371|0.371134|0.75|0.5|0.994565|0.339623|0...
1149836,7c01.pdb,0,B,187,ELLHAPATV,E|-|-|S|S|-|-|-|B,0.371134|0.75|0.5|0.994565|0.339623|0.801471|0...
1149837,7c01.pdb,0,B,188,LLHAPATVC,-|-|S|S|-|-|-|B|-,0.75|0.5|0.994565|0.339623|0.801471|0.40566|0....
1149838,7c01.pdb,0,B,189,LHAPATVCG,-|S|S|-|-|-|B|-|-,0.5|0.994565|0.339623|0.801471|0.40566|0.30985...


In [None]:
#@title Full window collection

dfd = pd.read_csv('docked_secondary.csv')
# A chain shorter than one full window cannot contribute any window.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

# Same windowing as the antigen cell, but over every chain in the file
# (no antibody-chain filtering).
data = []
for i, row in dfd.iterrows():
  if i % 100 == 0:
    print(row.File)  # coarse progress; wiped by clear_output() below

  # Split the '|'-delimited per-residue annotations once per chain rather
  # than once per window position.
  ss_values = row.SS.split('|')
  asa_values = row.ASA.split('|')

  # Only centre positions with a full contact_buffer on both sides.
  for index in range(contact_buffer, len(row.Compact) - contact_buffer):
    lo = index - contact_buffer
    hi = index + contact_buffer + 1

    data.append(dict(
        File=row.File,
        Model=row.Model,
        AKey=row.Key,
        AIndex=index,
        AWindow=row.Compact[lo:hi],
        ASec='|'.join(ss_values[lo:hi]),
        ASol='|'.join(asa_values[lo:hi]),
    ))

clear_output()
df = pd.DataFrame(data)
df = df[df.AWindow.str.len() == contact_window_size]
df = df.dropna()
df.to_csv('windows_all.csv', index=False)
df

Unnamed: 0,File,Model,AKey,AIndex,AWindow,ASec,ASol
0,1a14.pdb,0,H,4,QVQLQQSGA,-|-|E|E|E|E|-|-|-,0.853535|0.15493|0.419192|0.067073|0.40404|0.1...
1,1a14.pdb,0,H,5,VQLQQSGAE,-|E|E|E|E|-|-|-|E,0.15493|0.419192|0.067073|0.40404|0.136364|0.2...
2,1a14.pdb,0,H,6,QLQQSGAEL,E|E|E|E|-|-|-|E|E,0.419192|0.067073|0.40404|0.136364|0.269231|0....
3,1a14.pdb,0,H,7,LQQSGAELV,E|E|E|-|-|-|E|E|-,0.067073|0.40404|0.136364|0.269231|0.571429|0....
4,1a14.pdb,0,H,8,QQSGAELVK,E|E|-|-|-|E|E|-|-,0.40404|0.136364|0.269231|0.571429|0.603774|0....
...,...,...,...,...,...,...,...
2706706,7c01.pdb,0,L,206,LSSPVTKSF,S|S|S|-|E|E|E|E|E,0.237805|1.0|0.630769|0.455882|0.28169|0.45774...
2706707,7c01.pdb,0,L,207,SSPVTKSFN,S|S|-|E|E|E|E|E|E,1.0|0.630769|0.455882|0.28169|0.457746|0.38048...
2706708,7c01.pdb,0,L,208,SPVTKSFNR,S|-|E|E|E|E|E|E|T,0.630769|0.455882|0.28169|0.457746|0.380488|0....
2706709,7c01.pdb,0,L,209,PVTKSFNRG,-|E|E|E|E|E|E|T|T,0.455882|0.28169|0.457746|0.380488|0.423077|0....


In [None]:
#@title Geometry parsing

dfp = pd.read_csv('docked_preprocessed.csv')

# One record per residue: the unweighted mean of its atom coordinates.
data = []
for file_no, file in enumerate(dfp.File.unique()):
  if file_no % 100 == 0:
    clear_output()  # keep the progress log from growing without bound
  print(file)

  structure = parse(f'{pdb_dirname}/{file}', pdb_dirname)
  for model in structure.get_models():
    for chain in model.get_chains():
      # A dedicated counter name: the residue index must not reuse (and
      # shadow) the file counter of the outer loop.
      # NOTE(review): Index counts every residue the parser yields for the
      # chain, whereas run_dssp indexes over trim_residues(...) - confirm the
      # two index schemes are meant to line up.
      for res_index, res in enumerate(chain.get_residues()):
        centroid = np.stack([atom.coord for atom in res.get_atoms()]).mean(axis=0)

        data.append({
            'File': file,
            'Model': model.id,
            'Key': chain.id,
            'Index': res_index,
            'Residue': res.resname,
            'X': centroid[0],
            'Y': centroid[1],
            'Z': centroid[2],
        })

clear_output()
df = pd.DataFrame(data)
df[list('XYZ')] = df[list('XYZ')].round(3)
df.to_csv('geometry.csv', index=False)
df

Unnamed: 0,File,Model,Key,Index,Residue,X,Y,Z
0,1a14.pdb,0,N,0,ARG,70.402,76.970,23.883
1,1a14.pdb,0,N,1,ASP,65.324,75.985,19.624
2,1a14.pdb,0,N,2,PHE,61.408,77.582,21.788
3,1a14.pdb,0,N,3,ASN,62.268,71.575,24.281
4,1a14.pdb,0,N,4,ASN,59.883,70.060,21.665
...,...,...,...,...,...,...,...,...
3753195,7c01.pdb,0,D,210,PHE,8.871,22.949,-3.216
3753196,7c01.pdb,0,D,211,ASN,10.007,25.207,0.761
3753197,7c01.pdb,0,D,212,ARG,15.599,26.162,-2.405
3753198,7c01.pdb,0,D,213,GLY,15.538,22.459,1.357


In [None]:
#@title Thera-SAbDab preprocessing

!wget -nc -O sab_thera.csv http://opig.stats.ox.ac.uk/webapps/newsabdab/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload_Feb20.csv

dft = pd.read_csv('sab_thera.csv')

data = []
for i, row in dft.iterrows():
  for similarity in ['100%', '99%', '95-98%']:
    rels = row[f'{similarity} SI Structure']
    if not isinstance(rels, str):
      continue
    for part in rels.split(';'):
      for entry in part.split('/'):
        pdb_name, *chains = entry.split(':')
        for chain in chains:
          if len(chain) != 2:
            continue
          hk, lk = chain
          if hk == lk:
            print('Skipping:', pdb_name, [hk, lk])
            continue
          data.append({
              'File': pdb_name + '.pdb',
              'Name': row.Therapeutic,
              'Format': row.Format,
              'Isotype': row['CH1 Isotype'],
              'VDLC': row['VD LC'],
              'Status': row['Est. Status'],
              'Target': row.Target,
              'Similarity': similarity,
              'HKey': hk,
              'LKey': lk,
              'Heavy': row['Heavy Sequence'],
              'Heavy2': row['Heavy Sequence (if bispec)'],
              'Heavy3': row['Heavy Sequence (if trispec)'],
              'Light': row['Light Sequence'],
              'Light2': row['Light Sequence (if bispec)'],
              'Light3': row['Light Sequence (if trispec)'],
          })

clear_output()
df = pd.DataFrame(data)
df = df.replace('na', '')
df = df.sort_values(['File', 'Name'])
df.to_csv('thera_preprocessed.csv', index=False)
df

Unnamed: 0,File,Name,Format,Isotype,VDLC,Status,Target,Similarity,HKey,LKey,Heavy,Heavy2,Heavy3,Light,Light2,Light3
14,1bey.pdb,Alemtuzumab,Whole mAb,G1,Kappa,Active,CD52,100%,H,L,QVQLQESGPGLVRPSQTLSLTCTVSGFTFTDFYMNWVRQPPGRGLE...,,,DIQMTQSPSSLSASVGDRVTITCKASQNIDKYLNWYQQKPGKAPKL...,,
510,1bfo.pdb,Tamtuvetmab,Canine Whole mAb,G2,Kappa,Active,CD52,95-98%,H,G,EVKLLESGGGLVQPGGSMRLSCAGSGFTFTDFYMNWIRQPAGKAPE...,,,DIKMTQSPSFLSASVGDRVTLNCKASQNIDKYLNWYQQKLGESPKL...,,
511,1bfo.pdb,Tamtuvetmab,Canine Whole mAb,G2,Kappa,Active,CD52,95-98%,F,E,EVKLLESGGGLVQPGGSMRLSCAGSGFTFTDFYMNWIRQPAGKAPE...,,,DIKMTQSPSFLSASVGDRVTLNCKASQNIDKYLNWYQQKLGESPKL...,,
512,1bfo.pdb,Tamtuvetmab,Canine Whole mAb,G2,Kappa,Active,CD52,95-98%,B,A,EVKLLESGGGLVQPGGSMRLSCAGSGFTFTDFYMNWIRQPAGKAPE...,,,DIKMTQSPSFLSASVGDRVTLNCKASQNIDKYLNWYQQKLGESPKL...,,
513,1bfo.pdb,Tamtuvetmab,Canine Whole mAb,G2,Kappa,Active,CD52,95-98%,D,C,EVKLLESGGGLVQPGGSMRLSCAGSGFTFTDFYMNWIRQPAGKAPE...,,,DIKMTQSPSFLSASVGDRVTLNCKASQNIDKYLNWYQQKLGESPKL...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,6ugt.pdb,Infliximab,Whole mAb,G1,Kappa,NFD,TNFA,100%,A,B,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,,,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...,,
296,6ugu.pdb,Infliximab,Whole mAb,G1,Kappa,NFD,TNFA,100%,H,L,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,,,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...,,
297,6ugu.pdb,Infliximab,Whole mAb,G1,Kappa,NFD,TNFA,100%,A,B,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,,,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...,,
292,6ugv.pdb,Infliximab,Whole mAb,G1,Kappa,NFD,TNFA,100%,H,L,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,,,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...,,


In [None]:
#@title Thera-SAbDab prioritization

dft = pd.read_csv('thera_preprocessed.csv')
dfd = pd.read_csv('docked_preprocessed.csv')

# Files of exact-match ('100%'), 'Active' therapeutics that also appear in
# the docked set.
best_files = dft.loc[
    (dft.Similarity == '100%') & (dft.Status == 'Active') & dft.File.isin(dfd.File),
    'File',
].unique()

dfo = pd.read_csv('docked_oneletters.csv')
dfo = dfo[dfo.File.isin(dft.File)]

# Sanity checks: some therapeutic files are present, and each (File, Key)
# pair occurs at most once.
assert len(dfo) > 0
assert not dfo.duplicated(['File', 'Key']).any()

# Attach the per-file metadata from dfm and keep only rows that name at
# least one antigen chain.
dft = dft.merge(dfm.rename(columns={'file': 'File'}), on='File')
dft = dft[dft.antigen_chain.notna()]

data = []
for row_no, row in dft.iterrows():
  # Score: 2 for a '100%' match, 1 for '99%'; plus 2 for 'Active' status,
  # 1 for 'Discontinued'. The score is the same for every antigen chain of
  # the row, so it is computed once.
  priority = (
      (row.Similarity == '100%') * 2 +
      (row.Status == 'Active') * 2 +
      (row.Similarity == '99%') * 1 +
      (row.Status == 'Discontinued') * 1
  )
  # antigen_chain may list several chains separated by '|'.
  for ak in row.antigen_chain.split('|'):
    data.append(dict(
        File=row.File,
        Model=row.model,
        HKey=row.Hchain,
        LKey=row.Lchain,
        AKey=ak.strip(),
        Name=row.Name,
        Resolution=row.resolution,
        Priority=priority,
    ))

# Highest priority first (via a temporary negated column), then by file and
# model; incomplete and duplicate records are discarded.
df = (
    pd.DataFrame(data)
    .assign(inv=lambda frame: -frame.Priority)
    .sort_values(['inv', 'File', 'Model'])
    .drop(columns='inv')
    .dropna()
    .drop_duplicates()
)
df.to_csv('thera_prioritized.csv', index=False)
df

Unnamed: 0,File,Model,HKey,LKey,AKey,Name,Resolution,Priority
0,1bj1.pdb,0,H,L,W,Bevacizumab,2.4,4
1,1bj1.pdb,0,K,J,V,Bevacizumab,2.4,4
4,1bj1.pdb,0,H,L,W,Dilpacimab,2.4,4
5,1bj1.pdb,0,K,J,V,Dilpacimab,2.4,4
20,1ce1.pdb,0,H,L,P,Alemtuzumab,1.9,4
...,...,...,...,...,...,...,...,...
861,6bah.pdb,0,B,A,C,Timigutuzumab,1.9,0
862,6bah.pdb,0,B,A,E,Timigutuzumab,1.9,0
863,6bah.pdb,0,B,A,C,Trastuzumab,1.9,0
864,6bah.pdb,0,B,A,E,Trastuzumab,1.9,0


In [None]:
#@title CoV-AbDab download

!wget -nc http://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_140620.csv -O cov_abdab.csv
!wget -nc http://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_HomologyModels_110620.tar.gz -O cov_homologies.tar.gz
!wget -nc http://opig.stats.ox.ac.uk/webapps/covabdab/static/downloads/CoV-AbDab_PDBStructures_100620.tar.gz -O cov_structures.tar.gz
!mkdir -p cov_structures/ && tar xkf cov_structures.tar.gz -C cov_structures/
!mkdir -p cov_homologies/ && tar xkf cov_homologies.tar.gz -C cov_homologies/
clear_output()

cmd('rm -rf cov/ && mkdir -p cov/')

for filename in os.listdir('cov_structures/Structures/'):
  cmd(f'cp cov_structures/Structures/{filename} cov/{filename}')

for filename in os.listdir('cov_homologies/'):
  new_name = filename.replace('_rank1_imgt_scheme', '')
  assert not os.path.exists(f'cov/{new_name}'), 'Overlapping homology filename: ' + new_name
  cmd(f'cp cov_homologies/{filename} cov/{new_name}')

dfv = pd.read_csv('cov_abdab.csv')
dfv = dfv.rename(columns={
    'Ab or Nb': 'Type',
    'Binds to': 'Bind',
    'Doesn\'t Bind to': 'X_Bind',
    'Neutralising Vs': 'Neutralize',
    'Not Neutralising Vs': 'X_Neutralize',
    'Protein + Epitope': 'Domain',
    'VH or VHH': 'Heavy_Info',
    'VL': 'Light_Info',
    'Heavy V Gene': 'HV_Gene',
    'Heavy J Gene': 'HJ_Gene',
    'Light V Gene': 'LV_Gene',
    'Light J Gene': 'LJ_Gene',
    'ABB Homology Model (if no structure)': 'Homology',
    'Date Added': 'Added',
    'Last Updated': 'Updated',
    'Notes/Following Up?': 'Status',
})
dfv.Homology = dfv.Homology.str.replace('%20', ' ')
dfv = dfv.replace('ND', '')
dfv = dfv.replace(np.nan, '')
dfv.to_csv('cov_details.csv')
dfv

Unnamed: 0,Name,Type,Bind,X_Bind,Neutralize,X_Neutralize,Domain,Origin,Heavy_Info,Light_Info,HV_Gene,HJ_Gene,LV_Gene,LJ_Gene,CDRH3,CDRL3,Structures,Homology,Sources,Added,Updated,Update Description,Status
0,CA1,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient,EVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLE...,DIVMTQTPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,IGHV1-18 (Human),IGHJ6 (Human),IGKV3-11 (Human),IGKJ3 (Human),AREGYCSGGSCYSGYYYYYGMDV,QQRRNWGT,,Coronavirus Binding Antibody Sequences Structu...,"Rui Shi et al., 2020 (https://www.nature.com/a...","May 28, 2020","May 28, 2020",,Complete
1,CB6,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient,EVQLVESGGGLVQPGGSLRLSCAASGFTVSSNYMSWVRQAPGKGLE...,DIVMTQSPSSLSASVGDRVTITCRASQSISRYLNWYQQKPGKAPKL...,IGHV3-66 (Human),IGHJ4 (Human),IGKV1-39 (Human),IGKJ2 (Human),ARVLPMYGDYLDY,QQSYSTPPEYT,https://www.rcsb.org/structure/7C01,,"Rui Shi et al., 2020 (https://www.nature.com/a...","May 28, 2020","May 28, 2020",,Complete
2,EY6A,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient,,,,,,,,,,,"Daming Zhou et al., 2020 (https://www.biorxiv....","Jun 14, 2020","Jun 14, 2020",,Awaiting sequence and structure when PDB file ...
3,MD17,Ab,SARS-CoV2,,SARS-CoV2 (weak),,S; RBD,"Phage Display Library (Antibody, human, immune...",,,IGHV3-64 (Human),,IGKV1-39 (Human),,VKDQDSSSWYDAFDI,QQSYTTPLT,,,"Tal Noy-Porat et al., 2020 (https://www.biorxi...","May 21, 2020","May 26, 2020",Added first two residues of CDRH3 sequence,Tracking public release of patent application ...
4,MD29,Ab,SARS-CoV2,,SARS-CoV2 (weak),,S; RBD,"Phage Display Library (Antibody, human, immune...",,,IGHV3-64 (Human),,IGKV1-39 (Human),,VKDQDSSSWYDAFDI,HQTYTSPYT,,,"Tal Noy-Porat et al., 2020 (https://www.biorxi...","May 21, 2020","May 26, 2020",Added first two residues of CDRH3 sequence,Tracking public release of patent application ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,DA43,Ab,Bovine-CoV,,Bovine-CoV (Strong),,S; S2,Unk,EVQLQQSGAELVRPGTSVKVSCKASGYAFTNYLIEWVKQRPGQGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,IGHV1-54 (Mouse),IGHJ4 (Mouse),IGLV1 (Mouse),IGLJ1 (Mouse),ARYTLYYGNPYYYAMDY,ALWFSNHWV,,Coronavirus Binding Antibody Sequences Structu...,EP1857116 (https://patentimages.storage.google...,"May 23, 2020","May 23, 2020",,Complete. Our thanks to Dr. Konrad Krawczyk fo...
560,GPI-scFv,Ab,Viral Envelopes,,,,E,Unk,QVQLQQWGAGLLKPSETLSLTCAVTGDSFGGHHWSWIRQPPGKGLE...,LMETTQSPPTLSASVGDRVTITCRASQSLSGWVAWYQQKPGKAPKL...,IGHV4-34 (Human),IGHJ3 (Mouse),IGKV1-5 (Human),IGKJ2 (Mouse),ARRETMETASRKRAFDI,QQHSAFSST,,Coronavirus Binding Antibody Sequences Structu...,EP1857116 (https://patentimages.storage.google...,"May 04, 2020","May 04, 2020",,Complete
561,3D8,Ab,Viral Nucleic Acid Chains,,,,,Immunised Mice,EVQLQQSGPELVKPGASVKMSCKASGYTFTSYVMHWVNQNPGQGLE...,DIVMSQSPSSLAVSAGEKVTMSCKSSQSLFNSRTRKNYLAWYQQKP...,IGHV1-14 (Mouse),IGHJ4 (Mouse),IGKV8-21 (Mouse),IGKJ2 (Mouse),ARGAYKRGYAMDY,KQSYYHMYT,,Coronavirus Binding Antibody Sequences Structu...,WO2008035894 (https://patentimages.storage.goo...,"May 04, 2020","May 04, 2020",,Complete
562,1AF10,Ab,TGEV,,TGEV,,S,Immunised Mice,QVQLQQSGPELVKPGASVKISCKASGYAFSSSWMNWVKQRPGQGLE...,DILLTQSPAILSVSPGERVSLSCRASQSIGTSIHWYQQRTNGSPRP...,IGHV1-82 (Mouse),IGHJ4 (Mouse),IGKV5-48 (Mouse),IGKJ5 (Mouse),ARGGYRYDPYYAMDY,QQTDSWPTT,https://www.rcsb.org/structure/4F2M,,"Juan Reguera et al., 2012 (https://journals.pl...","May 04, 2020","May 04, 2020",,Complete


In [None]:
#@title CoV-AbDab preprocessing

dfv = pd.read_csv('cov_details.csv')

rcsb_prefix = 'https://www.rcsb.org/structure/'
# Columns copied unchanged from the antibody row onto each of its files.
cols = ['Name', 'Type', 'Bind', 'X_Bind', 'Neutralize', 'X_Neutralize', 'Domain', 'Origin']

# One record per structure file associated with an antibody.
data = []
for i, row in dfv.iterrows():
  # Experimental structures: ';'-separated RCSB links. Surrounding whitespace
  # is stripped so that an entry written as '; https://...' is not silently
  # rejected by the prefix check. The id after the prefix becomes '<id>.pdb'
  # in lower case.
  links = [s.strip() for s in str(row.Structures).split(';')]
  files = [s[len(rcsb_prefix):].lower() + '.pdb' for s in links if s.startswith(rcsb_prefix)]

  # Homology model: keep the last path component and drop the
  # '_rank1_imgt_scheme' marker, matching the names the download cell used
  # when copying the models into cov/.
  homology = str(row.Homology)
  if '/' in homology:
    files.append('cov/' + homology[homology.rindex('/') + 1:].replace('_rank1_imgt_scheme', ''))

  for file in files:
    data.append({
        'File': file,
        **{c: row[c] for c in cols},
    })

df = pd.DataFrame(data)
# NOTE(review): the index is written here, unlike the to_csv(..., index=False)
# calls in the window/geometry/Thera cells - confirm that is intended.
df.to_csv('cov_preprocessed.csv')
df

Unnamed: 0,File,Name,Type,Bind,X_Bind,Neutralize,X_Neutralize,Domain,Origin
0,cov/CA1.pdb,CA1,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient
1,7c01.pdb,CB6,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient
2,7byr.pdb,BD23,Ab,SARS-CoV2,,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient
3,7bwj.pdb,P2B-2F6,Ab,SARS-CoV2,SARS-CoV1,SARS-CoV2,,S; RBD,B-cells; SARS CoV2 Human Patient
4,cov/C002.pdb,C002,Ab,SARS-CoV2,SARS-CoV1,SARS-CoV2,SARS-CoV1,S; RBD,B-cells; SARS CoV2 Human Patient
...,...,...,...,...,...,...,...,...,...
459,cov/DA31.pdb,DA31,Ab,Bovine-CoV,,Bovine-CoV (Strong),,S; S2,Unk
460,cov/DA43.pdb,DA43,Ab,Bovine-CoV,,Bovine-CoV (Strong),,S; S2,Unk
461,cov/GPI_scFv.pdb,GPI-scFv,Ab,Viral Envelopes,,,,E,Unk
462,cov/3D8.pdb,3D8,Ab,Viral Nucleic Acid Chains,,,,,Immunised Mice
