<a href="https://colab.research.google.com/github/rvanasa/deep-antibody/blob/master/antibody_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@title Workspace setup

!pip install -q biopython pdb-tools
!wget -nc ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-2.0.4-linux-amd64 -O /usr/local/bin/dssp && chmod +x /usr/local/bin/dssp

from IPython.display import clear_output, display
clear_output()

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import Bio
import Bio.PDB
from Bio.PDB import DSSP

# Number of residues kept on each side of a window's central residue.
contact_buffer = 4
# Full window length: the central residue plus `contact_buffer` neighbours per side.
contact_window_size = contact_buffer * 2 + 1

# Three-letter residue names; create_seq() looks a name up here and uses its
# position to pick the matching character from `oneletters`.
# NOTE(review): the trailing '???' entry has no counterpart in `oneletters`
# (21 names vs. 20 letters) - confirm whether it is still needed.
amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', '???']
oneletters = 'ARNDCEQGHILKMFPSTWYV'
# NOTE(review): presumably the DSSP secondary-structure codes; not referenced
# by any cell visible here.
ssletters = 'HBEGITS'

# Shared structure parser used by parse().
parser = Bio.PDB.PDBParser(get_header=True)

def parse(ident):
  if '.' not in ident:
    filename = f'{ident}.pdb'
    !wget -nc https://files.rcsb.org/download/{ident}.pdb
  else:
    filename = ident
  return parser.get_structure(ident, filename)


def run_dssp(filename):
  """Run DSSP on a structure file and return its per-residue table.

  Args:
    filename: path of the structure file (passed to parse() and to DSSP).

  Returns:
    A DataFrame with one row per DSSP residue for every model in the file,
    with a leading `Model` column followed by the chain key, residue
    number, insertion code ('-' when blank) and the DSSP properties.

  Raises:
    Whatever parse(), DSSP or pd.concat raise (e.g. when the file holds no
    models).
  """
  structure = parse(filename)

  # DSSP is evaluated on the first model of the file regardless of which
  # model is being labelled, so its output is loop-invariant: run it once.
  # NOTE(review): every model therefore receives the first model's values -
  # confirm this is intended for multi-model files.
  dssp = DSSP(structure[0], filename)
  rows = [(c, i, rc.strip() or '-', *v) for (c, (_, i, rc)), v in dssp.property_dict.items()]
  columns = [
      'Key', 'Num', 'Sub',
      'DSSP Index', 'Residue',
      'SS', 'ASA', 'Phi', 'Psi',
      'NH->O_1_relidx', 'NH->O_1_energy',
      'O->NH_1_relidx', 'O->NH_1_energy',
      'NH->O_2_relidx', 'NH->O_2_energy',
      'O->NH_2_relidx', 'O->NH_2_energy']

  adfs = []
  for model in structure.get_models():
    dfs = pd.DataFrame(rows, columns=columns)
    dfs.insert(0, 'Model', model.id)
    adfs.append(dfs)

  return pd.concat(adfs)


def create_seq(rs):
  """Convert three-letter residue names to a one-letter sequence string.

  Args:
    rs: iterable of residue names (e.g. 'ALA').

  Returns:
    A string with one character per input name; names without a one-letter
    code are rendered as 'X'.
  """
  # zip() stops at the shorter input, so any entry of `amino_acids` without
  # a matching character in `oneletters` (the trailing '???') maps to 'X'
  # instead of indexing past the end of `oneletters`.
  letters = dict(zip(amino_acids, oneletters))
  return ''.join(letters.get(r, 'X') for r in rs)


def cmd(command):
  """Run one shell command, or each command of an iterable, via os.system.

  Args:
    command: a command string, or an iterable of commands (which may nest).

  Raises:
    Exception: if any command finishes with a non-zero exit status.
  """
  if not isinstance(command, str):
    for c in command:
      cmd(c)
    # The members have been run individually; the container itself must not
    # be handed to os.system.
    return
  if os.system(command):
    raise Exception(f'Non-zero exit code in command: $ {command}')

In [0]:
#@title SAbDab caching

# Fetch the SAbDab summary table and the structure archive, then unpack the
# archive without overwriting files that are already present (unzip -n).
!wget -nc -O sab_summary_all.tsv http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/summary/all/
!wget -nc -O sab_all.zip http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/archive/all/
!unzip -q -n sab_all.zip
!rm -rf __MACOSX
clear_output()

# Directory whose entries are parsed by the later cells.
pdb_dirname = './all_structures/raw'
pdb_files = os.listdir(pdb_dirname)

# Summary table; `file` is the structure file name built from the `pdb` column.
dfm = pd.read_csv('sab_summary_all.tsv', sep='\t')
dfm['file'] = dfm.pdb.map(lambda p: p + '.pdb')

In [0]:
#@title Chain parsing

# One record per chain of every model of every file in `pdb_files`; the
# residue names of a chain are stored as a comma-separated string.
data = []
for file in pdb_files:
  structure = parse(f'{pdb_dirname}/{file}')
  for model in structure.get_models():
    data.extend(
        {
            'File': file,
            'Model': model.id,
            'Key': chain.id,
            'Sequence': ','.join(r.resname for r in chain),
        }
        for chain in model.get_chains()
    )

clear_output()
df = pd.DataFrame(data)
# One-letter form of each chain (create_seq renders unknown names as 'X').
df['Compact'] = [create_seq(names.split(',')) for names in df.Sequence]

# The two representations are written to separate files sharing the same keys.
for value_col, out_name in (('Sequence', 'docked_seqs.csv'), ('Compact', 'docked_oneletters.csv')):
  df[['File', 'Model', 'Key', value_col]].to_csv(out_name, index=False)
df

In [0]:
#@title Chain preprocessing

# Usable chains: leading/trailing 'X' removed, no 'X' left inside, non-empty.
dfc = pd.read_csv('docked_oneletters.csv')
dfc.Compact = dfc.Compact.str.strip('X')
has_unknown = dfc.Compact.str.contains('X')
dfc = dfc[~has_unknown]
dfc = dfc[dfc.Compact.str.len() > 0]
dfc = dfc.dropna()

# One record per (light, heavy, antigen chain) combination of the summary
# table; '-' stands in for a chain key or sequence that is not available.
data = []
for i, row in dfm.iterrows():
  if pd.isnull(row.antigen_chain):
    continue

  in_model = (dfc.File == row.file) & (dfc.Model == row.model)
  dfc_rel = dfc[in_model].set_index('Key')

  achains = [ak.strip() or '-' for ak in row.antigen_chain.split('|')]

  lk = '-' if pd.isnull(row.Lchain) else row.Lchain
  hk = '-' if pd.isnull(row.Hchain) else row.Hchain

  L = dfc_rel.loc[lk].Compact if lk in dfc_rel.index else '-'
  H = dfc_rel.loc[hk].Compact if hk in dfc_rel.index else '-'

  # Nothing to pair the antigen with when neither antibody chain is usable.
  if H == '-' and L == '-':
    continue

  data.extend(
      dict(
          File=row.file,
          Model=row.model,
          LKey=lk,
          HKey=hk,
          AKey=ak,
          Light=L,
          Heavy=H,
          Antigen=dfc_rel.loc[ak].Compact if ak in dfc_rel.index else '-',
      )
      for ak in achains
  )

clear_output()
df = pd.DataFrame(data).dropna()
df = df.drop_duplicates(['Light', 'Heavy', 'Antigen']) ##TODO drop by resolution
df = df.sort_values(['File', 'Model'])
df.to_csv('docked_preprocessed.csv', index=False)
df

In [0]:
#@title Contact point calculation

import numpy.linalg as lin

# Chain pairings to scan for contacts (File, Model, HKey, LKey, AKey, ...).
df = pd.read_csv('docked_preprocessed.csv')

# Largest inter-atomic distance at which two residues count as in contact
# (same units as the atom coordinates - presumably angstroms; TODO confirm).
contact_dist_threshold = 4

def append_contact_point():
  """Append the residue pair currently selected by the calling loop to `data`.

  Takes no arguments: the pair is read from module-level names set by the
  caller (`filename`, `model`, `btype`, `ca`/`cb`, `rla`/`rlb`, `x`, `y`).
  Also increments the running counter `ct`.
  """
  global data, ct
  ct = ct + 1
  record = dict(
      File=filename,
      Model=model.id,
      BType=btype,
      BKey=cb.id,
      BIndex=y,
      BResidue=rlb[y].resname,
      AKey=ca.id,
      AIndex=x,
      AResidue=rla[x].resname,
  )
  data.append(record)

# For every (antibody chain, antigen chain) pairing, record each residue pair
# with at least one pair of atoms closer than `contact_dist_threshold`
# (atom pairs at exactly zero distance are ignored).
data = []
for filename, dff in df.groupby('File'):
  structure = parse(f'{pdb_dirname}/{filename}')

  models = list(structure.get_models())

  for i, row in dff.iterrows():
    model = models[row.Model]
    chain_map = {chain.id: chain for chain in model.get_chains()}

    H = chain_map.get(row.HKey)
    L = chain_map.get(row.LKey)
    A = chain_map.get(row.AKey)

    if A is None:
      continue

    print()
    # print('>>', filename, model.id, [c.id for c in chains])

    # The antigen side is the same for the heavy and the light chain.
    ca = A
    rla = list(ca)
    rxa = [[a.coord for a in r] for r in rla]

    for btype, cb in (('H', H), ('L', L)):
      if cb is None:
        continue

      rlb = list(cb)
      rxb = [[a.coord for a in r] for r in rlb]

      # Largest atom count of any residue on either side (0 when both are empty).
      mx = max((len(r) for r in rxa + rxb), default=0)
      ct = 0

      size = len(rxa) * len(rxb) * mx ** 2
      if size > 1e8:
        print('Using low memory variant')

        # Residue-by-residue comparison: no padding, small temporaries.
        # append_contact_point() reads the loop variables x and y.
        coords_b = [np.array(r) for r in rxb]
        for x, xa in enumerate(rxa):
          xa = np.array(xa)
          for y, xb in enumerate(coords_b):
            norms = lin.norm(xa[:, None] - xb, axis=2)
            if np.any((norms != 0) & (norms <= contact_dist_threshold)):
              append_contact_point()
      else:
        # Every residue gets `mx` atom slots. Unused slots are NaN rather
        # than zero: a zero-filled slot is a phantom atom at the origin and
        # would be reported as a contact for any real atom lying within the
        # threshold of (0, 0, 0). Distances involving NaN never pass `<=`.
        axa = np.full((len(rxa) * mx, 3), np.nan)
        axb = np.full((len(rxb) * mx, 3), np.nan)

        for k, coords in enumerate(rxa):
          axa[k * mx:k * mx + len(coords)] = coords
        for k, coords in enumerate(rxb):
          axb[k * mx:k * mx + len(coords)] = coords

        norms = lin.norm(axa[:, None] - axb, axis=2)
        # Integer division by `mx` turns atom-slot indices into residue indices.
        locs = np.argwhere((norms != 0) & (norms <= contact_dist_threshold)) // mx
        if len(locs):
          for x, y in np.unique(locs, axis=0):
            append_contact_point()

      print(row.File, ct)

clear_output()
dfx = pd.DataFrame(data)
dfx.to_csv('chain_contacts.csv', index=False)
dfx

In [0]:
#@title Contact point preprocessing

dfo = pd.read_csv('docked_oneletters.csv')
dfp = pd.read_csv('docked_preprocessed.csv')
dfx = pd.read_csv('chain_contacts.csv')

# Attach the heavy/light chain keys of the pairing each contact belongs to.
df = dfx.merge(dfp, on=['File', 'Model', 'AKey'])

# The merge key does not include the antibody chain, so a contact is paired
# with every antibody that shares its antigen chain. Keep only the pairings
# whose heavy/light key (chosen by BType) is the chain the contact was
# measured on; otherwise the window below is cut from the wrong chain.
expected_bkey = df.HKey.where(df.BType == 'H', df.LKey)
df = df[df.BKey.astype(str) == expected_bkey.astype(str)]

# Swap the preprocessed sequences for the untrimmed ones from
# docked_oneletters.csv before slicing with the contact indices.
# NOTE(review): these are inner merges, so pairings with a '-' placeholder
# key (no heavy or no light chain) are dropped here - confirm that is intended.
for col in ('Antigen', 'Heavy', 'Light'):
  ckey = col[0] + 'Key'
  df = df.drop(columns=col).merge(dfo.rename(columns={'Key': ckey}), on=['File', 'Model', ckey]).rename(columns={'Compact': col})

# Sequence windows of `contact_buffer` residues on each side of the contact.
df['BWindow'] = df.apply(
    lambda x: (x.Heavy if x.BType == 'H' else x.Light)[x.BIndex - contact_buffer:x.BIndex + contact_buffer + 1],
    axis=1)
df['AWindow'] = df.apply(
    lambda x: x.Antigen[x.AIndex - contact_buffer:x.AIndex + contact_buffer + 1],
    axis=1)

dfs = df.drop(columns=['Antigen', 'Light', 'Heavy'])
dfs = dfs.dropna()
dfs.to_csv('contacts_preprocessed.csv', index=False)
dfs

In [0]:
#@title Contact point refinement

df = pd.read_csv('contacts_preprocessed.csv')

# Keep only contacts whose two windows are complete ...
b_complete = df.BWindow.str.len() == contact_window_size
a_complete = df.AWindow.str.len() == contact_window_size
df = df[b_complete & a_complete]

# ... and free of 'X' residues.
has_unknown = df.BWindow.str.contains('X') | df.AWindow.str.contains('X')
df = df[~has_unknown]

df.to_csv('contacts_filtered.csv', index=False)
df

In [0]:
#@title Secondary structure calculation

df = pd.read_csv('docked_preprocessed.csv')

# Per-file DSSP tables are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the accumulated
# table on every call.
frames = []
files = sorted(set(df.File))
for file in files:
  print(file)
  try:
    dfs = run_dssp(f'{pdb_dirname}/{file}')
  except Exception as e:
    # Best effort: report the failure and carry on with the next file.
    print(e)
    continue
  dfs.insert(0, 'File', file)
  frames.append(dfs)

clear_output()
# Raises ValueError if DSSP failed for every file.
p_dfs = pd.concat(frames)
p_dfs.to_csv('dssp_residues.csv', index=False)
p_dfs

In [0]:
#@title Primary / secondary structure coalescence

dfs = pd.read_csv('dssp_residues.csv')

dfd = pd.read_csv('docked_oneletters.csv')
dfd['Compact'] = dfd.Compact.str.strip('X')
dfd = dfd[~dfd.Compact.str.contains('X')]
# dfd = dfd.set_index(['File', 'Model', 'Key'])

nskipped = 0

# One record per chain whose DSSP residue string equals its parsed sequence;
# per-residue DSSP values are stored as '|'-separated strings.
data = []
for (file, mid, key), secondary in dfs.groupby(['File', 'Model', 'Key']):
  # print(file)##

  # Match on this group's own model id (`mid`), not on whatever a `model`
  # variable left over from another cell happens to hold.
  dfdr = dfd[(dfd.File == file) & (dfd.Model == mid) & (dfd.Key == key)]
  if len(dfdr) != 1:
    print(f'{len(dfdr)} candidates for {key} in model {mid}')
    nskipped += 1
    continue

  seq = dfdr.iloc[0].Compact

  # print(seq)
  # print(''.join(secondary.Residue))

  if seq != ''.join(secondary.Residue):
    print(f'Residue mismatch for {file}')
    # print(seq)#
    # print(''.join(secondary.Residue))#
    nskipped += 1
    continue

  data.append(dict(
      File=file,
      Model=mid,
      Key=key,
      Compact=seq,
      SS='|'.join(secondary.SS),
      ASA='|'.join(str(round(f, 6)) for f in secondary.ASA),
      Phi='|'.join(str(round(f, 6)) for f in secondary.Phi),
      Psi='|'.join(str(round(f, 6)) for f in secondary.Psi),
  ))

clear_output()
df = pd.DataFrame(data)
df.to_csv('docked_secondary.csv', index=False)
df

In [0]:
#@title ANARCI installation

# hmmer is installed first - presumably a runtime requirement of ANARCI; TODO confirm.
!apt install hmmer
# NOTE(review): expects anarci-1.3.tar.gz to already be in the working
# directory; no cell visible here downloads it.
!tar xkf anarci-1.3.tar.gz
# Install with python2; cmd() raises if the command exits non-zero.
cmd('cd anarci-1.3/ && python2 setup.py install')
clear_output()

In [0]:
#@title Sequence numbering

%%python2

from IPython.display import clear_output, display
import json
import pandas as pd

import anarci as an

options = dict(
    scheme='martin',
    assign_germline=True,
    allowed_species='human'))

type_map = dict(H='heavy', L='light')

df = pd.read_csv('docked_preprocessed.csv')

adfr = []
adfa = []
adft = []

for i, row in df.iterrows():
  filename = row.File
  mid = row.Model
  for key, seq, btype in [(row.HKey, row.Heavy, 'H'), (row.LKey, row.Light, 'L')]:
    seqs = [(key, seq)]
    
    # print filename[:-4], mid, key

    results = list(zip(*an.run_anarci(seqs, allow=type_map[btype], **options)))
    if any(not ns for _, ns, _, _ in results):
      results = list(zip(*an.run_anarci(seqs, allow='ig', **options)))
    if any(not ns for _, ns, _, _ in results):
      results = list(zip(*an.run_anarci(seqs, **options)))

    for (key, seq), numbers, alignments, hit_tables in results:

      if numbers:
        dfr = pd.DataFrame(dict(
            Res=res,
            Num=num,
            Sub=sub.strip() or '-',
        ) for i, (residues, start, end) in enumerate(numbers) for (num, sub), res in residues)
        dfr.insert(0, 'File', filename)
        dfr.insert(1, 'Model', mid)
        dfr.insert(2, 'Key', key)
        dfr.insert(3, 'Type', btype)
        adfr.append(dfr)

      if alignments:
        dfa = pd.DataFrame(alignments)
        dfa.insert(0, 'File', filename)
        dfa.insert(1, 'Model', mid)
        adfa.append(dfa)

      if hit_tables:
        dft = pd.DataFrame(hit_tables[1:], columns=hit_tables[0])
        dft.insert(0, 'File', filename)
        dft.insert(1, 'Model', mid)
        dft.insert(2, 'Key', key)
        dft.insert(3, 'Type', btype)
        adft.append(dft)

clear_output()
pd.concat(adfr).to_csv('anarci_residues.csv', index=False)
pd.concat(adfa).to_csv('anarci_alignments.csv', index=False)
pd.concat(adft).to_csv('anarci_hit_tables.csv', index=False)

In [0]:
#@title CDR extraction

def cdr_range(df, btype, start, end):
  """Return a boolean mask of the rows of `df` whose `Type` equals `btype`
  and whose `Num` lies in the inclusive range [start, end]."""
  of_type = df.Type == btype
  in_range = (df.Num >= start) & (df.Num <= end)
  return of_type & in_range

df = pd.read_csv('anarci_residues.csv')
df['CDR'] = '-'

dfL = df[df.Type == 'L']
dfH = df[df.Type == 'H']

# Inclusive numbering range of every CDR: (chain type, CDR number, first, last).
cdr_bounds = (
    ('L', 1, 30, 36),
    ('L', 2, 46, 55),
    ('L', 3, 89, 96),
    ('H', 1, 30, 35),
    ('H', 2, 47, 58),
    ('H', 3, 93, 101),
)
for btype, bnum, start, end in cdr_bounds:
  df.loc[cdr_range(df, btype, start, end), 'CDR'] = f'{btype}{bnum}'

df.to_csv('cdr_martin_residues.csv', index=False)
df

In [0]:
#@title CDR preprocessing

dfr = pd.read_csv('cdr_martin_residues.csv')

# display(dfm[dfm.pdb == '1a14'])
# display(dfr[:500:10])

# One record per numbered chain with, for each of its three CDRs, the residue
# string and the first/last row offset of the CDR within the chain's rows.
# NOTE(review): the offsets are positions in the numbered-residue table, not
# necessarily indices into the chain sequence - confirm before comparing the
# two.
data = []
for (file, model, key, btype), dff in dfr.groupby(['File', 'Model', 'Key', 'Type']):

  dff = dff.reset_index(drop=True)
  item = dict(
      File=file,
      Model=model,
      Key=key,
      Type=btype,
  )
  data.append(item)

  for bnum in (1, 2, 3):
    cdr = f'{btype}{bnum}'
    dffc = dff[dff.CDR == cdr]
    item[f'Seq{bnum}'] = ''.join(dffc.Res)
    if len(dffc):
      item[f'Start{bnum}'] = dffc.index[0]
      item[f'EndInc{bnum}'] = dffc.index[-1]
    else:
      # The chain has no rows for this CDR: store an empty inclusive range
      # (Start > EndInc) instead of failing on dffc.index[0]. The columns
      # stay integral and no position can fall inside the range.
      item[f'Start{bnum}'] = 0
      item[f'EndInc{bnum}'] = -1

df = pd.DataFrame(data)
# df = df.sort_values(['File', 'Model'])###
# df = df.drop_duplicates(['Seq1', 'Seq2', 'Seq3'])
df.to_csv('cdr_preprocessed.csv', index=False)
df

In [0]:
#@title CDR flattening

dfp = pd.read_csv('cdr_preprocessed.csv')

# Reshape from one row per chain (Seq1..3, Start1..3, EndInc1..3) to one row
# per chain and CDR, with the CDR number in `Region`.
shared_cols = ['File', 'Model', 'Key', 'Type']
flat_names = ['Seq', 'Start', 'EndInc']

adff = []
for bnum in (1, 2, 3):
  numbered = [f'{name}{bnum}' for name in flat_names]
  dff = dfp[shared_cols + numbered].rename(columns=dict(zip(numbered, flat_names)))
  dff.insert(4, 'Region', bnum)
  adff.append(dff)

df = pd.concat(adff).sort_values(['File', 'Model', 'Type', 'Region'])
df.to_csv('cdr_flattened.csv', index=False)
df

In [0]:
#@title CDR contact point refinement

dfc = pd.read_csv('contacts_filtered.csv')
dfr = pd.read_csv('cdr_preprocessed.csv')

# Attach each antibody chain's CDR ranges to its contacts.
cdr_ranges = dfr.rename(columns={'Key': 'BKey', 'Type': 'BType'})
df = dfc.merge(cdr_ranges, on=['File', 'Model', 'BKey', 'BType'])

# CDR == 0 marks a contact outside every CDR range.
for new_col in ('CDR', 'CDR Start', 'CDR EndInc'):
  df[new_col] = 0

for bnum in(1, 2, 3):
  seq_col, start_col, end_col = f'Seq{bnum}', f'Start{bnum}', f'EndInc{bnum}'
  cond = (df.BIndex >= df[start_col]) & (df.BIndex <= df[end_col])
  df.loc[cond, 'CDR'] = bnum
  df.loc[cond, 'CDR Start'] = df[start_col]
  df.loc[cond, 'CDR EndInc'] = df[end_col]
  df = df.drop(columns=[seq_col, start_col, end_col])

df = df[df.CDR != 0]
df.to_csv('contacts_cdr_filtered.csv', index=False)
df

In [0]:
#@title VH-VL orientation calculation

# Fetch and unpack ABangle, then fetch the TM-align source, build it and move
# the binary to /opt/bin.
!wget -nc -O ABangle.tar.gz http://opig.stats.ox.ac.uk/webapps/newsabdab/sabdab/downloads/abangle/
!tar xsf ABangle.tar.gz
!wget -nc https://zhanglab.ccmb.med.umich.edu/TM-align/TMalign.cpp
!g++ -static -O3 -ffast-math -lm -o TMalign TMalign.cpp
!mkdir -p /opt/bin
!mv ./TMalign /opt/bin
clear_output()

# Run ABangle on chains H and L of a single structure (5ukq).
# NOTE(review): the output is not read by any cell visible here.
!pdb_selchain -H,L all_structures/chothia/5ukq.pdb > AB.pdb
!python2 ./ABangle/ABangle -i AB.pdb -usernumbered -target 5ukq -store n -png output.png -showinfo -msa -mr -seqid

In [0]:
# @title Feature assembly

dfs = pd.read_csv('dssp_residues.csv')
dfc = pd.read_csv('contacts_cdr_filtered.csv')

# For every CDR contact, attach the DSSP secondary structure and ASA values
# of the residue windows around both contact partners. A contact is skipped
# when the DSSP rows at the window positions are incomplete or disagree with
# the stored window sequences.
data = []
for (file, model, bkey, akey), dffc in dfc.groupby(['File', 'Model', 'BKey', 'AKey']):

  dffs = dfs[(dfs.File == file) & (dfs.Model == model)]

  b_dffs = dffs[dffs.Key == bkey]
  a_dffs = dffs[dffs.Key == akey]

  print(file, len(dffc), len(dffs))
  # print(file[:-4])

  for i, row in dffc.iterrows():
    bi = row.BIndex
    ai = row.AIndex###
    b_secondary = b_dffs.iloc[bi - contact_buffer:bi + contact_buffer + 1]
    a_secondary = a_dffs.iloc[ai - contact_buffer:ai + contact_buffer + 1]

    if not (len(b_secondary) == contact_window_size and len(a_secondary) == contact_window_size):
      print('Length mismatch')
      continue

    b_matches = b_secondary.Residue.values == list(row.BWindow)
    a_matches = a_secondary.Residue.values == list(row.AWindow)
    if not np.all(b_matches & a_matches):
      print('Residue mismatch')
      continue

    data.append({
        'File': file,
        'Model': model,
        'Type': row.BType,
        'CDR': row.CDR,
        'CDR_S': row['CDR Start'],
        'CDR_EI': row['CDR EndInc'],
        'BKey': bkey,
        'BIndex': bi,
        'BWindow': row.BWindow,
        'BSec': '|'.join(b_secondary.SS),
        'BSol': '|'.join(str(round(f, 6)) for f in b_secondary.ASA),
        'AKey': row.AKey,
        'AIndex': ai,
        'AWindow': row.AWindow,
        'ASec': '|'.join(a_secondary.SS),
        'ASol': '|'.join(str(round(f, 6)) for f in a_secondary.ASA),
    })

# dfmg = dfm.rename(columns={'file': 'File','model': 'Model'})
# dfmg['resolution'] = pd.to_numeric(dfmg.resolution, errors='coerce')
# df['Res'] = dfmg.resolution

clear_output()
df = pd.DataFrame(data)
b_complete = df.BWindow.str.len() == contact_window_size
a_complete = df.AWindow.str.len() == contact_window_size
df = df[b_complete & a_complete]
df = df.dropna().drop_duplicates(['BWindow', 'AWindow'])
df.to_csv('features_contacts.csv', index=False)
df

In [0]:
#@title CDR window collection

dfc = pd.read_csv('cdr_flattened.csv')

dfd = pd.read_csv('docked_secondary.csv')
# A chain shorter than one window cannot yield any complete window.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

dfcd = dfc.merge(dfd, on=['File', 'Model', 'Key'])

# Output columns, fixed up front so the frame is well-formed even when no
# window is produced.
columns = ['File', 'Model', 'Type', 'CDR', 'CDR_S', 'CDR_EI',
           'BKey', 'BIndex', 'BWindow', 'BSec', 'BSol']

# One record per CDR position whose full window fits inside the chain.
data = []
for i, row in dfcd.iterrows():
  if i % 100 == 0:
    print(row.File)

  # Split the per-residue strings once per chain, not once per window.
  ss = row.SS.split('|')
  asa = row.ASA.split('|')

  # Clamp to the positions with `contact_buffer` residues on both sides; a
  # smaller index would turn the slice start negative and wrap around.
  first = max(row.Start, contact_buffer)
  last = min(row.EndInc, len(row.Compact) - contact_buffer - 1)
  for index in range(first, last + 1):
    lo = index - contact_buffer
    hi = index + contact_buffer + 1

    data.append(dict(
        File=row.File,
        Model=row.Model,
        Type=row.Type,
        CDR=row.Region,
        CDR_S=row.Start,
        CDR_EI=row.EndInc,
        BKey=row.Key,
        BIndex=index,
        BWindow=row.Compact[lo:hi],
        BSec='|'.join(ss[lo:hi]),
        BSol='|'.join(asa[lo:hi]),
    ))

clear_output()
df = pd.DataFrame(data, columns=columns)
df = df[df.BWindow.str.len() == contact_window_size]
df = df.dropna()
df.to_csv('windows_cdr.csv', index=False)
df

In [0]:
#@title Antigen window collection

dfd = pd.read_csv('docked_secondary.csv')
# A chain shorter than one window cannot yield any complete window.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

# Remove every chain the summary table lists as a heavy or light chain. The
# (file, model, key) triples are gathered into a set and the table is
# filtered once, instead of re-filtering the whole table per summary row.
antibody_chains = set()
for file, model, hk, lk in dfm[['file', 'model', 'Hchain', 'Lchain']].values:
  antibody_chains.add((file, model, hk))
  antibody_chains.add((file, model, lk))

is_antibody = np.array(
    [chain in antibody_chains for chain in zip(dfd.File, dfd.Model, dfd.Key)],
    dtype=bool)
dfd = dfd[~is_antibody]
print(len(dfd))

# Output columns, fixed up front so the frame is well-formed even when no
# window is produced.
columns = ['File', 'Model', 'AKey', 'AIndex', 'AWindow', 'ASec', 'ASol']

# One record per position with `contact_buffer` residues on both sides.
data = []
for i, row in dfd.iterrows():
  if i % 100 == 0:
    print(row.File)

  # Split the per-residue strings once per chain, not once per window.
  ss = row.SS.split('|')
  asa = row.ASA.split('|')

  for index in range(contact_buffer, len(row.Compact) - contact_buffer):
    lo = index - contact_buffer
    hi = index + contact_buffer + 1

    data.append(dict(
        File=row.File,
        Model=row.Model,
        AKey=row.Key,
        AIndex=index,
        AWindow=row.Compact[lo:hi],
        ASec='|'.join(ss[lo:hi]),
        ASol='|'.join(asa[lo:hi]),
    ))

clear_output()
df = pd.DataFrame(data, columns=columns)
df = df[df.AWindow.str.len() == contact_window_size]
df = df.dropna()
# df = df.drop_duplicates(['BWindow', 'AWindow'])
df.to_csv('windows_ag.csv', index=False)
df

In [0]:
#@title Full window collection

dfd = pd.read_csv('docked_secondary.csv')
# A chain shorter than one window cannot yield any complete window.
dfd = dfd[dfd.Compact.str.len() >= contact_window_size]

# Output columns, fixed up front so the frame is well-formed even when no
# window is produced.
columns = ['File', 'Model', 'AKey', 'AIndex', 'AWindow', 'ASec', 'ASol']

# One record per position, of every chain, that has `contact_buffer`
# residues on both sides.
data = []
for i, row in dfd.iterrows():
  if i % 100 == 0:
    print(row.File)

  # Split the per-residue strings once per chain, not once per window.
  ss = row.SS.split('|')
  asa = row.ASA.split('|')

  for index in range(contact_buffer, len(row.Compact) - contact_buffer):
    lo = index - contact_buffer
    hi = index + contact_buffer + 1

    data.append(dict(
        File=row.File,
        Model=row.Model,
        AKey=row.Key,
        AIndex=index,
        AWindow=row.Compact[lo:hi],
        ASec='|'.join(ss[lo:hi]),
        ASol='|'.join(asa[lo:hi]),
    ))

clear_output()
df = pd.DataFrame(data, columns=columns)
df = df[df.AWindow.str.len() == contact_window_size]
df = df.dropna()
# df = df.drop_duplicates(['BWindow', 'AWindow'])
df.to_csv('windows_all.csv', index=False)
df