### Install packages required for 3D protein visualization within Jupyter (`py3Dmol` library)

Stop the jupyter server. Run the following commands in the shell:

**Note**: you might need to request admin privileges to run the following commands

> pip install py3Dmol jupyterlab-widgets 

> brew install node

> jupyter labextension install jupyterlab_3dmol

Then start the server again.

### Load SASA prediction data you want to visualize

In [16]:
import io
import numpy as np
import pandas as pd
import py3Dmol

from Bio import PDB

# Structure code; used below to build the '<STRUCTURE>:<chain>' id for row
# selection and the '<STRUCTURE>.pdb' file name.
STRUCTURE_CODE = '6A78'
# Chain identifier; the part after the colon in the '<STRUCTURE>:<chain>' id.
CHAIN_ID = 'L'

Load the entire baseline predictions dataset:

In [17]:
# Read the baseline results CSV, using its first column as the row index.
val_baseline_results_df = pd.read_csv('../../data/csv/val_baseline_results.csv', index_col=0)
# Bare expression: display the frame as the cell output.
val_baseline_results_df

Unnamed: 0,id,model,chain,position,region,sasa,predicted,error,abs_error
0,5NJD:L,avg_by_pos,L,2,,100.0,15.549053,84.450947,84.450947
1,5NJD:L,avg_by_pos,L,3,,71.7,59.856361,11.843639,11.843639
2,5NJD:L,avg_by_pos,L,4,,1.3,6.626722,-5.326722,5.326722
3,5NJD:L,avg_by_pos,L,5,,61.4,53.772446,7.627554,7.627554
4,5NJD:L,avg_by_pos,L,6,,3.4,8.587328,-5.187328,5.187328
...,...,...,...,...,...,...,...,...,...
303915,6VCA:L,knn,L,145,,3.2,1.566667,1.633333,1.633333
303916,6VCA:L,knn,L,146,,3.7,13.200000,-9.500000,9.500000
303917,6VCA:L,knn,L,147,,0.7,4.833333,-4.133333,4.133333
303918,6VCA:L,knn,L,148,,40.0,21.700000,18.300000,18.300000


Extract the relevant values as a `pandas.Series` object:

In [18]:
# Name of the baseline model whose rows are selected from the results table.
MODEL_NAME = 'median_by_same_res_pos'


def extract_sasa_from_dataframe(df: pd.DataFrame,
                                structure: str,
                                chain: str,
                                model: str,
                                column: str = 'sasa') -> pd.Series:
    """Select one per-residue value column for a single chain and model.

    Rows are kept when their ``id`` equals ``'<STRUCTURE>:<chain>'`` (the
    structure code is upper-cased before comparison, the chain is used as
    given) and their ``model`` equals ``model``.

    Args:
        df: Results table with at least the columns ``id``, ``model`` and
            ``column``.
        structure: Structure code, case-insensitive (e.g. ``'6a78'``).
        chain: Chain identifier as it appears in the ``id`` column.
        model: Model name as it appears in the ``model`` column.
        column: Name of the column to return. Defaults to ``'sasa'``; pass
            ``'predicted'`` to get the ``predicted`` column instead.

    Returns:
        The selected column for the matching rows, keeping the original
        index. Empty when no row matches.
    """
    chain_key = f'{structure.upper()}:{chain}'
    row_mask = (df['id'] == chain_key) & (df['model'] == model)
    return df.loc[row_mask, column]

In [19]:
# Pull the per-residue values for the configured structure, chain and model.
prediction_series = extract_sasa_from_dataframe(
    df=val_baseline_results_df,
    structure=STRUCTURE_CODE,
    chain=CHAIN_ID,
    model=MODEL_NAME,
)
# Bare expression: display the series as the cell output.
prediction_series

153998     75.0
153999      5.9
154000     63.2
154001      8.5
154002     69.7
          ...  
154101      3.6
154102     43.8
154103     16.9
154104     65.9
154105    100.0
Name: sasa, Length: 108, dtype: float64

### Prepare special PDB file

The `py3Dmol` library, which is used for interactive 3D visualization of protein structures, takes as input the contents of a PDB file containing the chain you are going to visualize.

Perform the following steps:

1. remove all the other sequences/chains from the PDB file 
2. run the following cell containing function definitions 

In [20]:
# Atom names in a fixed order; an atom's position in this list is its column
# index in the last axis of a B-factor array.
atom_types = [
    'N', 'CA', 'C', 'CB', 'O',
    'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG',
    'CD', 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD',
    'CE', 'CE1', 'CE2', 'CE3', 'NE', 'NE1', 'NE2', 'OE1', 'OE2',
    'CH2', 'NH1', 'NH2', 'OH',
    'CZ', 'CZ2', 'CZ3', 'NZ',
    'OXT',
]
# Atom name -> index in `atom_types`.
atom_order = dict(zip(atom_types, range(len(atom_types))))
atom_type_num = len(atom_types)  # := 37.

def overwrite_b_factors(pdb_str: str, bfactors: np.ndarray) -> str:
    """Overwrites the B-factors in pdb_str with contents of bfactors array.

    Every atom of the i-th residue encountered while iterating over the
    parsed structure receives the value ``bfactors[i, atom_order['CA']]``;
    the other columns of ``bfactors`` are not read.

    Args:
        pdb_str: An input PDB string.
        bfactors: A numpy array with shape [n_residues, 37]. B-factors are
            treated as per residue, i.e. only the 'CA' column of each row is
            used.

    Returns:
        A new PDB string with the B-factors replaced.

    Raises:
        ValueError: If the last dimension of ``bfactors`` is not 37, or if
            the structure contains more residues than ``bfactors`` has rows.
    """
    if bfactors.shape[-1] != atom_type_num:
        raise ValueError(
            f'Invalid final dimension size for bfactors: {bfactors.shape[-1]}.')

    parser = PDB.PDBParser(QUIET=True)
    handle = io.StringIO(pdb_str)
    structure = parser.get_structure('', handle)

    ca_column = atom_order['CA']
    # A residue boundary is detected by a change of the parent residue id.
    # Chain ids are not part of the comparison, so the input is expected to
    # hold a single chain.
    curr_resid = ('', '', '')
    idx = -1
    for atom in structure.get_atoms():
        atom_resid = atom.parent.get_id()
        if atom_resid != curr_resid:
            idx += 1
            if idx >= bfactors.shape[0]:
                # The original message was a plain string, so its placeholders
                # were printed literally; format them for real.
                raise ValueError(
                    'Index into bfactors exceeds number of residues. '
                    f'B-factors shape: {bfactors.shape}, idx: {idx}.')
            curr_resid = atom_resid
        atom.bfactor = bfactors[idx, ca_column]

    new_pdb = io.StringIO()
    pdb_io = PDB.PDBIO()
    pdb_io.set_structure(structure)
    pdb_io.save(new_pdb)
    return new_pdb.getvalue()


def get_pdb_contents(data: pd.Series, structure: str, chain: str, model: str) -> str:
    """Return the structure's PDB text with B-factors set to binned values.

    Each value in ``data`` is floor-divided by 20 and repeated across all
    ``atom_type_num`` atom columns, giving one row per residue. The PDB file
    ``../../data/pdb/<structure>.pdb`` is then read and its B-factors are
    overwritten via ``overwrite_b_factors``.

    Args:
        data: Per-residue values, one entry per residue, in the same order as
            the residues appear in the PDB file.
        structure: Structure code; used to build the PDB file name.
        chain: Unused; kept so existing callers keep working.
        model: Unused; kept so existing callers keep working.

    Returns:
        The PDB contents as a string with the B-factors replaced.
    """
    # Vectorized equivalent of looping over the series; also avoids
    # `Series.iteritems()`, which no longer exists in pandas >= 2.0.
    bins = np.asarray(data) // 20
    b_factor_data = np.repeat(bins[:, np.newaxis], atom_type_num, axis=1)
    print(b_factor_data.shape)

    # Context manager so the file handle is always closed.
    with open(f'../../data/pdb/{structure}.pdb') as pdb_file:
        pdb_string = pdb_file.read()

    return overwrite_b_factors(pdb_string, b_factor_data)

### Visualize

In [22]:

# PDB text whose B-factor column has been overwritten by get_pdb_contents.
pdb = get_pdb_contents(prediction_series, STRUCTURE_CODE, CHAIN_ID, MODEL_NAME)

# Color the structure
show_sidechains = True
# Value of the 'b' property -> colour.
color_map = dict(enumerate([
    'blue',
    '#00FF33',
    '#00CC33',
    '#006600',
    '#333300',
    'red',
]))

# Always draw the cartoon; add sticks when side chains are requested. Every
# representation uses the same colour scheme keyed on the 'b' property.
representations = ('cartoon', 'stick') if show_sidechains else ('cartoon',)
style = {
    representation: {'colorscheme': {'prop': 'b', 'map': color_map}}
    for representation in representations
}

view = py3Dmol.view(width=800, height=600)
view.addModelsAsFrames(pdb)
view.addSurface(py3Dmol.SAS, {'opacity': 0.5})
view.setStyle({'model': -1}, style)
view.zoomTo()

(108, 37)


<py3Dmol.view at 0x7fcf378906a0>