# Analysis of EGFR

In [1]:
import sys, os, pickle, csv, glob
import numpy as np

In [2]:
# Insert here local path to Rhapsody folder
sys.path.insert(0, '../../rhapsody/')

In [3]:
from rhapsody import *

## Integrated Dataset
Let's import the training dataset. We will only consider variants in the Integrated Dataset with at least 1 ClinVar review star, if present, and an associated PDB structure of at least 150 residues, two restrictions that we found to improve prediction accuracy. 

In [4]:
# Load the precomputed feature table of the Integrated Dataset (training set).
ID = np.load('../00-Training_Dataset/data/precomputed_features-ID.npy')

# Keep only the rows whose 'true_label' differs from -1
# (presumably -1 marks variants without a usable label -- TODO confirm).
has_label = ID['true_label'] != -1
ID = ID[has_label]
len(ID)

87726

In [5]:
# Per-variant metadata of the Integrated Dataset. The fields read in this notebook
# are 'SAV_coords', 'true_label', 'datasets' and 'ClinVar_review_star'.
ID_SAVs_info = np.load('../00-Training_Dataset/data/Integrated_Dataset-SAVs.npy')

# Coordinates of the variants whose ClinVar review status is zero stars.
zero_star_SAVs = ID_SAVs_info[ID_SAVs_info['ClinVar_review_star'] == 0]['SAV_coords']

# Drop those variants from the training set. np.isin builds the whole boolean mask
# in a single vectorised pass, instead of scanning `zero_star_SAVs` once per row
# as `SAV not in zero_star_SAVs` inside a comprehension would.
ID = ID[~np.isin(ID['SAV_coords'], zero_star_SAVs)]
len(ID)

80215

In [6]:
# Discard variants whose associated PDB structure has fewer than 150 residues.
min_PDB_length = 150
long_enough = ID['PDB_length'] >= min_PDB_length
ID = ID[long_enough]
len(ID)

20361

## Re-training of unbiased classifier

A few EGFR mutations are found in the Integrated Dataset used for training. In order to get completely unbiased predictions, we will retrain a classifier by excluding those variants from the training dataset.

**NB:** The Uniprot names for gene EGFR are `P00533` or `EGFR_HUMAN`.

In [7]:
# Select every variant of the Integrated Dataset whose coordinates start with the
# EGFR accession number 'P00533'.
is_EGFR = [SAV.startswith('P00533') for SAV in ID_SAVs_info['SAV_coords']]
known_EGFR_SAVs = ID_SAVs_info[is_EGFR]

# Split them by training label: 1 is reported as deleterious, 0 as neutral.
EGFR_labels = known_EGFR_SAVs['true_label']
known_del_SAVs = known_EGFR_SAVs[EGFR_labels == 1]
known_neu_SAVs = known_EGFR_SAVs[EGFR_labels == 0]

n_del = len(known_del_SAVs)
n_neu = len(known_neu_SAVs)

print(f'{n_del} known deleterious EGFR SAVs:')
print(known_del_SAVs)
print(f'\n{n_neu} known neutral EGFR SAVs:')
print(known_neu_SAVs)

4 known deleterious EGFR SAVs:
[('P00533 428 G D', 1, 'humsavar[1],clinvar[1]',  0)
 ('P00533 748 R T', 1, 'swissvar[1]', -1)
 ('P00533 787 Q R', 1, 'varibench[1]', -1)
 ('P00533 873 G E', 1, 'varibench[1]', -1)]

12 known neutral EGFR SAVs:
[('P00533 1034 L R', 0, 'humsavar[0],swissvar[0]', -1)
 ('P00533 1048 A V', 0, 'exovar[0]', -1)
 ('P00533 1210 A V', 0, 'humsavar[0],swissvar[0]', -1)
 ('P00533 266 P R', 0, 'humsavar[0],swissvar[0]', -1)
 ('P00533 521 R K', 0, 'humsavar[0],humvar[0],clinvar[0]',  2)
 ('P00533 674 V I', 0, 'humsavar[0],swissvar[0]', -1)
 ('P00533 769 V M', 0, 'humsavar[0],clinvar[0]',  0)
 ('P00533 833 L V', 0, 'humsavar[0],clinvar[0]',  0)
 ('P00533 838 L V', 0, 'humsavar[0],clinvar[0]',  0)
 ('P00533 962 R G', 0, 'humsavar[0]', -1)
 ('P00533 98 R Q', 0, 'humsavar[0]', -1)
 ('P00533 988 H P', 0, 'exovar[0],humsavar[0],humvar[0],clinvar[0]',  1)]


Let's exclude these variants from the dataset. 

In [8]:
# Training subset without any EGFR ('P00533') variant, so that the retrained
# classifiers have never seen the protein under study.
not_EGFR = [not SAV.startswith('P00533') for SAV in ID['SAV_coords']]
ID_subset = ID[not_EGFR]
len(ID_subset)

20353

We use this subset of the Integrated Dataset to train unbiased versions of Rhapsody classifiers:

In [9]:
# Features shared by both classifier versions, in training order.
common_feats = [
    'wt_PSIC', 'Delta_PSIC', 'SASA',
    'ANM_MSF-chain', 'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
    'stiffness-chain',
]

# Feature sets keyed by classifier version: 'full_clsf' adds 'entropy' and
# 'ranked_MI' to the shared features, 'redx_clsf' leaves them out.
# Both end with 'BLOSUM'.
featsets = {
    'full_clsf': [*common_feats, 'entropy', 'ranked_MI', 'BLOSUM'],
    'redx_clsf': [*common_feats, 'BLOSUM'],
}

In [10]:
from prody import LOGGER

# Train one random-forest classifier per feature set and collect the results under
# 'results/<classifier version>/'.
# NOTE(review): the guard only checks that the 'results' folder exists, so an
# interrupted run is later reported as "already trained", and `summaries` is not
# defined in that case.
if os.path.isdir('results'):
    print('Classifiers already trained.')
else:
    os.mkdir('results/')

    LOGGER.start('results/RF_training.log')
    summaries = {}

    for clsf_version, featset in featsets.items():
        folder = f'results/{clsf_version}'
        os.mkdir(folder)

        # training columns: variant identifier, label and the selected features
        f = ['SAV_coords', 'true_label'] + featset
        output_dict = trainRFclassifier(ID_subset[f])
        summaries[clsf_version] = output_dict['CV summary']

        # move every PNG found in the working directory, plus the pickled
        # classifier, into the folder of this classifier version
        # (presumably these are the files written by trainRFclassifier)
        for file in glob.glob('*png') + ['trained_classifier.pkl',]:
            os.rename(file, os.path.join(folder, file))

        LOGGER.info('')

    # store training summary into pickle; the context manager guarantees that the
    # file is flushed and closed even if pickling fails
    with open('results/summaries.pkl', 'wb') as pkl_file:
        pickle.dump(summaries, pkl_file)

    LOGGER.close('results/RF_training.log')

Classifiers already trained.


## Rhapsody predictions
We perform a complete scanning of all amino acid variants (*in silico* saturation mutagenesis).

**NB:** PolyPhen-2 predictions are precomputed and saved in `data/pph2`

In [11]:
# Saturation-mutagenesis predictions: reuse the pickled results when the output
# folder already exists, otherwise run Rhapsody and archive its output files.
if os.path.isdir('results/predictions'):
    print('predictions already precomputed')
    # close the pickle file deterministically instead of leaking the handle
    with open('results/predictions/rhapsody-pickle.pkl', 'rb') as pkl_file:
        rh = pickle.load(pkl_file)
else:
    os.mkdir('results/predictions')
    # run rhapsody on the precomputed PolyPhen-2 output, with the retrained
    # full classifier as main and the reduced one as auxiliary classifier
    rh = rhapsody('data/pph2/pph2-full.txt', 'results/full_clsf/trained_classifier.pkl',
                  aux_classifier='results/redx_clsf/trained_classifier.pkl', input_type='PP2')
    # store files: move every 'rhapsody-*.*' file from the working directory
    for f in glob.glob('rhapsody-*.*'):
        os.rename(f, os.path.join('results/predictions', f))

predictions already precomputed


## Effect of dimerization on predictions: the kinase domain
We will compare predictions obtained by using a custom PDB structure with those obtained automatically by Rhapsody. 
In particular we will consider the *biological assembly* for the EGFR kinase domain, which is an asymmetric dimer that can be found in the PDB database. 
We will also consider different ways of including environmental effects (*reduced* vs *sliced* models).

In [12]:
# Path to the custom PDB file of the EGFR kinase-domain dimer. It is passed to
# rhapsody() as `custom_PDB` and parsed again when predictions are mapped onto
# the structure.
dimer_pdb_file = 'data/2gs6-dimer.pdb'

In [13]:
# Predictions computed on the custom dimer structure, one entry per environment
# model ('chain', 'reduced', 'sliced').
rh_dimer = {}

for env_model in ['chain', 'reduced', 'sliced']:
    folder = f'results/predictions_dimer-{env_model}'
    if os.path.isdir(folder):
        print('predictions already precomputed')
        # close the pickle file deterministically instead of leaking the handle
        with open(os.path.join(folder, 'rhapsody-pickle.pkl'), 'rb') as pkl_file:
            rh_dimer[env_model] = pickle.load(pkl_file)
    else:
        os.mkdir(folder)
        # run rhapsody with the custom structure and the requested environment model
        _r = rhapsody('data/pph2/pph2-full.txt', 'results/full_clsf/trained_classifier.pkl',
                      aux_classifier='results/redx_clsf/trained_classifier.pkl', input_type='PP2',
                      custom_PDB=dimer_pdb_file, force_env=env_model)
        rh_dimer[env_model] = _r
        # store files: move every 'rhapsody-*.*' file from the working directory
        for f in glob.glob('rhapsody-*.*'):
            os.rename(f, os.path.join(folder, f))

predictions already precomputed
predictions already precomputed
predictions already precomputed


In [14]:
# correlation between predictions

# `scipy.stats.stats` is a deprecated private namespace; import from the public one.
from scipy.stats import spearmanr

# Restrict the comparison to variants that (1) have a defined score in the 'chain'
# dimer run and (2) were mapped onto PDB structure 3GOP by the default Rhapsody run.
sel1 = ~np.isnan(rh_dimer['chain'].mixPreds['score'])
sel2 = [x.startswith('3GOP') for x in rh.Uniprot2PDBmap['PDB SAV coords']]
sel = np.logical_and(sel1, sel2)

pred_sets = [('Rhapsody', rh), ('chain', rh_dimer['chain']), 
             ('reduced', rh_dimer['reduced']), ('sliced', rh_dimer['sliced'])]

# Extract the selected scores once per prediction set rather than on every
# cell of the correlation table.
selected_scores = [(s, r.mixPreds['score'][sel]) for (s, r) in pred_sets]

print(" "*11 + "Rhapsody   chain    reduced   sliced")

# Pairwise Spearman correlation table, one row per prediction set.
for (s_i, scores_i) in selected_scores:
    print(f'{s_i:12}', end='')
    for (_, scores_j) in selected_scores:
        rho = spearmanr(scores_i, scores_j)
        # rho[0] is the correlation coefficient (rho[1] would be the p-value)
        print(f'{rho[0]:5.3f}', end=' '*5)
    print()


           Rhapsody   chain    reduced   sliced
Rhapsody    1.000     0.734     0.732     0.738     
chain       0.734     1.000     0.986     0.935     
reduced     0.732     0.986     1.000     0.932     
sliced      0.738     0.935     0.932     1.000     


In [15]:
# comparison with 'true_labels' found in the Integrated Dataset

# short tags used in the table for each predicted pathogenicity class
abbrv = {'?': '?', 'neutral': 'neu', 'prob.neutral': 'p.neu',
         'deleterious': 'del', 'prob.deleterious': 'p.del'}

print(f"SAV                Rhapsody/chain/reduced/sliced   true_labels")

# prediction sets, in the column order announced by the header
compared_runs = [rh, rh_dimer['chain'], rh_dimer['reduced'], rh_dimer['sliced']]

for SAV in known_EGFR_SAVs:
    for i, s in enumerate(rh.SAVcoords['text']):
        if s != SAV['SAV_coords']:
            continue
        # Rhapsody predictions: one 6-character column per prediction set
        preds = "".join(f"{abbrv[r.mixPreds[i]['path. class']]:6} " for r in compared_runs)
        # one asterisk per ClinVar review star; -1 yields no asterisks
        n_stars = SAV['ClinVar_review_star']
        stars = "*" * n_stars if n_stars != -1 else ""
        print(f"{SAV['SAV_coords']:20} {preds} {SAV['true_label']:2}  {SAV['datasets']}{stars}" )

SAV                Rhapsody/chain/reduced/sliced   true_labels
P00533 1034 L R      ?      ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 1048 A V      ?      ?      ?      ?        0  exovar[0]
P00533 1210 A V      ?      ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 266 P R       neu    ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 428 G D       del    ?      ?      ?        1  humsavar[1],clinvar[1]
P00533 521 R K       neu    ?      ?      ?        0  humsavar[0],humvar[0],clinvar[0]**
P00533 674 V I       neu    ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 709 E A       del    neu    p.neu  p.neu   -1  humsavar[0],varibench[1],clinvar[0]*
P00533 709 E G       del    del    del    del     -1  humsavar[0],swissvar[1],clinvar[0]*
P00533 709 E K       del    neu    neu    neu     -1  humsavar[0],varibench[1],clinvar[0]
P00533 719 G A       del    del    del    del     -1  humsavar[0],varibench[1],clinvar[1]*
P00533 719 G C       del    del  

In the following, we plot the average predictions on the PDB structures to highlight the differences between the various approaches.

In [16]:
# Make sure the output folder for the mapped PDB files exists.
os.makedirs('figures', exist_ok=True)

In [17]:
# NOTE(review): wildcard import kept as in the original cell; this cell itself
# only uses parsePDB and writePDB after it.
from prody import *

# For each prediction set, average the predicted pathogenicity probabilities per
# residue and store them in the B-factor column of the matching PDB structure.
for case in ['Rhapsody', 'chain', 'reduced', 'sliced']:
    if case != 'Rhapsody':
        # custom dimer structure, identified as '2gs6' in the Uniprot-to-PDB map
        PDBID = '2gs6'
        r = rh_dimer[case]
        pdb = parsePDB(dimer_pdb_file)
    else:
        # structure 3GOP, as used in the default Rhapsody run
        PDBID = '3GOP'
        r = rh
        pdb = parsePDB('3GOP')

    # residue number -> list of predicted probabilities for that residue
    probs = {}
    mapped_preds = zip(r.Uniprot2PDBmap['PDB SAV coords'],
                       r.mixPreds['path. probability'])
    for U2PDBmap, pred in mapped_preds:
        if U2PDBmap.startswith(PDBID):
            # the third whitespace-separated token is read as the residue number
            res = int(U2PDBmap.split()[2])
            probs.setdefault(res, []).append(pred)

    # one value per atom: the NaN-ignoring residue average, or -1 when the residue
    # has no prediction at all or only NaN predictions
    PDBresids = pdb.getResnums()
    new_betas = np.zeros_like(PDBresids, dtype=float)
    for i, res in enumerate(PDBresids):
        x = np.nanmean(probs[res]) if res in probs else np.nan
        beta = -1 if np.isnan(x) else x
        new_betas[i] = beta
    # write modified PDB
    pdb.setBetas(new_betas)
    f = writePDB(f'figures/{case}-mapped', pdb)

@> PDB file is found in the local folder (/home/lponzoni/.../3gop.pdb.gz).
@> 2385 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.10s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.09s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.08s.
