# Analysis of EGFR mutants

In [1]:
import sys, os, pickle, csv, glob
import numpy as np

In [2]:
import prody

In [3]:
# If needed, insert here local path to Rhapsody folder with the command:
# sys.path.insert(0, '/LOCAL_PATH/rhapsody/')
import rhapsody as rhaps

## Integrated Dataset
Let's import the training dataset. We will only consider variants in the Integrated Dataset with at least 1 ClinVar review star, if present, and an associated PDB structure larger than 150 residues, two restrictions that we found to improve prediction accuracy (see tutorial `RF_optimization` for more details).

In [4]:
# Load the NumPy structured array holding the precomputed features of the
# optimized training dataset, then display its field layout.
features_path = '../00-Training_Dataset/local/data/precomputed_features-ID_opt.npy'
ID = np.load(features_path)
ID.dtype

dtype([('SAV_coords', '<U50'), ('Uniprot2PDB', '<U100'), ('PDB_length', '<i2'), ('true_label', '<i2'), ('ANM_MSF-chain', '<f4'), ('ANM_MSF-reduced', '<f4'), ('ANM_MSF-sliced', '<f4'), ('ANM_effectiveness-chain', '<f4'), ('ANM_effectiveness-reduced', '<f4'), ('ANM_effectiveness-sliced', '<f4'), ('ANM_sensitivity-chain', '<f4'), ('ANM_sensitivity-reduced', '<f4'), ('ANM_sensitivity-sliced', '<f4'), ('BLOSUM', '<f4'), ('Delta_PSIC', '<f4'), ('Delta_SASA', '<f4'), ('EVmut-DeltaE_epist', '<f4'), ('EVmut-DeltaE_indep', '<f4'), ('EVmut-mut_aa_freq', '<f4'), ('EVmut-wt_aa_cons', '<f4'), ('GNM_MSF-chain', '<f4'), ('GNM_MSF-reduced', '<f4'), ('GNM_MSF-sliced', '<f4'), ('GNM_effectiveness-chain', '<f4'), ('GNM_effectiveness-reduced', '<f4'), ('GNM_effectiveness-sliced', '<f4'), ('GNM_sensitivity-chain', '<f4'), ('GNM_sensitivity-reduced', '<f4'), ('GNM_sensitivity-sliced', '<f4'), ('SASA', '<f4'), ('SASA_in_complex', '<f4'), ('entropy', '<f4'), ('ranked_MI', '<f4'), ('stiffness-chain', '<f4'), ('

## Re-training of unbiased classifier

A few EGFR mutations are found in the Integrated Dataset used for training. In order to get completely unbiased predictions, we will retrain a classifier by excluding those variants from the training dataset.

**NB:** The Uniprot ID for gene EGFR is `P00533`.

In [5]:
# Boolean mask selecting the rows whose SAV coordinates start with the EGFR
# Uniprot accession number (P00533).
is_EGFR = np.array([coords.startswith('P00533') for coords in ID['SAV_coords']],
                   dtype=bool)
known_EGFR_SAVs = ID[is_EGFR]

# Split the EGFR variants by their label in the training set
# (1 = deleterious, 0 = neutral).
EGFR_labels = known_EGFR_SAVs['true_label']
known_del_SAVs = known_EGFR_SAVs[EGFR_labels == 1]
known_neu_SAVs = known_EGFR_SAVs[EGFR_labels == 0]

# One header line followed by the list of SAV coordinates for each class.
print(f'{len(known_del_SAVs)} known deleterious EGFR SAVs:',
      known_del_SAVs['SAV_coords'], sep='\n')
print(f'\n{len(known_neu_SAVs)} known neutral EGFR SAVs:',
      known_neu_SAVs['SAV_coords'], sep='\n')

3 known deleterious EGFR SAVs:
['P00533 748 R T' 'P00533 787 Q R' 'P00533 873 G E']

5 known neutral EGFR SAVs:
['P00533 266 P R' 'P00533 521 R K' 'P00533 962 R G' 'P00533 98 R Q'
 'P00533 988 H P']


Let's exclude these variants from the dataset. 

In [6]:
# Keep only the training variants that do NOT belong to EGFR (Uniprot P00533),
# then show how many are left.
is_not_EGFR = np.array([not coords.startswith('P00533') for coords in ID['SAV_coords']],
                       dtype=bool)
ID_subset = ID[is_not_EGFR]
len(ID_subset)

20353

We use this subset of the Integrated Dataset to train unbiased versions of Rhapsody classifiers:

In [7]:
# Feature set of the full classifier, in the order used for training.
_full_featset = [
    'wt_PSIC', 'Delta_PSIC', 'SASA',
    'ANM_MSF-chain', 'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
    'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM',
]

# Features left out of the reduced classifier.
_omitted_in_redx = ('entropy', 'ranked_MI')

# Classifier name -> list of feature names; the reduced set is the full set
# minus the omitted features, with the original ordering preserved.
featsets = {
    'full_clsf': _full_featset,
    'redx_clsf': [feat for feat in _full_featset if feat not in _omitted_in_redx],
}

In [8]:
# Make sure the local output folder exists (no-op when it is already there).
os.makedirs('local', exist_ok=True)

In [9]:
from prody import LOGGER

# Train one Random Forest classifier per feature set on the EGFR-free subset of
# the Integrated Dataset. The presence of 'local/results' is used as the marker
# that training has already been done, in which case nothing is recomputed.
if os.path.isdir('local/results'):
    print('Classifiers already trained.')
else:
    os.mkdir('local/results/')

    LOGGER.start('local/results/RF_training.log')
    try:
        # classifier name -> cross-validation summary returned by the training
        summaries = {}

        for clsf_version, featset in featsets.items():
            folder = f'local/results/{clsf_version}'
            os.mkdir(folder)

            # training needs the SAV identifiers and labels next to the features
            f = ['SAV_coords', 'true_label'] + featset
            output_dict = rhaps.trainRFclassifier(ID_subset[f])
            summaries[clsf_version] = output_dict['CV summary']

            # move the training outputs found in the working directory (figures
            # and pickled classifier, presumably written there by
            # trainRFclassifier) into the classifier's own folder
            for file in glob.glob('*png') + ['trained_classifier.pkl',]:
                os.rename(file, os.path.join(folder, file))

            LOGGER.info('')

        # store training summary into pickle; the context manager guarantees
        # the file is flushed and closed
        with open('local/results/summaries.pkl', 'wb') as summaries_file:
            pickle.dump(summaries, summaries_file)
    finally:
        # always release the log file, even if training raised
        LOGGER.close('local/results/RF_training.log')

Classifiers already trained.


## Rhapsody predictions
We perform a complete scanning of all amino acid variants (*in silico* saturation mutagenesis).

**NB:** PolyPhen-2 predictions are precomputed and saved in `data/pph2`

In [10]:
# Saturation mutagenesis predictions for EGFR. Results are cached on disk: if
# the predictions folder exists, the pickled Rhapsody object is reloaded
# instead of being recomputed.
if os.path.isdir('local/results/predictions'):
    print('Precomputed predictions found.')
    # NOTE: pickle can execute arbitrary code on load; only load files
    # generated locally by this notebook.
    with open('local/results/predictions/rhapsody-pickle.pkl', 'rb') as pickle_file:
        rh = pickle.load(pickle_file)
else:
    os.mkdir('local/results/predictions')
    # run rhapsody on the precomputed PolyPhen-2 output, using the unbiased
    # full classifier and the reduced one as auxiliary classifier
    rh = rhaps.rhapsody('data/pph2/pph2-full.txt', 'local/results/full_clsf/trained_classifier.pkl',
                        aux_classifier='local/results/redx_clsf/trained_classifier.pkl', input_type='PP2')
    # move the output files found in the working directory into the cache folder
    for f in glob.glob('rhapsody-*.*'):
        os.rename(f, os.path.join('local/results/predictions', f))

Precomputed predictions found.


## Effect of dimerization on predictions: the kinase domain
We will compare predictions obtained by using a custom PDB structure with those obtained automatically by Rhapsody. 
In particular we will consider the *biological assembly* for the EGFR kinase domain, which is an asymmetric dimer that can be found in the PDB database. 
We will also consider different ways of including environmental effects (*reduced* vs *sliced* models).

In [11]:
# Path to the custom PDB file of the EGFR kinase domain dimer (presumably the
# biological assembly built from PDB entry 2GS6); it is passed to Rhapsody as
# `custom_PDB` and parsed again when mapping predictions onto the structure.
dimer_pdb_file = 'data/2gs6-dimer.pdb'

In [12]:
# Predictions computed on the custom dimer structure, one Rhapsody object per
# environment model. Each run is cached in its own folder and reloaded from
# the pickle when that folder already exists.
rh_dimer = {}

for env_model in ['chain', 'reduced', 'sliced']:
    folder = f'local/results/predictions_dimer-{env_model}'
    if os.path.isdir(folder):
        print('Precomputed predictions found.')
        # NOTE: pickle can execute arbitrary code on load; only load files
        # generated locally by this notebook.
        with open(os.path.join(folder, 'rhapsody-pickle.pkl'), 'rb') as pickle_file:
            rh_dimer[env_model] = pickle.load(pickle_file)
    else:
        os.mkdir(folder)
        # run rhapsody with the custom PDB and the requested environment model
        _r = rhaps.rhapsody('data/pph2/pph2-full.txt', 'local/results/full_clsf/trained_classifier.pkl',
                            aux_classifier='local/results/redx_clsf/trained_classifier.pkl', input_type='PP2',
                            custom_PDB=dimer_pdb_file, force_env=env_model)
        rh_dimer[env_model] = _r
        # move the output files found in the working directory into the cache folder
        for f in glob.glob('rhapsody-*.*'):
            os.rename(f, os.path.join(folder, f))

Precomputed predictions found.
Precomputed predictions found.
Precomputed predictions found.


In [13]:
# Spearman correlation between the scores of the four prediction sets.

# `scipy.stats.stats` is a deprecated private namespace; the public location
# of `spearmanr` is `scipy.stats`.
from scipy.stats import spearmanr

# Restrict the comparison to variants that have a score in the 'chain' dimer
# predictions and that the default Rhapsody run mapped onto PDB entry 3GOP.
sel1 = ~np.isnan(rh_dimer['chain'].mixPreds['score'])
sel2 = [x.startswith('3GOP') for x in rh.Uniprot2PDBmap['PDB SAV coords']]
sel = np.logical_and(sel1, sel2)

pred_sets = [('Rhapsody', rh), ('chain', rh_dimer['chain']),
             ('reduced', rh_dimer['reduced']), ('sliced', rh_dimer['sliced'])]

# table header
print(" "*11 + "Rhapsody   chain    reduced   sliced")

# one row per prediction set, one column per prediction set
for (s_i, r_i) in pred_sets:
    print(f'{s_i:12}', end='')
    for (s_j, r_j) in pred_sets:
        rho = spearmanr(r_i.mixPreds['score'][sel], r_j.mixPreds['score'][sel])
        # rho[0] is the correlation coefficient (rho[1] is the p-value)
        print(f'{rho[0]:5.3f}', end=' '*5)
    print()

           Rhapsody   chain    reduced   sliced
Rhapsody    1.000     0.731     0.729     0.735     
chain       0.731     1.000     0.986     0.937     
reduced     0.729     0.986     1.000     0.933     
sliced      0.735     0.937     0.933     1.000     


## Predictions of known SAVs in the Integrated Dataset
The original Integrated Dataset contains a few EGFR variants with clinical interpretations.

In [14]:
# Load the table of SAVs of the Integrated Dataset and keep only the EGFR
# (Uniprot P00533) entries; the last line displays how many there are.
ID_infos = np.load('../00-Training_Dataset/local/data/Integrated_Dataset-SAVs.npy')
EGFR_rows = [coords.startswith('P00533') for coords in ID_infos['SAV_coords']]
known_SAVs = ID_infos[EGFR_rows]
len(known_SAVs)

32

In [15]:
# Print, for every EGFR variant present in the Integrated Dataset, the class
# predicted by each of the four approaches next to its 'true_label'.

# short labels for the predicted pathogenicity classes
abbrv = {
    '?': '?',
    'neutral': 'neu',
    'prob.neutral': 'p.neu',
    'deleterious': 'del',
    'prob.delet.': 'p.del',
}

print("SAV                Rhapsody/chain/reduced/sliced   true_labels")

# prediction sets, in the same order as the header columns
predictors = (rh, rh_dimer['chain'], rh_dimer['reduced'], rh_dimer['sliced'])

for idx, coords in enumerate(rh.SAVcoords['text']):
    preds = ""
    for SAV in known_SAVs:
        if coords != SAV['SAV_coords']:
            continue
        # predicted class of this variant for each approach, 6 characters wide
        preds += "".join(f"{abbrv[p.mixPreds[idx]['path. class']]:6} "
                         for p in predictors)
        # one asterisk per ClinVar review star; -1 is treated as "no stars"
        n_stars = SAV['ClinVar_review_star']
        stars = "*" * n_stars if n_stars != -1 else ""
        print(f"{SAV['SAV_coords']:20} {preds} {SAV['true_label']:2}  {SAV['datasets']}{stars}" )

SAV                Rhapsody/chain/reduced/sliced   true_labels
P00533 98 R Q        neu    ?      ?      ?        0  humsavar[0]
P00533 266 P R       neu    ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 428 G D       del    ?      ?      ?        1  humsavar[1],clinvar[1]
P00533 521 R K       neu    ?      ?      ?        0  humsavar[0],humvar[0],clinvar[0]**
P00533 674 V I       neu    ?      ?      ?        0  humsavar[0],swissvar[0]
P00533 709 E A       del    neu    p.neu  p.neu   -1  humsavar[0],varibench[1],clinvar[0]*
P00533 709 E G       del    del    del    del     -1  humsavar[0],swissvar[1],clinvar[0]*
P00533 709 E K       p.del  neu    neu    neu     -1  humsavar[0],varibench[1],clinvar[0]
P00533 719 G A       del    del    del    del     -1  humsavar[0],varibench[1],clinvar[1]*
P00533 719 G C       del    del    del    del     -1  humsavar[0],swissvar[1],varibench[1],clinvar[1]
P00533 719 G D       del    del    del    del     -1  humsavar[0],varibench[1]
P00533

## Figures
We can plot the average predictions on the PDB structures to highlight the differences between the various approaches.

In [16]:
# Make sure the figures folder exists. `makedirs` also creates the parent
# 'local' folder when it is missing, so this cell no longer fails if it is run
# before the folder has been created elsewhere.
os.makedirs('local/figures', exist_ok=True)

In [17]:
# For each approach, write a copy of the PDB structure whose B-factor column
# holds the residue-averaged pathogenicity probability (-1 where unavailable).
for case in ['Rhapsody', 'chain', 'reduced', 'sliced']:
    # choose the prediction set and the structure it refers to
    if case != 'Rhapsody':
        PDBID, r, pdb = '2gs6', rh_dimer[case], prody.parsePDB(dimer_pdb_file)
    else:
        PDBID, r, pdb = '3GOP', rh, prody.parsePDB('3GOP')

    # residue number -> probabilities of all variants mapped onto that residue
    probs = {}
    mapped_preds = zip(r.Uniprot2PDBmap['PDB SAV coords'],
                       r.mixPreds['path. probability'])
    for U2PDBmap, pred in mapped_preds:
        # only variants mapped onto the selected PDB entry are considered
        if U2PDBmap.startswith(PDBID):
            # third whitespace-separated field is read as the residue number
            res = int(U2PDBmap.split()[2])
            probs.setdefault(res, []).append(pred)

    # one value per atom: -1 by default, replaced by the NaN-ignoring average
    # of its residue whenever that average is defined
    PDBresids = pdb.getResnums()
    new_betas = np.full(PDBresids.shape, -1, dtype=float)
    for i, res in enumerate(PDBresids):
        if res not in probs:
            continue
        x = np.nanmean(probs[res])
        if not np.isnan(x):
            new_betas[i] = x

    # write modified PDB
    pdb.setBetas(new_betas)
    f = prody.writePDB(f'local/figures/{case}-mapped', pdb)

@> PDB file is found in the local folder (/home/lponzoni/.../3gop.pdb.gz).
@> 2385 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.09s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.09s.
@> 9984 atoms and 1 coordinate set(s) were parsed in 0.09s.
