# Getting started with Rhapsody

## Installation
Follow the installation instructions in the [README file](https://github.com/luponzo86/rhapsody/blob/master/README.md) of the git repository.

## Initial configuration

In [1]:
import rhapsody as rd

In [2]:
# called without arguments; the cell output below shows the returned value,
# a path to the folder currently configured for Rhapsody
rd.pathRhapsodyFolder()

'/home/lponzoni/Downloads/temporary_folder_for_Rhapsody'

In [3]:
# called without arguments; the cell output below shows the returned value,
# a path to the folder currently configured for EVmutation data
rd.pathEVmutationFolder()

'/home/lponzoni/Data/025-EVmutation/mutation_effects'

## Training of default classifiers

In [4]:
import os, tarfile, glob, pickle
import numpy as np
import prody as pd

In [5]:
# scratch folder for everything this notebook extracts or generates;
# exist_ok makes the cell safe to re-run without a separate existence check
os.makedirs('local', exist_ok=True)

In [6]:
# extract the training dataset archive into ./local
# the context manager closes the archive even if extraction fails midway
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on the archive distributed with this repository
with tarfile.open('../paper_Supplementary_Info/00-Training_Dataset/data.tar.gz', "r:gz") as tar:
    tar.extractall(path='local')

In [7]:
# load the precomputed features from the data extracted into ./local
# NOTE(review): ID is later indexed with a list of field names (ID[featset]),
# so it is presumably a NumPy structured array — confirm
ID = np.load('local/data/precomputed_features-ID_opt.npy')

In [8]:
# Feature sets used by the three classifier versions, assembled from shared
# building blocks so that the differences between versions are explicit.

# features common to all three versions
_core_feats = [
    'wt_PSIC', 'Delta_PSIC', 'SASA',
    'ANM_MSF-chain', 'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
    'stiffness-chain',
]
# features that the reduced ('v2_noPfam') classifier leaves out
_pfam_feats = ['entropy', 'ranked_MI']

featsets = {
    # full classifier
    'v2': _core_feats + _pfam_feats + ['BLOSUM'],
    # reduced classifier
    'v2_noPfam': _core_feats + ['BLOSUM'],
    # full classifier + EVmutation epistatic score
    'v2_EVmut': _core_feats + _pfam_feats + ['BLOSUM', 'EVmut-DeltaE_epist'],
}

In [9]:
# folder for training outputs; makedirs also creates 'local' if it is
# missing, and exist_ok makes the cell safe to re-run
os.makedirs('local/results', exist_ok=True)

In [10]:
# Train one classifier per feature set and store the cross-validation
# summaries in a pickle. The pickle doubles as a "done" marker: when it
# already exists, training is skipped entirely.
summaries_pickle = 'local/results/RF_training_summaries.pkl'
training_log = 'local/results/RF_training.log'

if os.path.isfile(summaries_pickle):
    print('A pickle containing precomputed results has been found.')
    print('Please delete it if you wish to run the training procedure again.')
else:
    pd.LOGGER.start(training_log)
    try:
        RF_training_summaries = {}

        for version in ['v2', 'v2_noPfam', 'v2_EVmut']:

            # 'SAV_coords' and 'true_label' are always selected alongside
            # the version-specific features
            featset = ['SAV_coords', 'true_label'] + featsets[version]

            pd.LOGGER.info(f'VERSION: {version}')

            # create folder; tolerate leftovers of a previous, interrupted
            # run (the summaries pickle is only written at the very end)
            folder = f'local/results/RF_training-{version}'
            os.makedirs(folder, exist_ok=True)

            # run training procedure
            output_dict = rd.trainRFclassifier(ID[featset])
            RF_training_summaries[version] = output_dict['CV summary']

            # move trained classifier and figures from the working directory
            # into the folder; os.replace overwrites stale files on every OS
            for file in glob.glob('*png') + ['trained_classifier.pkl']:
                os.replace(file, os.path.join(folder, file))

            pd.LOGGER.info('')

        # store all cross-validation results into a pickle
        with open(summaries_pickle, 'wb') as f:
            pickle.dump(RF_training_summaries, f)
    finally:
        # stop logging to file even if training failed
        pd.LOGGER.close(training_log)

A pickle containing precomputed results has been found.
Please delete it if you wish to run the training procedure again.


## Testing

In [11]:
# folder for prediction outputs; makedirs also creates missing parent
# folders, and exist_ok makes the cell safe to re-run
os.makedirs('local/results/predictions', exist_ok=True)

In [12]:
test_SAVs = ['O00294 496 A T', 'O00238 31 R H']

# Run inside the predictions folder: the log below shows Rhapsody writing
# 'rhapsody-log.txt' and pickles by relative name, presumably in the current
# working directory. The classifier paths are relative to that folder.
notebook_dir = os.getcwd()
os.chdir('local/results/predictions')
try:
    rh = rd.rhapsody(test_SAVs, '../RF_training-v2/trained_classifier.pkl',
                     aux_classifier='../RF_training-v2_noPfam/trained_classifier.pkl')
finally:
    # always return to the starting directory, otherwise a failure here
    # would break every relative path used by later (or re-run) cells
    os.chdir(notebook_dir)

@> Logging into file: rhapsody-log.txt
@> Logging started at 2019-06-13 13:58:04.520839
@> Imported feature set: 'wt_PSIC'
@>                       'Delta_PSIC'
@>                       'SASA'
@>                       'ANM_MSF-chain'
@>                       'ANM_effectiveness-chain'
@>                       'ANM_sensitivity-chain'
@>                       'stiffness-chain'
@>                       'entropy'
@>                       'ranked_MI'
@>                       'BLOSUM'
@> Submitting query to PolyPhen-2...
@> Query to PolyPhen-2 started in 12.8s.
@> PolyPhen-2 is running...
@> Query to PolyPhen-2 completed in 1.3s.
@> PolyPhen-2's output parsed.
@> Sequence-conservation features have been retrieved from PolyPhen-2's output.
@> Mapping SAVs to PDB structures...
@> [1/2] Mapping SAV 'O00238 31 R H' to PDB...
@> Pickle 'UniprotMap-O00238.pkl' recovered.
@> [2/2] Mapping SAV 'O00294 496 A T' to PDB...
@> Pickle 'UniprotMap-O00238.pkl' saved.
@> Pickle 'UniprotMap-O00294.pkl' recove

In [13]:
# display the predictions; per the dtype in the output below, each record has
# the fields 'score', 'path. probability', 'path. class' and 'training info'
rh.predictions

array([(0.102, 0.05459946, 'neutral', 'known_neu'),
       (  nan,        nan, '?', 'new')],
      dtype=[('score', '<f4'), ('path. probability', '<f4'), ('path. class', '<U12'), ('training info', '<U12')])