# One-Cluster-Out: Regional Composite

The idea here is to first predict exposure using the baseline `KNNwholeseqN3` model, and then use that prediction as a feature when predicting the final exposure value with some of the more advanced models.

In [1]:
import logging
import pickle
import sys
from collections import defaultdict
from pathlib import Path

import abnumber
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, LeaveOneGroupOut
from tqdm.notebook import tqdm

import bin.utils as utils

In [82]:
# Developer settings consumed by the cross-validation cell.
# Worker count: -1 = parallel (production); set to 1 for sequential (development).
N_JOBS = -1
VERBOSE = 1            # passed as `verbose` to cross_validate
ERROR_SCORE = 'raise'  # passed as `error_score` to cross_validate
RANDOM_STATE = 2       # seed constant; not referenced by the cells visible in this notebook

In [3]:
# Sanity check: number a single light-chain sequence with abnumber and
# display the species reported by the germline assignment.
seq = 'DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELK'
chain = abnumber.Chain(
    seq,
    scheme='aho',
    cdr_definition='chothia',
    assign_germline=True,
)
chain.species

'mouse'

In [10]:
# Run parameters: which model to evaluate and where to read/write data.
MODEL_NAME = 'decisiontree'  # key into the pickled models dictionary
INPUT_CLUSTER_FILE = '../../data/csv/clustered_splits/sim80.csv'
OUTPUT_FILE = '../../data/csv/lco_regional_simple/sim80_decisiontree.csv'

# Comma-separated list of feature groups to include.
# Allowed entries: region, position, chain, species
FEATURES = 'region,position,chain,species'


In [11]:
# Configure logging for this run; the helper receives the model name and
# the input/output paths.
logger = utils.setup_logging(
    MODEL_NAME,
    INPUT_CLUSTER_FILE,
    OUTPUT_FILE,
)

INFO:papermill:model decisiontree input: ../../data/csv/clustered_splits/sim80.csv output: ../../data/csv/lco_cont_window_4/sim80_decisiontree.csv


In [12]:
# Load the dictionary of pre-configured estimators and pick the one selected
# by MODEL_NAME.
# NOTE: pickle.load can execute arbitrary code - only use a trusted models file.
with open('../../data/pickles/models.p', 'rb') as models_file:
    models = pickle.load(models_file)

model = models[MODEL_NAME]
model

DecisionTreeRegressor()

In [5]:
# Cluster assignment for every sequence (one row per sequence id).
clusters_df = pd.read_csv(INPUT_CLUSTER_FILE, index_col=0)

# X and Y are intentionally NOT read here: the direct reads of
# '../../data/csv/fasta_aho_L.csv' and '../../data/csv/sasa_aligned_L.csv'
# were dead code, because both names are rebound by utils.load_dataset(...)
# in the next cell before anything uses them.

In [6]:
# Basic processing: load the requested dataset splits, then attach the
# cluster assignment (records filtering, join cluster data...).
dataset_splits = ['train', 'val']
X, Y = utils.load_dataset(dataset_splits)
print(f'Load datasets - X: {X.shape}, Y: {Y.shape}')

# `c` holds the cluster table aligned with X and Y; it is joined onto the
# per-residue rows later on.
c, X, Y = utils.include_clusters(clusters_df, X, Y)
print(f'Join cluster datasets - c: {c.shape} X: {X.shape} Y: {Y.shape}')

Load datasets - X: (3414, 156), Y: (3414, 156)
Join cluster datasets - c: (3414, 2) X: (3414, 156) Y: (3414, 156)


In [13]:
# Parse the comma-separated FEATURES parameter into a list of feature names.
# Surrounding whitespace and empty entries are discarded, so values such as
# 'region, position' or a trailing comma do not silently disable a feature
# in the later `in features` checks.
features = [name.strip() for name in FEATURES.split(',') if name.strip()]

In [87]:
# Build one abnumber.Chain per input row (this may take a while).
X_clean = X.copy()
X_clean.index = X_clean['Id']
X_records = X_clean.drop(columns='Id').to_dict(orient='records')
chain_data_pairs = list(zip(X['Id'], X_records))

# chains_dict maps a chain id (e.g. '12E8:L') to its parsed abnumber.Chain
chains_dict = {}
for chain_full_id, chain_dict in tqdm(chain_data_pairs):
    # Concatenate the row's column values and remove the '-' gap characters
    # to recover the raw sequence.
    sequence = ''.join(chain_dict.values()).replace('-', '')
    chain = abnumber.Chain(
        sequence,
        scheme='aho',
        cdr_definition='chothia',
        assign_germline=True,
    )
    chains_dict[chain_full_id] = chain

  0%|          | 0/3414 [00:00<?, ?it/s]

In [85]:
# Spot check: display the parsed chain stored under the id '12E8:L'.
chains_dict['12E8:L']

DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELK
                       ^^^^^^^^^^^               ^^^^^^^                                ^^^^^^^^^          

In [43]:
# Flatten every chain into one row per residue position. X_data collects the
# selected categorical features plus the chain id and residue; Y_data collects
# the matching value looked up in Y by (chain id, position code).
Y_clean = Y.copy()
Y_clean.index = Y_clean['Id']
X_data, Y_data = defaultdict(list), defaultdict(list)

for chain_full_id, chain in tqdm(chains_dict.items()):
    for region, positions in chain.regions.items():
        for position, residue in positions.items():
            chain_type = position.chain_type
            # Position label without the chain-type prefix; also used as the
            # column name in Y.
            pos_code = position.format(chain_type=False)

            # Optional feature groups, switched on through FEATURES.
            if 'region' in features:
                X_data['region'].append(region)
            if 'position' in features:
                X_data['position'].append(pos_code)
            if 'chain' in features:
                X_data['chain'].append(chain_type)
            if 'species' in features:
                X_data['species'].append(chain.species)

            # Always-present columns.
            X_data['chain_full_id'].append(chain_full_id)
            X_data['residue'].append(residue)
            Y_data['sasa'].append(Y_clean.loc[chain_full_id, pos_code])

  0%|          | 0/3414 [00:00<?, ?it/s]

In [67]:
# Turn the collected per-residue lists into DataFrames.
# 'chain_full_id' is kept in X_regional because the cluster join below uses it
# as the merge key.
X_regional = pd.DataFrame(X_data)

# Explicit 0..n-1 index for the target frame.
n_rows = len(Y_data['sasa'])
Y_regional = pd.DataFrame(Y_data, index=range(n_rows))

In [68]:
# Preview the first two per-residue feature rows.
X_regional.head(n=2)

Unnamed: 0,region,position,chain,chain_full_id,residue
0,FR1,1,K,12E8:L,D
1,FR1,2,K,12E8:L,I


In [65]:
# Sanity check: features and targets must have the same number of rows.
X_regional.shape, Y_regional.shape

((368947, 4), (368947, 1))

In [70]:
# Attach the cluster id to every per-residue row by joining on the chain id;
# the resulting cluster column is the grouping used for leave-one-cluster-out CV.
X_regional_c_df = pd.merge(
    X_regional,
    c,
    left_on='chain_full_id',
    right_on='c_sequence_id',
)
groups = X_regional_c_df['c_cluster']

0         2
1         2
2         2
3         2
4         2
         ..
368942    2
368943    2
368944    2
368945    2
368946    2
Name: c_cluster, Length: 368947, dtype: int64

In [61]:
# One-hot encode the categorical feature columns.
#
# Two fixes over the previous `prefix=features + ['residue']` call:
#   * 'chain_full_id' is an identifier, not a feature. It is dropped before
#     encoding; leaving it in made the positional prefix list shorter than the
#     set of object columns, so get_dummies raised a length-mismatch error.
#   * Columns are selected by name via `columns=`. The prefix then defaults to
#     each column's own name, so the labels stay correct whatever order
#     FEATURES lists the features in.
categorical_columns = features + ['residue']
X_regional_oh = pd.get_dummies(
    # errors='ignore' keeps the cell working if the id column was already removed
    X_regional.drop(columns='chain_full_id', errors='ignore'),
    columns=categorical_columns,
)
X_regional_oh.head(n=2)

Unnamed: 0,region_CDR1,region_CDR2,region_CDR3,region_FR1,region_FR2,region_FR3,region_FR4,position_1,position_10,position_100,...,residue_M,residue_N,residue_P,residue_Q,residue_R,residue_S,residue_T,residue_V,residue_W,residue_Y
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Clustered CV

In [77]:
# Leave-one-cluster-out splitter: one fold per distinct value in `groups`.
# NOTE: `split` is a one-shot generator - re-run this cell before reusing it.
cluster_cv = LeaveOneGroupOut()
split = cluster_cv.split(X_regional, Y_regional, groups=groups)

In [78]:
# Sanity check: features, targets and group labels must all have the same length.
X_regional.shape, Y_regional.shape, groups.shape

((368947, 5), (368947, 1), (368947,))

In [81]:
# Wrap the project's average-deviation metric as a scikit-learn scorer.
# greater_is_better=False marks it as a loss, so the scorer reports the
# sign-flipped value.
loss = make_scorer(
    utils.avg_deviation,
    greater_is_better=False,
)

In [None]:
# Leave-one-cluster-out cross-validation of the selected model.
#
# * Features: the one-hot encoded matrix X_regional_oh. The raw X_regional
#   frame contains string columns (including the chain id), which the
#   estimator cannot be fitted on.
# * cv: a fresh LeaveOneGroupOut() splitter driven by `groups`. Passing the
#   one-shot `split` generator made this cell fail on re-run, because the
#   generator is exhausted after the first use.
# * Scores come back negated (loss scorer); they are flipped back when the
#   results table is built.
scores = cross_validate(
    model,
    X_regional_oh,
    Y_regional,
    groups=groups,
    cv=LeaveOneGroupOut(),
    scoring=loss,
    n_jobs=N_JOBS,
    verbose=VERBOSE,
    return_train_score=True,
    return_estimator=True,
    error_score=ERROR_SCORE,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.


## Store results

In [90]:
# Collect the per-fold results into a table.
# The scorer is a loss, so scikit-learn reports it negated
# (https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error);
# flip the sign back so the stored train/test scores are positive.
scores_df = (
    pd.DataFrame(scores)
    .sort_index()
    .assign(
        train_score=lambda df: -df['train_score'],
        test_score=lambda df: -df['test_score'],
        cv_split_id=lambda df: df.index,  # fold number = row index
        model=MODEL_NAME,
    )
)

NameError: name 'scores' is not defined

In [301]:
# Preview the first three cross-validation folds.
scores_df.head(n=3)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score,cv_split_id,model
0,16.638938,0.157654,DecisionTreeRegressor(),4.165215,3.093821,0,decisiontree
1,3.933675,1.576109,DecisionTreeRegressor(),5.025148,2.858277,1,decisiontree
2,19.15597,0.011132,DecisionTreeRegressor(),5.572649,3.089625,2,decisiontree


In [299]:
# Persist the per-fold scores. Create the destination folder first, so the
# results of the long cross-validation run are not lost to a missing directory.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
scores_df.to_csv(OUTPUT_FILE)