# One-Cluster-Out: Regional Simple

In [1]:
import logging
import os
import pickle
import sys 
from collections import defaultdict

import abnumber
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, LeaveOneGroupOut
from tqdm.notebook import tqdm 

import bin.baseline_models as bm
import bin.feature_generators as fg
import bin.params as p
import bin.utils as utils

In [9]:
# Developer settings.
# Seed value; no cell visible in this notebook references it.
RANDOM_STATE = 2
# Forwarded to cross_validate as error_score.
ERROR_SCORE = 'raise'
# Forwarded to cross_validate as verbose.
VERBOSE = 0
# Forwarded to cross_validate as n_jobs: -1 runs in parallel (production);
# switch to 1 for sequential runs during development.
N_JOBS = -1

In [10]:
# Feature groups for this experiment. Possible values: region, position,
# chain, species (pick several by separating them with commas, e.g.
# 'region,position,chain,species').
FEATURES = 'region'

# Key used to pick the estimator out of the pickled models collection.
MODEL_NAME = 'decisiontree'
# Chain selection passed to utils.load_dataset.
CHAINS = 'H'

# CSV with the cluster assignment of each sequence.
INPUT_CLUSTER_FILE = f'{p.DATA_DIR}/csv/clustered_splits/sim80_{p.CLUSTERING_CHAINS}.csv'
# CSV that receives the cross-validation scores.
# NOTE(review): the file name hard-codes "region" and "randomforest2" while
# MODEL_NAME is 'decisiontree' and FEATURES is configurable - confirm the
# intended name before relying on this file.
OUTPUT_FILE = f'{p.DATA_DIR}/csv/garbage/lco_regional_simple_region_sim80_randomforest2_{CHAINS}.csv'

In [11]:
# Extra parameters handed to fg.generate; none are set for this experiment.
PARAMS = dict()
# Experiment label; fg.generate receives it as its `features` argument.
EXPERIMENT_NAME = f'lco_regional_simple_{FEATURES}'

In [12]:
# Create the logger through the project helper, passing the model name and
# the input/output paths of this run.
logger = utils.setup_logging(MODEL_NAME, INPUT_CLUSTER_FILE, OUTPUT_FILE)

INFO:papermill:model decisiontree input: ../../data/csv/clustered_splits/sim80_all.csv output: ../../data/csv/garbage/lco_regional_simple_region_sim80_randomforest2_H.csv


In [13]:
# Load the pickled collection of estimators and select the one named by
# MODEL_NAME. The path is built from p.DATA_DIR, like the other data paths in
# this notebook, so it keeps working when the data directory is relocated.
# SECURITY: pickle.load can execute arbitrary code contained in the file -
# only load pickles produced by this project.
with open(f'{p.DATA_DIR}/pickles/models.p', 'rb') as models_file:
    models = pickle.load(models_file)
# The lookup does not need the open file, so it runs after the handle is closed.
model = models[MODEL_NAME]
model

DecisionTreeRegressor()

In [14]:
# Read the cluster assignment table; the first CSV column becomes the index.
clusters_df = pd.read_csv(INPUT_CLUSTER_FILE, index_col=0)

In [15]:
# Load the train and validation splits for the selected chains, then run the
# feature generator, which returns the model inputs X, the targets Y and the
# matching cluster table c.
X, Y = utils.load_dataset(['train', 'val'], chains=CHAINS)
X, Y, c = fg.generate(
    X,
    Y,
    c=clusters_df,
    model_name=MODEL_NAME,
    features=EXPERIMENT_NAME,
    params=PARAMS,
)

c.shape (6572, 2) X.shape (2643, 165) Y.shape (2643, 165)
after merge with clusters - c.shape (2643, 2) X.shape (2643, 165) Y.shape (2643, 165)
X.shape (2643, 165) Y.shape (2643, 165)
after transformation X.shape (2643, 165) Y.shape (2643, 165) c.shape (2643, 2)
[FINAL] after one-hot encode X.shape (318124, 27) Y.shape (318124, 1)
[FINAL] c.shape (318124, 2)


In [16]:
# Preview the first two rows of the feature table.
X.head(n=2)

Unnamed: 0,region_CDR1,region_CDR2,region_CDR3,region_FR1,region_FR2,region_FR3,region_FR4,residue_A,residue_C,residue_D,...,residue_M,residue_N,residue_P,residue_Q,residue_R,residue_S,residue_T,residue_V,residue_W,residue_Y
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# Preview the targets: the first two rows when Y offers .head(), otherwise
# the object itself.
Y.head(n=2) if hasattr(Y, 'head') else Y

array([100. ,  36. ,  50.1, ...,   0.6,   2.1,  66.9])

In [18]:
# Preview the first two rows of the cluster table.
c.head(n=2)

Unnamed: 0,c_sequence_id,c_cluster
0,12E8:H,2
1,12E8:H,2


---

**The following cell does not need to be run:**

---

## Clustered CV

In [19]:
# Cluster label for every row of X, looked up in c through X's index; these
# labels are the groups used by the cross-validation below.
groups = c['c_cluster'].loc[X.index]
groups.shape

(318124,)

In [20]:
# Leave-one-group-out cross-validation: each fold holds out one cluster.
# Keep the splitter object itself rather than the generator returned by
# .split(): a generator can be consumed only once, so re-running the
# cross-validation cell without re-running this one would find no splits left.
# cross_validate receives `groups` separately and derives the same folds.
split = LeaveOneGroupOut()

In [21]:
# Wrap utils.avg_deviation as a scikit-learn scorer. greater_is_better=False
# declares it a loss, so the scorer reports the sign-flipped value.
loss = make_scorer(utils.avg_deviation, greater_is_better=False)

In [22]:
# Fit and score the model once per cross-validation split, keeping the train
# scores and the fitted estimator of every split.
scores = cross_validate(
    model,
    X,
    Y,
    groups=groups,
    cv=split,
    scoring=loss,
    n_jobs=N_JOBS,
    verbose=VERBOSE,
    error_score=ERROR_SCORE,
    return_train_score=True,
    return_estimator=True,
)

___

## Store results

In [26]:
# The scorer is a loss, so scikit-learn reports it negated; flipping the sign
# back stores the loss itself instead of all-negative scores (the recommended
# way to handle loss metrics):
# https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error
scores_df = (
    pd.DataFrame(scores)
    .sort_index()
    .assign(
        train_score=lambda frame: -frame['train_score'],
        test_score=lambda frame: -frame['test_score'],
        # Row index of the sorted frame doubles as the split identifier.
        cv_split_id=lambda frame: frame.index,
        model=MODEL_NAME,
    )
)

In [27]:
# Preview the first three rows of the score table.
scores_df.head(n=3)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score,cv_split_id,model
0,2.635499,0.023269,DecisionTreeRegressor(),14.640325,14.435588,0,decisiontree
1,1.161194,0.121765,DecisionTreeRegressor(),14.47353,14.599929,1,decisiontree
2,3.087378,0.007994,DecisionTreeRegressor(),15.013484,14.459733,2,decisiontree


In [25]:
# Persist the per-split scores. Create the target directory first so the
# write cannot fail on a missing folder after the cross-validation has run.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:  # an empty dirname means the current directory; nothing to create
    os.makedirs(output_dir, exist_ok=True)
scores_df.to_csv(OUTPUT_FILE)