# One-Cluster-Out: Continuous Windows

## Setup

In [23]:
import logging
import os
import pickle
import sys
import warnings
from collections import defaultdict

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, LeaveOneGroupOut
from tqdm.notebook import tqdm

import bin.baseline_models as bm
import bin.feature_generators as fg
import bin.params as p
import bin.utils as utils

In [24]:
# Developer settings.
# N_JOBS = -1 runs in parallel (production); set it to 1 for sequential
# execution, which is easier to debug during development.
RANDOM_STATE = 2
ERROR_SCORE = 'raise'
VERBOSE = 1
N_JOBS = -1

In [25]:
# Run parameters.
# Radius of the continuous window; forwarded to feature generation.
WINDOW_RADIUS = 1
# Key of the model to pick from the pickled model collection.
MODEL_NAME = 'decisiontree'
# Chain selector handed to the dataset loader.
CHAINS = 'H'
# Where the per-split cross-validation scores are written.
OUTPUT_FILE = f'{p.DATA_DIR}/csv/lco_cont_window_r1_all_H/sim80_decisiontree_r1_all_H.csv'
# CSV that maps each sequence id to its cluster.
INPUT_CLUSTER_FILE = f'{p.DATA_DIR}/csv/clustered_splits/sim80_all.csv'

In [26]:
# Set up logging for this run; the project helper receives the model name
# and the input/output file paths.
utils.setup_logging(MODEL_NAME, INPUT_CLUSTER_FILE, OUTPUT_FILE)

INFO:papermill:model decisiontree input: ../../data/csv/clustered_splits/sim80_all.csv output: ../../data/csv/lco_cont_window_r1_all_H/sim80_decisiontree_r1_all_H.csv


<Logger papermill (INFO)>

**Load the empty (not yet fitted) pickled model:**

In [27]:
# Unpickle the collection of models, then pick the one keyed by MODEL_NAME.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open(f'{p.DATA_DIR}/pickles/models.p', 'rb') as pickled_models:
    models = pickle.load(pickled_models)

model = models[MODEL_NAME]
model

DecisionTreeRegressor()

In [28]:
# Name of the feature set to generate; it encodes the window radius.
FEATURES = f'lco_cont_window_r{WINDOW_RADIUS}'
# Extra parameters handed to the feature generator.
PARAMS = {'radius': WINDOW_RADIUS}

**Create the directory, named after the experiment, in which the results are stored:**

In [29]:
# Make sure the directory that will hold OUTPUT_FILE exists.
# os.makedirs is used instead of shelling out to `mkdir -p`: it copes with
# spaces and shell metacharacters in the path, works on every platform, and
# is plain Python (no IPython `!` escape needed).
RESULTS_DIR_PATH = os.path.dirname(OUTPUT_FILE)
if RESULTS_DIR_PATH:  # dirname is '' when OUTPUT_FILE has no directory part
    os.makedirs(RESULTS_DIR_PATH, exist_ok=True)

**Load the input data:**

In [30]:
# Read the sequence-to-cluster assignments; the first CSV column is the index.
clusters_df = pd.read_csv(INPUT_CLUSTER_FILE, index_col=0)
# Preview the first two rows.
clusters_df.head(n=2)

Unnamed: 0,sequence_id,cluster
0,12E8:L,2
1,15C8:L,2


In [31]:
# Load the 'train' and 'val' parts of the dataset for the selected chains.
X, Y = utils.load_dataset(['train', 'val'], chains=CHAINS)
# Generate the model features; the cluster table is passed in and a cluster
# frame `c` is returned together with the new X and Y.
X, Y, c = fg.generate(X, Y, c=clusters_df, model_name=MODEL_NAME, 
                      features=FEATURES, params=PARAMS)

before merge with clusters X.shape (2643, 165) Y.shape (2643, 165) c.shape (6572, 2)
after merge with clusters X.shape (2643, 165) Y.shape (2643, 165) c.shape (2643, 2)
lco_cont_window_r1
X.shape (2643, 165) Y.shape (2643, 165)
after drop_nondata_columns: X.shape (2643, 164) Y.shape (2643, 164)
after _add_sequence_end: X.shape (2643, 166) Y.shape (2643, 166)
after window transforms: X_window.shape (433452, 5) Y_window.shape (433452, 1)
c_window.shape (433452, 2)
[FINAL] X.shape (433452, 67) Y.shape (433452, 1)
[FINAL] c.shape (433452, 2)


In [32]:
# Preview the last two rows of the generated feature matrix.
X.tail(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
2641,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.993902
2642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.993902


In [33]:
# Preview the targets: the first two rows when Y offers .head(), otherwise
# (e.g. a NumPy array) the whole object.
Y.head(n=2) if hasattr(Y, 'head') else Y

array([100. , 100. ,  -1. , ...,  -1. , 100. ,  66.9])

In [34]:
# Preview the first two rows of the cluster frame returned by fg.generate.
c.head(n=2)

Unnamed: 0,c_sequence_id,c_cluster
0,12E8:H,2
1,15C8:H,2


---

## Clustered CV

In [35]:
# Wrap utils.avg_deviation as a scikit-learn scorer. Because it is a loss
# (greater_is_better=False), the scorer sign-flips it, so the values reported
# by cross-validation are negated.
loss = make_scorer(utils.avg_deviation, greater_is_better=False)

In [36]:
# Cluster label for each row of X, looked up in `c` by X's index; used as the
# grouping for leave-one-group-out cross-validation.
groups = c.loc[X.index, 'c_cluster']
groups.shape

(433452,)

In [37]:
# Leave-one-group-out splits: every split holds out the rows of one cluster.
# NOTE: split() yields lazily, so this object can be iterated only once.
split = LeaveOneGroupOut().split(X, Y, groups=groups)

In [38]:
# Sanity check: X, Y and groups should all have the same number of rows.
X.shape, Y.shape, groups.shape

((433452, 67), (433452,), (433452,))

In [39]:
if MODEL_NAME == 'kernelridge':
    # A Nystroem kernel approximation in front of the model is supposed to
    # improve kernel ridge performance, although it has not helped much here.
    from sklearn.kernel_approximation import Nystroem
    from sklearn.pipeline import Pipeline

    # The right-hand side is evaluated first, so the pipeline wraps the
    # original model before `model` is rebound to the pipeline.
    model = pipeline = Pipeline(steps=[('n', Nystroem()), ('m', model)])

In [40]:
# Run the clustered cross-validation. Train scores and the fitted estimator
# of every split are kept alongside the test scores.
scores = cross_validate(
    model,
    X,
    Y,
    groups=groups,
    cv=split,
    scoring=loss,
    return_train_score=True,
    return_estimator=True,
    error_score=ERROR_SCORE,
    n_jobs=N_JOBS,
    verbose=VERBOSE,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done 132 out of 132 | elapsed:   48.6s finished


___

## Store results

In [41]:
# One row per cross-validation split, ordered by split number.
scores_df = pd.DataFrame(scores).sort_index()

# The metric is a loss, so scikit-learn reports it negated; flip the sign back
# so the stored scores are positive. Background:
# https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error
for score_column in ('train_score', 'test_score'):
    scores_df[score_column] = -scores_df[score_column]

# Tag every row with its split number and the model that produced it.
scores_df['cv_split_id'] = scores_df.index
scores_df['model'] = MODEL_NAME

In [42]:
# Preview the scores of the first three splits.
scores_df.head(n=3)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score,cv_split_id,model
0,6.550395,0.136296,DecisionTreeRegressor(),4.977038,3.261297,0,decisiontree
1,5.271857,0.939505,DecisionTreeRegressor(),4.689994,3.125527,1,decisiontree
2,9.18591,0.02653,DecisionTreeRegressor(),4.014081,3.308872,2,decisiontree


In [43]:
# Write the per-split scores to OUTPUT_FILE as CSV.
scores_df.to_csv(OUTPUT_FILE)