# One-Cluster-Out: Whole Sequence

## Setup

In [1]:
import logging
import os
import pickle
import sys
import warnings
from collections import defaultdict

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_validate, LeaveOneGroupOut

import bin.baseline_models as bm
import bin.feature_generators as fg
import bin.params as p
import bin.utils as utils

In [2]:
# Developer settings.
# N_JOBS is handed to cross_validate as n_jobs: keep -1 for parallel
# (production) runs, switch to 1 for sequential (development) runs.
N_JOBS = -1
# Passed to cross_validate as `verbose`.
VERBOSE = 0
# Passed to cross_validate as `error_score`.
ERROR_SCORE = 'raise'
RANDOM_STATE = 2

In [3]:
# parameters
CHAINS = 'H'
MODEL_NAME = 'randomforest2'
INPUT_CLUSTER_FILE = f'{p.DATA_DIR}/csv/clustered_splits/sim80_all.csv'
# Scratch output location; the file name is derived from MODEL_NAME and CHAINS
# so it cannot drift out of sync with the settings above.
OUTPUT_DIR = f'{p.DATA_DIR}/csv/garbage'
OUTPUT_FILE = f'{OUTPUT_DIR}/sim80_{MODEL_NAME}_{CHAINS}.csv'
# Create the output directory (and any missing parents) without shelling out:
# same effect as `mkdir -p`, but it does not depend on IPython's `!` escape
# or on a POSIX shell being available.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Feature-generation settings; both are forwarded to fg.generate.
FEATURES = 'lco_whole_sequence'
PARAMS = dict(compress=False)

In [5]:
# Load the pickled collection of estimators and pick the one named MODEL_NAME.
# The path is built from p.DATA_DIR, like the CSV paths in the parameters
# cell, instead of repeating a hard-coded relative path.
# NOTE: pickle.load can execute arbitrary code; only load pickles that were
# produced by this project.
with open(f'{p.DATA_DIR}/pickles/models.p', 'rb') as models_file:
    models = pickle.load(models_file)
# The lookup does not need the file handle, so it happens after it is closed.
model = models[MODEL_NAME]
model

RandomForestRegressor(n_estimators=15, n_jobs=-1)

In [6]:
# Set up logging for this run; the helper receives the model name and the
# input/output paths (the recorded output shows them logged once at INFO level).
logger = utils.setup_logging(MODEL_NAME, INPUT_CLUSTER_FILE, OUTPUT_FILE)

INFO:papermill:model randomforest2 input: ../../data/csv/clustered_splits/sim80_all.csv output: ../../data/csv/garbage/sim80_randomforest2_H.csv


In [7]:
# Read the sequence -> cluster assignment table; the first CSV column is used
# as the row index.
clusters_df = pd.read_csv(INPUT_CLUSTER_FILE, index_col=0)
# Preview the first two rows.
clusters_df.head(n=2)

Unnamed: 0,sequence_id,cluster
0,12E8:L,2
1,15C8:L,2


In [8]:
# Sizes of the three most populated clusters (value_counts sorts descending).
clusters_df['cluster'].value_counts().head(3)

2     4110
1      932
16     634
Name: cluster, dtype: int64

In [10]:
# Load the 'train' and 'val' splits for the selected chain(s).
X, Y = utils.load_dataset(['train', 'val'], chains=CHAINS)
# Build the model inputs for the chosen feature set. The recorded output shows
# the cluster table being merged down to the rows present in X and X being
# one-hot encoded; `c` is the cluster table that comes back from that step.
X, Y, c = fg.generate(X, Y, c=clusters_df, model_name=MODEL_NAME,
                      features=FEATURES, params=PARAMS)

before merge with clusters X.shape (2643, 165) Y.shape (2643, 165) c.shape (6572, 2)
after merge with clusters X.shape (2643, 165) Y.shape (2643, 165) c.shape (2643, 2)
whole sequence
X.shape (2643, 165) Y.shape (2643, 165)
after non-data column drop: X.shape (2643, 164) Y.shape (2643, 164)
after one-hot encode: X.shape (2643, 3608) Y.shape (2643, 164)
[FINAL] X.shape (2643, 3608) Y.shape (2643, 164)
[FINAL] c.shape (2643, 2)


In [11]:
# Preview the first two rows of the generated feature matrix.
X.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3598,3599,3600,3601,3602,3603,3604,3605,3606,3607
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Preview the targets; fall back to showing Y as-is when it has no .head()
# (presumably when the feature generator returns a non-DataFrame — confirm).
Y.head(n=2) if hasattr(Y, 'head') else Y

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,100.0,36.0,50.1,4.4,51.9,3.5,28.8,-1.0,68.3,71.2,...,28.5,13.3,1.4,-1.0,30.0,2.2,19.3,4.3,14.3,75.0
1,100.0,23.3,51.7,4.5,54.8,5.1,27.2,-1.0,65.8,80.3,...,51.2,12.0,2.5,-1.0,46.4,4.6,26.7,7.4,18.3,61.4


In [12]:
# Preview the cluster table returned by fg.generate.
c.head(n=2)

Unnamed: 0,c_sequence_id,c_cluster
0,12E8:H,2
1,15C8:H,2


In [13]:
# Sanity check: clusters, features and targets must all have the same number
# of rows before they are used together in cross-validation.
n_rows = X.shape[0]
assert c.shape[0] == n_rows and n_rows == Y.shape[0]
c.shape, X.shape, Y.shape

((2643, 2), (2643, 3608), (2643, 164))

---

## Clustered CV

In [12]:
# Wrap the project's deviation metric as a scikit-learn scorer.
# greater_is_better=False marks it as a loss, so the scorer reports negated
# values; the "Store the results" cell flips the sign back.
loss = make_scorer(utils.avg_deviation, greater_is_better=False)

In [13]:
# Cluster label for every row of X, looked up by X's index; these labels
# define the groups for the leave-one-group-out splitter.
cluster_labels = c['c_cluster']
groups = cluster_labels.loc[X.index]
groups.shape

(2643,)

In [59]:
# Disabled alternative, kept for reference: scale the inputs and fit one
# estimator per output column.
# pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', model)])
# pipeline = MultiOutputRegressor(estimator=pipeline)
# Currently the loaded model is cross-validated directly.
pipeline = model

In [60]:
# Mark the start of cross-validation in the run log.
logger.info(f' CV start for model {MODEL_NAME}')

INFO:papermill: CV start for model randomforest2


In [61]:
# Leave-one-group-out splitter: one fold per distinct value in `groups`.
splitter = LeaveOneGroupOut()
# NOTE: split() returns a generator, which can be consumed only once; re-run
# this cell before re-running any cell that iterates over `split`.
split = splitter.split(X, Y, groups=groups)

In [62]:
# For the 'svr' model only: keep references to the original objects and hand
# plain NumPy arrays to the estimator instead.
if MODEL_NAME == 'svr':
    X_copy, Y_copy = X, Y
    X, Y = X.to_numpy(), Y.to_numpy()

In [63]:
# Confirm the shapes that go into cross-validation.
X.shape, Y.shape

((2643, 3608), (2643, 164))

In [64]:
# Run leave-one-cluster-out cross-validation: each fold holds out all rows
# that share one value in `groups`. Train scores and the fitted estimator of
# every fold are kept alongside the test scores.
# The splitter object is passed as `cv` rather than the one-shot generator
# returned by splitter.split(): with the same `groups` the folds are
# identical, and this cell can be re-executed without re-creating the
# generator first.
# NOTE(review): the loaded model is displayed with n_jobs=-1 and N_JOBS is
# also -1, so the two levels of parallelism may oversubscribe the CPU — confirm.
scores = cross_validate(pipeline, X, Y, groups=groups,
                        scoring=loss,
                        return_train_score=True, return_estimator=True,
                        n_jobs=N_JOBS, verbose=VERBOSE,
                        error_score=ERROR_SCORE,
                        cv=splitter)

---

## Store the results

In [67]:
# One row per CV fold, ordered by fold number.
scores_df = pd.DataFrame(scores).sort_index()

# The metric is a loss, so scikit-learn reports it negated (see
# https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error).
# Flip the sign back so the stored train/test scores are the actual loss values.
for column in ('train_score', 'test_score'):
    scores_df[column] = -scores_df[column]

# Tag every row with its fold id and the model name, then persist.
# NOTE(review): cross_validate was called with return_estimator=True, so the
# frame presumably also carries an 'estimator' column that is written to the
# CSV as text — confirm this is wanted.
scores_df['cv_split_id'] = scores_df.index
scores_df['model'] = MODEL_NAME
scores_df.to_csv(OUTPUT_FILE)

In [1]:
# Read the stored scores back from disk as a check and preview three rows.
# NOTE(review): the recorded output is a NameError for `pd` at execution
# count 1, so this cell was presumably run in a fresh kernel before the Setup
# imports — re-run the import cell first.
scores_df = pd.read_csv(OUTPUT_FILE, index_col=0)
scores_df.head(n=3)

NameError: name 'pd' is not defined