# 3a. Train on the TRAIN+VAL sets

## Setup 

In [202]:
import sys; sys.path.append('../..')
import os
import pickle
import time

import numpy as np
import pandas as pd

import bin.feature_generators as fg
import bin.params as p
import bin.utils as u

**Papermill parameters:**

In [203]:
# Key used to pick the model out of the pickled models collection;
# it also becomes part of the saved model's file name.
MODEL_NAME = "BLknnwholeseqn10"
# Feature-representation identifier handed to fg.generate(). Its last
# '_'-separated token is used as the `chains` value when loading the dataset.
FEATURES = "lco_whole_sequence_all_H"

In [204]:
# Extra options forwarded to fg.generate() as its `params` argument.
# NOTE(review): the effect of 'compress' is defined inside the feature generator
# and is not visible from this notebook — confirm there before changing it.
PARAMS = {'compress': False}

In [205]:
# Directory that receives the fitted model pickle.
TRAINED_MODELS_DIR_PATH = f'{p.DATA_DIR}/pickles/trained-test-models'
# Pickle file from which the not-yet-fitted models are loaded.
TRAINED_MODELS_LIST_FILE_PATH = f'{p.DATA_DIR}/pickles/models.p'

**Create a directory to store trained models in:**

In [206]:
# Report the locations this notebook reads from and writes to.
print('trained models dir:', TRAINED_MODELS_DIR_PATH)
# NOTE: the label below says "dir", but the value is the path of a pickle file.
print('dir containing list of models to be trained:', TRAINED_MODELS_LIST_FILE_PATH)

# Create the output directory (and any missing parents) in pure Python instead of
# shelling out to `mkdir -p`: this is portable, safe for paths containing spaces or
# shell metacharacters, and raises OSError on failure instead of failing silently.
os.makedirs(TRAINED_MODELS_DIR_PATH, exist_ok=True)

trained models dir: ../../data/pickles/trained-test-models
dir containing list of models to be trained: ../../data/pickles/models.p


**Load the untrained model:**

In [207]:
# Read the pickled collection of models; the file is only needed for the load itself.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted pickle files.
with open(TRAINED_MODELS_LIST_FILE_PATH, 'rb') as pickled_models:
    models = pickle.load(pickled_models)

# Select the model this run is going to train and show its configuration.
model = models[MODEL_NAME]
print(model)

KNNWholeSequence(n_neighbors=10)


**Load the train and validation datasets:**

In [208]:
# The chains value is whatever follows the last underscore in FEATURES.
chains = FEATURES.rsplit('_', 1)[-1]

# Load the 'train' and 'val' splits together for the selected chains.
X, Y = u.load_dataset(['train', 'val'], chains=chains)

load_dataset: ['train', 'val'], metadata file path: ../../data/csv/metadata/metadata_H.csv, chains: H, shape: (3286, 19)
load_dataset: ['train', 'val'], X file path: ../../data/csv/fasta_aligned_cleaned/fasta_aho_H.csv, chains: H, shape: (3286, 165)
load_dataset: ['train', 'val'], Y file path: ../../data/csv/sasa_aligned/sasa_H.csv, chains: H, shape: (3286, 165)


**Transform the data according to the chosen feature representation:** (this may take a while)

In [209]:
# Convert the loaded data into the representation selected by FEATURES.
# The third value returned by fg.generate() is not used here and is discarded.
# NOTE: X and Y are rebound in place, so re-running this cell on its own feeds
# already-transformed data back in — re-run the dataset-loading cell first.
X, Y, _ = fg.generate(
    X,
    Y,
    c=None,
    model_name=MODEL_NAME,
    features=FEATURES,
    params=PARAMS,
)
print(f'chains: {chains} | X.shape: {X.shape} | Y.shape: {Y.shape}')

lco_whole_sequence_all_H
X.shape (2643, 165) Y.shape (2643, 165)
after non-data column drop: X.shape (2643, 164) Y.shape (2643, 164)
[NOTE] Skipping one-hot encoding, since this is baseline model
[FINAL] X.shape (2643, 164) Y.shape (2643, 164)
chains: H | X.shape: (2643, 164) | Y.shape: (2643, 164)


In [210]:
# Show only the first row of the transformed inputs as a quick sanity check.
X.head(1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,E,V,Q,L,Q,Q,S,-,G,A,...,Q,G,T,-,L,V,T,V,S,A


In [211]:
# Preview a single row of the targets, whichever container type they arrive in.
if isinstance(Y, pd.DataFrame):
    print('pd.dataFrame')
    print(Y.head(n=1))
elif isinstance(Y, np.ndarray):
    print('numpy array')
    # Show only the first row, mirroring the DataFrame branch, instead of
    # printing the entire array.
    print(Y[:1])
else:
    print('Y is of unknown type')

pd.dataFrame
       1     2     3    4     5    6     7    8     9    10  ...   141   142  \
0  100.0  36.0  50.1  4.4  51.9  3.5  28.8 -1.0  68.3  71.2  ...  28.5  13.3   

   143  143A   144  145   146  147   148   149  
0  1.4  -1.0  30.0  2.2  19.3  4.3  14.3  75.0  

[1 rows x 164 columns]


---

## Train the model

**This may take a while:**

In [212]:
print('model:', model)

# Measure the fit duration with perf_counter(): it is a monotonic, high-resolution
# clock meant for elapsed-time measurement, whereas time.time() is wall-clock time
# and can jump if the system clock is adjusted during the fit.
fit_start = time.perf_counter()
model.fit(X, Y)
fit_end = time.perf_counter()
print(f'model fitting took {fit_end-fit_start:.2f}s')

model: KNNWholeSequence(n_neighbors=10)
model fitting took 0.05s


---

## Save trained model

**Save the model to the `data/pickles/trained-test-models` directory:**

In [213]:
# The output file name combines the feature identifier and the model name.
saved_model_file_path = f'{TRAINED_MODELS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.p'

# 'wb' truncates any existing file at this path before the fitted model is written.
with open(saved_model_file_path, 'wb') as model_out:
    print('saving the model to:', saved_model_file_path)
    pickle.dump(model, model_out)

saving the model to: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLknnwholeseqn10.p
