# 3b. Predict on the TEST set

## Setup

In [50]:
import sys
sys.path.append('../..')

import math
import os
import pickle
import time

#import abnumber
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

import bin.feature_generators as fg
import bin.params as p
import bin.utils as u

In [51]:
# Apply seaborn's default theme for any figures drawn in this notebook.
sns.set_theme()

**Papermill parameters:**

In [52]:
# Name of the trained model; passed to fg.generate and used in the model/result file names.
MODEL_NAME = 'randomForestN5'
# Feature-representation identifier; its last `_`-separated token selects the chain(s) to load.
FEATURES = 'lco_cont_window_r3_all_H'

In [53]:
# Options handed to fg.generate via its `params` argument.
PARAMS = dict(
    compress=False,
    preserve_seq_ids=True,
)

**Create directories to store results in:**

In [54]:
# Directory that receives the raw ground-truth `y` CSV of the test set.
TEST_Y_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test/raw_y'
# Create the directory (and any missing parents) in pure Python rather than
# shelling out to `mkdir -p`: this is portable, copes with spaces in the path,
# and raises instead of failing silently when the directory cannot be created.
os.makedirs(TEST_Y_RAW_DIR_PATH, exist_ok=True)

In [55]:
# Directory that receives the raw (wide-format) predictions CSV of the test set.
TEST_PREDICTIONS_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test/raw_predictions'
# Create the directory (and any missing parents) in pure Python rather than
# shelling out to `mkdir -p`: this is portable, copes with spaces in the path,
# and raises instead of failing silently when the directory cannot be created.
os.makedirs(TEST_PREDICTIONS_RAW_DIR_PATH, exist_ok=True)

In [56]:
# Directory that receives the final per-position predictions CSV of the test set.
TEST_PREDICTIONS_DIR_PATH = f'{p.DATA_DIR}/csv/test/predictions'
# Create the directory (and any missing parents) in pure Python rather than
# shelling out to `mkdir -p`: this is portable, copes with spaces in the path,
# and raises instead of failing silently when the directory cannot be created.
os.makedirs(TEST_PREDICTIONS_DIR_PATH, exist_ok=True)

**Load the test dataset:**

In [57]:
# The chain selector is the last `_`-separated token of the feature-set name.
chains = FEATURES.rsplit('_', 1)[-1]
X_orig, Y_orig = u.load_dataset('test', chains=chains)
# Work on copies so the originally loaded frames stay available to later cells.
X = X_orig.copy()
Y = Y_orig.copy()

**Transform the data according to the chosen feature representation:**

In [58]:
X, Y, _ = fg.generate(
    X, Y,
    c=None,
    model_name=MODEL_NAME,
    features=FEATURES,
    params=PARAMS,
)
# The last column of the generated X is split off as `ids` and removed from
# the feature matrix. Presumably it carries the sequence ids because
# `preserve_seq_ids` is set; verify against bin.feature_generators.
id_column = X.columns[-1]
ids = X[id_column].reset_index(drop=True)
X.drop(columns=id_column, inplace=True)
(X.shape, Y.shape, ids.shape)

lco_cont_window_r3_all_H
X.shape (643, 165) Y.shape (643, 165)
after drop_nondata_columns: X.shape (643, 164) Y.shape (643, 164)
after _add_sequence_end: X.shape (643, 170) Y.shape (643, 170)
after window transforms: X_window.shape (105452, 9) Y_window.shape (105452, 1)
[FINAL] X.shape (105452, 156) Y.shape (105452, 1)


((105452, 155), (105452,), (105452,))

In [63]:
# Number of distinct sequence ids among the generated rows.
N_SEQUENCES = ids.nunique(dropna=False)
# Number of position columns in the original frame; the ID column is not a position.
N_POSITIONS = X_orig.shape[1] - 1
(N_SEQUENCES, N_POSITIONS)

(643, 164)

In [64]:
# Peek at the first feature row.
X.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145,146,147,148,149,150,151,152,153,154
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [65]:
# Show the first target row for pandas objects; a plain ndarray is shown as-is.
Y.head(n=1) if type(Y) is not np.ndarray else Y

array([-1. , -1. , -1. , ..., 58.8, 73.3, 71.5])

In [66]:
# Peek at the first three sequence ids.
ids.head(3)

0    6LCS:H
1    6LDV:H
2    6LDW:H
Name: 156, dtype: object

**Load the trained model:**

In [67]:
# SECURITY: pickle.load executes arbitrary code embedded in the file, so only
# load model pickles produced by this project's own training notebooks.
trained_model_path = f'{p.DATA_DIR}/pickles/trained-test-models/{FEATURES}_{MODEL_NAME}.p'
with open(trained_model_path, 'rb') as trained_model_file:
    model = pickle.load(trained_model_file)

---

## Predict

In [68]:
# Time only the model call together with the rounding to two decimals.
predict_start = time.time()
raw_predictions = model.predict(X)
predictions = raw_predictions.round(2)
predict_end = time.time()
print(f'Prediction took {predict_end-predict_start:.2f}s, the result datatype is {type(predictions)} and shape {predictions.shape}')

Prediction took 0.16s, the result datatype is <class 'numpy.ndarray'> and shape (105452,)


**If the predictions are a `np.ndarray`, convert them to a `whole_sequence`-like dataframe:**

In [69]:
# Inspect the prediction object as returned by the model, before any reshaping.
predictions

array([-1.        , -1.        , -1.        , ..., 72.12288016,
       72.12288016, 72.12288016])

In [70]:
if type(predictions) is np.ndarray:
    # Convert the flat prediction vector into a dataframe shaped like Y_orig:
    # one row per sequence id, one column per position.
    Y_pred = Y_orig.copy()
    Y_pred.index = Y_orig['Id']
    Y_pred.drop(columns='Id', inplace=True)
    # The flat vector is indexed position-major: element i belongs to sequence
    # (i % N_SEQUENCES) at position (i // N_SEQUENCES).
    # NOTE: the loop variable must not be named `p`. At notebook scope that would
    # rebind the `bin.params` module alias and break every later use of
    # `p.DATA_DIR`, for example when re-running an earlier cell.
    for i, prediction in tqdm(enumerate(predictions), total=len(predictions),
                              desc='Processing individual predictions...'):
        seq_id = ids.iloc[i % N_SEQUENCES]
        x_index = i // N_SEQUENCES
        pos_id = X_orig.columns[x_index+1] # starting from 1 as 0 is 'id'
        Y_pred.loc[seq_id, pos_id] = prediction
    # -1 is the placeholder for "no value"; expose it as NaN.
    Y_pred = Y_pred.replace(-1, np.nan)
    predictions = Y_pred

Processing individual predictions...:   0%|          | 0/105452 [00:00<?, ?it/s]

**Replace `-1` values with `np.nan`:**

In [71]:
# Map the -1 placeholder to NaN. This is a no-op if the conversion cell above
# already did it.
predictions = predictions.replace(to_replace=-1, value=np.nan)

In [72]:
# Peek at the first two rows of the wide predictions frame.
predictions.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6LCS:H,,68.277048,53.040889,9.399539,50.834599,7.490667,33.560932,,52.82,53.102129,...,61.015999,17.554905,7.922589,,38.513904,1.08,23.678614,2.84,5.97,55.148
6LDV:H,,89.613333,49.025,6.66,62.126667,6.738833,47.661058,,43.780397,31.406057,...,59.251941,24.989066,12.517829,,27.06226,1.131882,14.886999,4.798802,20.876986,72.12288


**Save raw `y` and `predictions` objects so we can use the raw data in other notebooks:**

In [73]:
# Persist the unmodified ground-truth frame under a name derived from the
# feature set and the model.
raw_y_path = f'{TEST_Y_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
Y_orig.to_csv(path_or_buf=raw_y_path)

In [74]:
# Persist the wide-format predictions under a name derived from the feature
# set and the model.
raw_preds_path = f'{TEST_PREDICTIONS_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
predictions.to_csv(path_or_buf=raw_preds_path)

**Generate `sequence/position/sasa_prediction` dataframe (very nice structure for processing the results later on):**

In [75]:
# Reshape the wide predictions frame with the project helper. Judging by the
# displayed output the result has sequence_id / position / prediction columns;
# the helper lives in bin.utils and is not visible here, so confirm there.
res_df = u.positionize_sasa_df(predictions, ids)
res_df

Unnamed: 0,sequence_id,position,prediction
0,6LCS:H,1,
1,6LDV:H,1,
2,6LDW:H,1,
3,6LDX:H,1,97.143667
4,6LDY:H,1,
...,...,...,...
105447,7WRV:H,149,
105448,7X08:H,149,72.122880
105449,7X9E:H,149,72.122880
105450,7Z0X:H,149,72.122880


**Store it:**

In [76]:
# Persist the positionized predictions under a name derived from the feature
# set and the model.
preds_path = f'{TEST_PREDICTIONS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
res_df.to_csv(path_or_buf=preds_path)