# 3b. Predict on the TEST set

## Setup

In [1]:
import sys
sys.path.append('../..')

import math
import os
import pickle
import time

#import abnumber
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

import bin.feature_generators as fg
import bin.params as p
import bin.utils as u

In [2]:
# Activate seaborn's default theme for any matplotlib figures drawn later.
sns.set_theme()

**Papermill parameters:**

In [3]:
# Notebook parameters (this cell is labelled as the papermill parameter cell).
# MODEL_NAME: used to locate the pickled model and to name the output CSVs.
MODEL_NAME = 'randomForestN5'
# FEATURES: feature-representation name; its last '_'-separated token is
# later used as the `chains` argument when loading the dataset.
FEATURES = 'lco_cont_window_r4_all_H'

In [4]:
# Options forwarded to `fg.generate` through its `params` argument.
# NOTE(review): the meaning of each key is defined in bin/feature_generators,
# not in this notebook. Presumably `preserve_seq_ids` keeps the sequence-ID
# column that is split off from X after generation; confirm before changing.
PARAMS = {
    'compress': False,
    'preserve_seq_ids': True,
}

**Create directories to store results in:**

In [5]:
# Directory that receives the raw ground-truth `y` CSV of the test set.
TEST_Y_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test/raw_y'
# Pure-Python replacement for `! mkdir -p ...`: creates missing parents, is a
# no-op when the directory already exists, and does not pass the path through
# a shell (so spaces/metacharacters in DATA_DIR are safe and it works off POSIX).
os.makedirs(TEST_Y_RAW_DIR_PATH, exist_ok=True)

In [6]:
# Directory that receives the raw (wide, per-sequence) prediction CSV.
TEST_PREDICTIONS_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test/raw_predictions'
# Pure-Python replacement for `! mkdir -p ...`: creates missing parents, is a
# no-op when the directory already exists, and avoids shell interpolation.
os.makedirs(TEST_PREDICTIONS_RAW_DIR_PATH, exist_ok=True)

In [7]:
# Directory that receives the final long-format prediction CSV.
TEST_PREDICTIONS_DIR_PATH = f'{p.DATA_DIR}/csv/test/predictions'
# Pure-Python replacement for `! mkdir -p ...`: creates missing parents, is a
# no-op when the directory already exists, and avoids shell interpolation.
os.makedirs(TEST_PREDICTIONS_DIR_PATH, exist_ok=True)

**Load the test dataset:**

In [8]:
# The chain selector is the final underscore-separated token of FEATURES.
chains = FEATURES.rsplit('_', 1)[-1]
X_orig, Y_orig = u.load_dataset('test', chains=chains)

# Keep the loaded frames untouched; later cells work on these copies.
X = X_orig.copy()
Y = Y_orig.copy()

**Transform the data according to the chosen feature representation:**

In [9]:
# Build the model-specific feature matrix; the third return value is unused.
X, Y, _ = fg.generate(
    X, Y,
    c=None,
    model_name=MODEL_NAME,
    features=FEATURES,
    params=PARAMS,
)

# The last column of the generated X holds the sequence IDs: keep it as a
# separate, freshly indexed Series and remove it from the feature matrix.
id_column = X.columns[-1]
ids = X[id_column].reset_index(drop=True)
X.drop(columns=id_column, inplace=True)

X.shape, Y.shape, ids.shape

lco_cont_window_r4_all_H
X.shape (643, 165) Y.shape (643, 165)
after drop_nondata_columns: X.shape (643, 164) Y.shape (643, 164)
after _add_sequence_end: X.shape (643, 172) Y.shape (643, 172)
after window transforms: X_window.shape (105452, 11) Y_window.shape (105452, 1)
[FINAL] X.shape (105452, 200) Y.shape (105452, 1)


((105452, 199), (105452,), (105452,))

In [10]:
# Number of distinct sequences among the per-row IDs.
unique_ids = ids.unique()
N_SEQUENCES = len(unique_ids)

# Number of position columns in the original X (every column except the ID one).
N_POSITIONS = X_orig.shape[1] - 1

N_SEQUENCES, N_POSITIONS

(643, 164)

In [11]:
# Peek at the first row of the generated feature matrix.
X.head(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.0


In [12]:
# Y may be a plain ndarray (no `.head`) or a pandas object; show it accordingly.
Y if type(Y) is np.ndarray else Y.head(n=1)

array([-1. , -1. , -1. , ..., 58.8, 73.3, 71.5])

In [13]:
# Peek at the first few sequence IDs split off from X.
ids.head(n=3)

0    6LCS:H
1    6LDV:H
2    6LDW:H
Name: 200, dtype: object

**Load the trained model:**

In [14]:
# The trained model is stored as a pickle named after the feature set and model.
model_path = f'{p.DATA_DIR}/pickles/trained-test-models/{FEATURES}_{MODEL_NAME}.p'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

---

## Predict

In [15]:
# Time the prediction with perf_counter: unlike time.time() it is monotonic
# and high-resolution, which matters for a sub-second measurement.
predict_start = time.perf_counter()
# Predictions are rounded to two decimals before any further processing.
predictions = model.predict(X).round(2)
elapsed = time.perf_counter() - predict_start
print(f'Prediction took {elapsed:.2f}s, the result datatype is {type(predictions)} and shape {predictions.shape}')

Prediction took 0.08s, the result datatype is <class 'numpy.ndarray'> and shape (105452,)


**If the predictions are a NumPy array (`np.ndarray`), convert them to a `whole_sequence`-like dataframe:**

In [16]:
# Show the raw model output before it is reshaped.
predictions

array([-1.  , -1.  , -1.  , ..., 71.96, 71.96, 71.96])

In [17]:
if isinstance(predictions, np.ndarray):
    # Convert the flat prediction vector into a (sequence x position) frame.
    #
    # Layout of the flat vector: element i belongs to sequence
    # `i % N_SEQUENCES` and to position `i // N_SEQUENCES`, i.e. it is
    # position-major. Reshaping to (n_positions, N_SEQUENCES) and transposing
    # therefore yields one row per sequence and one column per position.
    #
    # This replaces a per-element Python loop whose loop variable was named
    # `p`, which silently overwrote the `bin.params` module alias `p` and broke
    # any later (re-)run of cells that read `p.DATA_DIR`.
    if len(predictions) % N_SEQUENCES != 0:
        raise ValueError(
            f'Cannot reshape {len(predictions)} predictions into '
            f'{N_SEQUENCES} sequences of equal length'
        )
    n_positions = len(predictions) // N_SEQUENCES

    # Row labels: the IDs of the first N_SEQUENCES rows, in prediction order.
    seq_ids = ids.iloc[:N_SEQUENCES].to_numpy()
    # Column labels: position columns of X_orig (column 0 is the ID column).
    pos_ids = X_orig.columns[1:n_positions + 1]

    # Start from a copy of the ground truth indexed by sequence ID so the
    # result has exactly the same rows/columns, then overwrite with predictions.
    Y_pred = Y_orig.copy().set_index('Id')
    Y_pred.loc[seq_ids, pos_ids] = predictions.reshape(n_positions, N_SEQUENCES).T

    # -1 marks "no value at this position"; expose it as NaN.
    Y_pred = Y_pred.replace(-1, np.nan)
    predictions = Y_pred

Processing individual predictions...:   0%|          | 0/105452 [00:00<?, ?it/s]

**Replace `-1` values with `np.nan`:**

In [19]:
# Turn the -1 placeholder into NaN. The ndarray conversion cell already does
# this, so here it only matters when `predictions` did not go through it.
predictions = predictions.replace(-1, np.nan)

In [20]:
# Peek at the first two sequences of the wide prediction frame.
predictions.head(n=2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6LCS:H,,13.15,52.34,6.22,50.64,4.71,47.2,,50.9,48.37,...,61.3,18.52,8.13,,29.9,0.66,18.12,5.26,7.02,55.11
6LDV:H,,89.16,50.28,6.7,61.44,6.11,47.23,,44.6,31.49,...,57.77,24.58,9.21,,26.03,0.92,14.79,4.79,20.78,71.96


**Save raw `y` and `predictions` objects so we can use the raw data in other notebooks:**

In [21]:
# Persist the ground-truth frame exactly as loaded (Y_orig, not the transformed
# Y), under a file name derived from the feature set and model name.
raw_y_path = f'{TEST_Y_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
Y_orig.to_csv(raw_y_path)

In [22]:
# Persist the wide prediction frame (with -1 already replaced by NaN) under
# the same file name as the raw `y`, in the raw-predictions directory.
raw_preds_path = f'{TEST_PREDICTIONS_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
predictions.to_csv(raw_preds_path)

In [26]:
# Display the wide prediction frame (pandas truncates the repr).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; it looks like a leftover inspection cell.
predictions

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6LCS:H,,13.15,52.34,6.22,50.64,4.71,47.20,,50.90,48.37,...,61.30,18.52,8.13,,29.90,0.66,18.12,5.26,7.02,55.11
6LDV:H,,89.16,50.28,6.70,61.44,6.11,47.23,,44.60,31.49,...,57.77,24.58,9.21,,26.03,0.92,14.79,4.79,20.78,71.96
6LDW:H,,89.16,50.28,6.70,61.44,6.11,47.23,,44.60,31.49,...,57.31,24.21,9.21,,26.03,0.92,14.79,4.79,20.78,71.96
6LDX:H,96.45,87.73,52.74,5.31,63.85,6.08,48.50,,44.60,31.49,...,57.77,24.58,9.21,,26.03,0.92,14.79,4.79,20.78,71.96
6LDY:H,,13.68,29.61,11.68,63.02,5.01,36.34,,45.26,31.49,...,61.62,24.21,9.21,,26.03,0.92,14.79,4.79,20.78,71.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7WRV:H,94.01,30.03,53.58,6.17,50.85,6.11,31.33,,60.83,57.86,...,62.04,19.43,9.77,,34.47,0.72,35.94,4.05,75.82,
7X08:H,,,,,,,,,,14.22,...,57.97,23.15,13.50,,34.18,1.77,14.37,4.81,19.48,71.96
7X9E:H,,63.53,58.66,5.61,49.39,6.99,49.96,,53.55,39.05,...,57.35,23.72,5.00,,50.92,0.12,14.76,5.11,16.67,71.96
7Z0X:H,93.81,27.55,51.51,5.26,50.80,6.99,49.96,,53.16,36.68,...,62.04,19.43,9.77,,34.47,1.77,14.37,4.81,19.48,71.96


**Generate `sequence/position/sasa_prediction` dataframe (very nice structure for processing the results later on):**

In [24]:
# Reshape the wide prediction frame into long format using the project helper.
# NOTE(review): judging by the displayed result, the helper returns one row per
# (sequence_id, position) with a `prediction` column; how it uses `ids`
# (passed here at full per-row length) should be confirmed in bin/utils.
res_df = u.positionize_sasa_df(predictions, ids)
res_df

Unnamed: 0,sequence_id,position,prediction
0,6LCS:H,1,
1,6LDV:H,1,
2,6LDW:H,1,
3,6LDX:H,1,96.45
4,6LDY:H,1,
...,...,...,...
105447,7WRV:H,149,
105448,7X08:H,149,71.96
105449,7X9E:H,149,71.96
105450,7Z0X:H,149,71.96


In [25]:
# Display the per-row sequence IDs (one entry per generated sample row).
ids

0         6LCS:H
1         6LDV:H
2         6LDW:H
3         6LDX:H
4         6LDY:H
           ...  
105447    7WRV:H
105448    7X08:H
105449    7X9E:H
105450    7Z0X:H
105451    7Z0Y:H
Name: 200, Length: 105452, dtype: object

**Store it:**

In [76]:
# Persist the long-format result frame under a file name derived from the
# feature set and model name, in the final predictions directory.
preds_path = f'{TEST_PREDICTIONS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
res_df.to_csv(preds_path)