# 3b. Predict on the TEST set

## Setup

In [192]:
import sys
sys.path.append('../..')
import math
import os
import pickle
import time

#import abnumber
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

import bin.feature_generators as fg
import bin.params as p
import bin.utils as u

In [193]:
# Apply seaborn's default theme to every figure drawn after this point.
sns.set_theme()

**Papermill parameters:**

In [194]:
# Parameters
# MODEL_NAME and FEATURES together select the pickled model file that is loaded
# later in this notebook and name every CSV file the notebook writes.
MODEL_NAME = "BLavgpos"
# The last '_'-separated token of FEATURES is used as the chain identifier
# when the dataset is loaded.
FEATURES = "lco_whole_sequence_all_H"


In [195]:
# Options forwarded to fg.generate() as its `params` argument.
# NOTE(review): the exact meaning of each flag is defined in
# bin.feature_generators and is not visible here — confirm there.
PARAMS = {
    'compress': False,
    'preserve_seq_ids': True,
}

**Create directories to store results in:**

In [196]:
# Name of the dataset split this notebook predicts on.
DATASET = 'test'
# Output locations, all rooted at p.DATA_DIR:
#   raw_y            - the ground-truth frame as loaded
#   raw_predictions  - the wide predictions frame
#   predictions      - the long sequence/position/prediction frame
TEST_Y_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test_{DATASET}/raw_y'
TEST_PREDICTIONS_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test_{DATASET}/raw_predictions'
TEST_PREDICTIONS_DIR_PATH = f'{p.DATA_DIR}/csv/test_{DATASET}/predictions'

# Create the directories with the standard library instead of shelling out to
# `mkdir -p`: this works on every OS and is safe for paths containing spaces
# or shell metacharacters. exist_ok=True keeps the cell re-runnable.
for dir_path in (TEST_Y_RAW_DIR_PATH,
                 TEST_PREDICTIONS_RAW_DIR_PATH,
                 TEST_PREDICTIONS_DIR_PATH):
    os.makedirs(dir_path, exist_ok=True)

(TEST_Y_RAW_DIR_PATH, TEST_PREDICTIONS_RAW_DIR_PATH, TEST_PREDICTIONS_DIR_PATH)

('../../data/csv/test_test/raw_y',
 '../../data/csv/test_test/raw_predictions',
 '../../data/csv/test_test/predictions')

**Load the test dataset:**

In [197]:
# The chain identifier is whatever follows the last underscore in FEATURES.
chains = FEATURES.rsplit('_', 1)[-1]
X_orig, Y_orig = u.load_dataset(DATASET, chains=chains)
# Work on copies so the frames as loaded stay available to later cells.
X = X_orig.copy()
Y = Y_orig.copy()

load_dataset: test, metadata file path: ../../data/csv/metadata/metadata_H.csv, chains: H, shape: (3286, 19)
load_dataset: test, X file path: ../../data/csv/fasta_aligned_cleaned/fasta_aho_H.csv, chains: H, shape: (3205, 165)
load_dataset: test, Y file path: ../../data/csv/sasa_aligned/sasa_H.csv, chains: H, shape: (3205, 165)


**Transform the data according to the chosen feature representation:**

In [198]:
# Always start from fresh copies of the loaded data so that this cell yields
# the same result however many times it is re-run; feeding the already
# transformed X/Y back into fg.generate() would not.
X, Y, _ = fg.generate(X_orig.copy(), Y_orig.copy(), c=None, model_name=MODEL_NAME,
                      features=FEATURES, params=PARAMS)
# The last column of the generated X holds the sequence ids (presumably kept
# because of PARAMS['preserve_seq_ids'] — confirm in bin.feature_generators).
# Split it off so that X contains only the position columns.
ids = X[X.columns[-1]].reset_index(drop=True)
X = X.drop(columns=X.columns[-1])
X.shape, Y.shape, ids.shape

lco_whole_sequence_all_H
X.shape (562, 165) Y.shape (562, 165)
after non-data column drop: X.shape (562, 164) Y.shape (562, 164)
[NOTE] Skipping one-hot encoding, since this is baseline model
[FINAL] X.shape (562, 165) Y.shape (562, 164)


((562, 164), (562, 164), (562,))

In [199]:
# Number of distinct sequence ids in the transformed test set.
N_SEQUENCES = ids.drop_duplicates().shape[0]
# Number of positions: every column of the loaded X except the ID column.
N_POSITIONS = X_orig.shape[1] - 1
N_SEQUENCES, N_POSITIONS

(562, 164)

In [200]:
# Peek at the first row of the transformed features.
X.head(n=1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,-,V,K,L,Q,Q,S,-,G,P,...,Q,G,T,-,T,V,T,V,C,S


In [201]:
# Show Y in full when it is a NumPy array, otherwise only its first row.
Y if type(Y) is np.ndarray else Y.head(n=1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,-1.0,46.0,61.0,3.6,36.8,2.8,51.0,-1.0,51.2,55.2,...,54.0,14.6,10.4,-1.0,49.5,1.3,19.9,6.3,4.4,64.2


In [202]:
# Preview the first three sequence ids.
ids.head(n=3)

0    6LCS:H
1    6LDV:H
2    6LDW:H
Name: sequence_id, dtype: object

**Load the trained model:**

In [203]:
# The trained model is stored as a pickle named after the feature set and model.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that were produced by this project.
trained_model_path = f'{p.DATA_DIR}/pickles/trained-test-models/{FEATURES}_{MODEL_NAME}.p'
with open(trained_model_path, 'rb') as trained_model_file:
    model = pickle.load(trained_model_file)

---

## Predict

In [204]:
# Time the prediction with perf_counter(): it is a monotonic, high-resolution
# clock intended for measuring elapsed time, whereas time.time() can jump when
# the system clock is adjusted.
predict_start = time.perf_counter()
# Predictions are rounded to two decimal places.
predictions = model.predict(X).round(2)
predict_end = time.perf_counter()
print(f'Prediction took {predict_end-predict_start:.2f}s, the result datatype is {type(predictions)} and shape {predictions.shape}')

Prediction took 0.50s, the result datatype is <class 'pandas.core.frame.DataFrame'> and shape (562, 164)


**If the predictions are stored in an `np.ndarray`, convert them to a `whole_sequence`-like dataframe:**

In [205]:
# Display the predictions as returned by the model (already rounded).
predictions

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
1,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
2,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
3,75.84,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
4,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,75.84,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,
558,,,,,,,,,,,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
559,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
560,75.84,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77


In [206]:
# Models that return a flat np.ndarray are converted to the same wide
# (sequence x position) dataframe layout as Y_orig; dataframe outputs are
# left untouched.
if isinstance(predictions, np.ndarray):
    print('predictions are np.ndarray')
    # Start from the ground-truth frame, indexed by sequence id, so the result
    # has the same rows and columns; its values are overwritten below.
    Y_pred = Y_orig.set_index('Id')
    # The loop variable must not be called `p`: that name is the module alias
    # for bin.params, and rebinding it would break any later use of p.DATA_DIR.
    for i, prediction in tqdm(enumerate(predictions), total=len(predictions),
                              desc='Processing individual predictions...'):
        # The flat index runs over all sequences for the first position, then
        # all sequences for the second position, and so on.
        x_index, seq_index = divmod(i, N_SEQUENCES)
        seq_id = ids.iloc[seq_index]
        pos_id = X_orig.columns[x_index + 1]  # starting from 1 as 0 is 'id'
        Y_pred.loc[seq_id, pos_id] = prediction
    Y_pred = Y_pred.replace(-1, np.nan)
    predictions = Y_pred
else:
    print('prediction were not np.ndarray')

prediction were not np.ndarray


**Replace `-1` values by `np.nan`:**

In [207]:
# Turn the -1 placeholder into NaN (presumably -1 marks alignment gaps, as in
# the Y preview above — confirm). Re-running this cell is harmless.
predictions = predictions.replace(-1, np.nan)

In [208]:
# Check the first two rows after the -1 -> NaN replacement.
predictions.head(n=2)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
0,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77
1,,32.11,53.66,6.07,51.41,7.87,42.1,,53.53,47.72,...,62.77,19.15,8.61,,34.08,1.65,18.04,5.16,21.96,69.77


**Save raw `y` and `predictions` objects so we can use the raw data in other notebooks:**

In [209]:
# Save the ground truth exactly as loaded (Y_orig, i.e. before fg.generate).
# The file is named after the feature set and the model.
raw_y_path = f'{TEST_Y_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
Y_orig.to_csv(raw_y_path)

In [210]:
# Save the wide predictions frame (one row per sequence, one column per
# position) under the same feature-set/model file name.
raw_preds_path = f'{TEST_PREDICTIONS_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
predictions.to_csv(raw_preds_path)

**Generate `sequence/position/sasa_prediction` dataframe (very nice structure for processing the results later on):**

In [211]:
# Sequence ids that will label the rows of the long-format result.
ids

0      6LCS:H
1      6LDV:H
2      6LDW:H
3      6LDX:H
4      6LDY:H
        ...  
557    7WRV:H
558    7X08:H
559    7X9E:H
560    7Z0X:H
561    7Z0Y:H
Name: sequence_id, Length: 562, dtype: object

In [212]:
# Reshape the wide predictions into long format: one row per
# (sequence_id, position) pair with its predicted value.
res_df = u.positionize_sasa_df(predictions, ids)
res_df

Unnamed: 0,sequence_id,position,prediction
0,6LCS:H,1,
1,6LDV:H,1,
2,6LDW:H,1,
3,6LDX:H,1,75.84
4,6LDY:H,1,
...,...,...,...
92163,7WRV:H,149,
92164,7X08:H,149,69.77
92165,7X9E:H,149,69.77
92166,7Z0X:H,149,69.77


**Store it:**

In [213]:
# Write the long-format predictions to CSV, named after the feature set and
# the model; the path is printed so it shows up in the notebook output.
preds_path = f'{TEST_PREDICTIONS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
print('preds_path:', preds_path)
res_df.to_csv(preds_path)

preds_path: ../../data/csv/test_test/predictions/lco_whole_sequence_all_H_BLavgpos.csv
