## Baseline-решение для предсказания свойств ФБ с помощью KNN

### Подготовка к работе

In [None]:
! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git

In [9]:
from fpgen.prop_prediction.dataset import FPbase
from fpgen.generation.metrics import identity

from fpgen.prop_prediction.metrics import get_regression_metrics

from sklearn.model_selection import train_test_split, KFold

import pandas as pd
import numpy as np
from tqdm import tqdm

### Загрузка датасета

In [None]:
# Load the property dataset through the project's FPbase wrapper.
dataset = FPbase('dataset.csv')
# Precomputed pairwise sequence table; the first CSV column becomes the row index,
# so rows and columns can be addressed by label (see ident()).
df_ident = pd.read_csv('sequence_distance_matrix.csv', index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

### Реализация алгоритма KNN

Поиск расстояния между последовательностями аминокислот.

In [3]:
def ident(seq_1, seq_2):
    """Look up the precomputed pairwise value for two sequences in ``df_ident``.

    Both sequences are stripped of surrounding whitespace before being used
    as the row label (``seq_1``) and column label (``seq_2``).

    NOTE(review): the table is read from 'sequence_distance_matrix.csv', yet
    knn_regressor treats larger values as "closer" -- presumably these are
    identity scores rather than distances; confirm.
    """
    row_label = seq_1.strip()
    col_label = seq_2.strip()
    return df_ident.loc[row_label, col_label]

Реализация KNN в задаче регрессии.

In [None]:
def knn_regressor(x_train, y_train, x_test, k, similarity=None):
    """Predict each test target as the mean of its k most similar training targets.

    Args:
        x_train: Iterable of training sequences.
        y_train: Iterable of training targets, aligned with ``x_train``.
        x_test: Iterable of sequences to predict for.
        k: Number of neighbours to average; must be at least 1. When it
            exceeds the number of training samples, all of them are averaged.
        similarity: Optional callable ``(test_seq, train_seq) -> score`` where
            a larger score means a closer neighbour. Defaults to the
            notebook-level ``ident`` lookup.

    Returns:
        A list with one prediction per element of ``x_test``, in order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # A non-positive k would otherwise slice an empty (k == 0) or truncated
    # (k < 0) neighbour list and silently produce NaN / wrong predictions.
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}')
    if similarity is None:
        similarity = ident

    # The training pairs do not depend on the test sequence: build them once.
    train_pairs = list(zip(x_train, y_train))

    predictions = []
    for test_seq in x_test:
        scored = [
            (train_target, similarity(test_seq, train_seq))
            for train_seq, train_target in train_pairs
        ]
        # Stable sort, most similar first; ties keep the training order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        predictions.append(np.mean([target for target, _ in scored[:k]]))

    return predictions

Подбор гиперпараметров методом кросс-валидации.

In [None]:
def cross_validate(x_train, y_train, kf_split, k_max):
    """Choose the neighbour count k with the lowest mean cross-validated RMSE.

    Args:
        x_train: Training sequences; indexed positionally via ``.iloc``.
        y_train: Training targets aligned with ``x_train``; indexed via ``.iloc``.
        kf_split: Number of folds passed to ``KFold(n_splits=...)``.
        k_max: Largest k to try; candidates are 1..k_max inclusive.

    Returns:
        The k with the smallest mean fold RMSE. Ties keep the smaller k, and
        1 is returned when no candidate produced a usable score.
    """
    splitter = KFold(n_splits=kf_split)
    best_k, best_score = 1, np.inf

    for k in tqdm(range(1, k_max + 1)):
        rmse_per_fold = []

        for fit_idx, val_idx in splitter.split(x_train):
            fold_pred = knn_regressor(
                x_train.iloc[fit_idx],
                y_train.iloc[fit_idx],
                x_train.iloc[val_idx],
                k,
            )
            y_val = y_train.iloc[val_idx]

            # Score only the positions where a (non-NaN) prediction exists.
            usable = ~pd.isna(fold_pred)
            if not usable.any():
                continue

            fold_metrics = get_regression_metrics(
                np.array(fold_pred)[usable],
                y_val[usable],
            )
            rmse_per_fold.append(fold_metrics['rmse'])

        # A candidate k with no scorable fold cannot be compared; skip it.
        if rmse_per_fold:
            candidate_score = np.mean(rmse_per_fold)
            if candidate_score < best_score:
                best_score, best_k = candidate_score, k

    return best_k

## Метрики

In [6]:
def metrics_reg(metrics):
    """Print the 'rmse', 'mae', 'r2' and 'mae_median' entries of a metrics mapping.

    Args:
        metrics: Mapping that contains the keys 'rmse', 'mae', 'r2' and
            'mae_median'.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # The outer quotes differ from the inner ones on purpose: reusing the same
    # quote character inside an f-string is a SyntaxError before Python 3.12.
    print(f"\t RMSE: {metrics['rmse']}")
    print(f"\t MAE: {metrics['mae']}")
    print(f"\t R2: {metrics['r2']}")
    print(f"\t MAE (med.): {metrics['mae_median']}")

In [None]:
# Evaluate the KNN baseline on every dataset property except 'agg'.
for item in dataset.properties:
    if item == 'agg':
        continue

    print(item)
    x_train, y_train = dataset.get_train(item)
    x_test, y_test = dataset.get_test(item)

    # k is selected with 4-fold cross-validation over k = 1..30, using the
    # training split only; the test split is touched just for the final score.
    k = cross_validate(x_train, y_train, 4, 30)
    print(f'k: {k}')
    y_pred = knn_regressor(x_train, y_train, x_test, k)

    # NOTE(review): rescale_targets presumably maps targets back to their
    # original units -- confirm against the FPbase implementation.
    y_test_rescaled = dataset.rescale_targets(y_test, item)
    y_pred_rescaled = dataset.rescale_targets(y_pred, item)

    print('Scaled:')
    metrics_reg(get_regression_metrics(y_pred, y_test))
    print('Rescaled:')
    metrics_reg(get_regression_metrics(y_pred_rescaled, y_test_rescaled))
    print('')

brightness


100%|██████████| 30/30 [00:35<00:00,  1.17s/it]


k: 5
Scaled:
	 RMSE: 0.7412173589207028
	 MAE: 0.507162649397322
	 R2: 0.48928575551555786
	 MAE (med.): 0.3323219950617097
Rescaled:
	 RMSE: 22.855105559444116
	 MAE: 15.638133333333334
	 R2: 0.48928575551555775
	 MAE (med.): 10.247

em_max


100%|██████████| 30/30 [01:09<00:00,  2.32s/it]


k: 1
Scaled:
	 RMSE: 0.5277660222068801
	 MAE: 0.24677351033531916
	 R2: 0.6888330674145302
	 MAE (med.): 0.03112626603172086
Rescaled:
	 RMSE: 33.91129675940145
	 MAE: 15.8562874251497
	 R2: 0.6888330674145303
	 MAE (med.): 2.0

ex_max


100%|██████████| 30/30 [01:18<00:00,  2.61s/it]


k: 3
Scaled:
	 RMSE: 0.560141054302693
	 MAE: 0.3526708467947944
	 R2: 0.6677765093830835
	 MAE (med.): 0.1795792685665914
Rescaled:
	 RMSE: 37.43022624652124
	 MAE: 23.566473988439306
	 R2: 0.6677765093830835
	 MAE (med.): 12.0

ext_coeff


100%|██████████| 30/30 [00:37<00:00,  1.26s/it]


k: 2
Scaled:
	 RMSE: 0.8337854146769339
	 MAE: 0.5829458364789102
	 R2: 0.4346668268309095
	 MAE (med.): 0.35492514186645163
Rescaled:
	 RMSE: 31479.10147451571
	 MAE: 22008.79365079365
	 R2: 0.4346668268309094
	 MAE (med.): 13400.0

lifetime


100%|██████████| 30/30 [00:04<00:00,  7.48it/s]


k: 4
Scaled:
	 RMSE: 1.5785460811112475
	 MAE: 0.6631146885927949
	 R2: 0.3601461220166019
	 MAE (med.): 0.245165641909308
Rescaled:
	 RMSE: 1.8350272477541036
	 MAE: 0.7708571428571429
	 R2: 0.3601461220166018
	 MAE (med.): 0.28500000000000014

maturation


100%|██████████| 30/30 [00:03<00:00,  9.84it/s]


k: 20
Scaled:
	 RMSE: 0.47439700100830845
	 MAE: 0.3670827142930148
	 R2: 0.07842265056394604
	 MAE (med.): 0.3182861769729677
Rescaled:
	 RMSE: 100.49515218407302
	 MAE: 77.76194444444444
	 R2: 0.07842265056394582
	 MAE (med.): 67.425

pka


100%|██████████| 30/30 [00:15<00:00,  1.89it/s]


k: 12
Scaled:
	 RMSE: 1.230376790021117
	 MAE: 0.8062922685857539
	 R2: 0.10611251314067638
	 MAE (med.): 0.48329206908236994
Rescaled:
	 RMSE: 1.3842920717574583
	 MAE: 0.9071562499999999
	 R2: 0.10611251314067627
	 MAE (med.): 0.5437499999999997

stokes_shift


100%|██████████| 30/30 [01:10<00:00,  2.35s/it]


k: 3
Scaled:
	 RMSE: 0.6036733152929479
	 MAE: 0.35922762135108044
	 R2: 0.36456125512938475
	 MAE (med.): 0.1326061290133299
Rescaled:
	 RMSE: 22.7618934277263
	 MAE: 13.544910179640718
	 R2: 0.36456125512938464
	 MAE (med.): 5.0

qy


100%|██████████| 30/30 [00:48<00:00,  1.63s/it]


k: 3
Scaled:
	 RMSE: 0.6473622170033797
	 MAE: 0.4850672240569276
	 R2: 0.5646155386352791
	 MAE (med.): 0.38403385714680943
Rescaled:
	 RMSE: 0.17418801627025163
	 MAE: 0.13051873479318737
	 R2: 0.5646155386352794
	 MAE (med.): 0.10333333333333339

