## Baseline-решение: предсказание свойств ФБ с помощью kNN на эмбеддингах ESM C

### Подготовка к работе

In [None]:
! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git

In [8]:
from fpgen.prop_prediction.dataset import FPbase
from fpgen.generation.metrics import identity

from fpgen.prop_prediction.metrics import get_regression_metrics, get_classification_metrics, bootstrap_metric_ci

from sklearn.model_selection import train_test_split, KFold

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

import ast

### Загрузка датасета

In [9]:
# Load the FPbase dataset wrapper from a CSV path relative to the current working directory.
# NOTE(review): the file presumably holds precomputed ESM C embeddings (per the notebook title) — confirm.
dataset = FPbase('data/fpbase_embedd.csv')

### Реализация алгоритма KNN

Реализация kNN для задач регрессии и классификации.

In [10]:
def knn(x_train, y_train, x_test, knn_type):
    """Fit a k-nearest-neighbours model with a tuned ``n_neighbors`` and predict ``x_test``.

    ``n_neighbors`` is selected from 1..29 by a 4-fold grid search on the
    training data; the winning configuration is refitted on the full training
    set (``GridSearchCV`` does this itself because ``refit=True`` is its
    default) and used for prediction.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets (numeric for ``'reg'``, labels for ``'class'``).
        x_test: Feature matrix to predict for.
        knn_type: ``'reg'`` for regression (selected by negative MSE) or
            ``'class'`` for classification (selected by accuracy).

    Returns:
        Predictions of the best model for ``x_test``.

    Raises:
        ValueError: If ``knn_type`` is neither ``'reg'`` nor ``'class'``.
    """
    # Validate the mode up front so a typo fails with a clear message instead of
    # an UnboundLocalError on the return statement.
    if knn_type == 'reg':
        estimator, scoring = KNeighborsRegressor(), 'neg_mean_squared_error'  # MSE
    elif knn_type == 'class':
        estimator, scoring = KNeighborsClassifier(), 'accuracy'
    else:
        raise ValueError(
            f"Unknown knn_type {knn_type!r}: expected 'reg' or 'class'"
        )

    params = {'n_neighbors': range(1, 30)}
    grid = GridSearchCV(estimator, params, cv=4, scoring=scoring)
    grid.fit(x_train, y_train)

    # grid.predict delegates to best_estimator_, which has already been
    # refitted on all of (x_train, y_train) with the best n_neighbors.
    return grid.predict(x_test)

## Метрики

In [13]:
def metrics_reg(metrics):
    """Print the regression metrics stored in ``metrics``, one tab-indented line each.

    Args:
        metrics: Mapping that provides the keys ``'rmse'``, ``'mae'``, ``'r2'``
            and ``'mae_median'``.
    """
    rows = (
        ('RMSE', 'rmse'),
        ('MAE', 'mae'),
        ('R2', 'r2'),
        ('MAE (med.)', 'mae_median'),
    )
    for caption, field in rows:
        print(f'\t {caption}: {metrics[field]}')

def metrics_class(metrics):
    """Print the classification metrics stored in ``metrics``, one tab-indented line each.

    Args:
        metrics: Mapping that provides the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'f1'``.
    """
    rows = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1', 'f1'),
    )
    for caption, field in rows:
        print(f'\t {caption}: {metrics[field]}')

In [12]:
def preproc(data):
    """Parse stringified numeric vectors into a 2-D float array.

    Each element of ``data`` is expected to be text such as ``"[0.1 0.2\\n 0.3]"``:
    whitespace-separated numbers wrapped in square brackets, possibly broken
    across several lines.

    Args:
        data: Iterable of strings, one serialized vector per element.

    Returns:
        ``np.ndarray`` with one row per input string.
    """
    rows = []
    for line in data:
        # Trim surrounding whitespace *before* removing the brackets: otherwise a
        # leading space or trailing newline shields the bracket from strip('[]')
        # and np.fromstring stops at it, silently yielding a truncated/empty row.
        clean_line = line.replace('\n', ' ').strip().strip('[]')
        rows.append(np.fromstring(clean_line, sep=' '))
    return np.array(rows)

In [14]:
# Evaluate the kNN baseline on every target of the dataset.
# 'agg' and 'switch_type' are handled as classification; all other targets as regression.
for item in dataset.targets:
    print(item)

    if item in ('agg', 'switch_type'):
        # Classification: labels are requested with is_scaled=False.
        x_train, y_train = dataset.get_train(item, is_scaled=False)
        x_test, y_test = dataset.get_test(item, is_scaled=False)

        x_train_processed = preproc(x_train)
        x_test_processed = preproc(x_test)

        y_pred = knn(x_train_processed, y_train, x_test_processed, 'class')
        metrics_class(get_classification_metrics(y_pred, y_test))

        print('')
        continue

    # Regression: targets come with the dataset's default scaling.
    x_train, y_train = dataset.get_train(item)
    x_test, y_test = dataset.get_test(item)

    x_train_processed = preproc(x_train)
    x_test_processed = preproc(x_test)

    y_pred = knn(x_train_processed, y_train, x_test_processed, 'reg')

    # Map both the ground truth and the predictions through rescale_targets
    # so metrics can also be reported on the rescaled values.
    y_test_rescaled = dataset.rescale_targets(y_test, item)
    y_pred_rescaled = dataset.rescale_targets(y_pred, item)

    print('Scaled:')
    metrics_reg(get_regression_metrics(y_pred, y_test))
    print('Rescaled:')
    metrics_reg(get_regression_metrics(y_pred_rescaled, y_test_rescaled))
    print('')

    metrics_ci = bootstrap_metric_ci(
        y_pred_rescaled, y_test_rescaled, get_regression_metrics,
        n_bootstrap=1000, alpha=0.05, random_state=42
    )

    print("\nMetrics with 95% confidence intervals:")
    # Entries [1] and [2] of each result are used as the interval bounds
    # (presumably lower/upper — confirm against bootstrap_metric_ci).
    # The value printed is the interval midpoint ± its half-width.
    for ci_label, ci_key in (
        ('RMSE', 'rmse'),
        ('MAE', 'mae'),
        ('R2', 'r2'),
        ('MAE Median', 'mae_median'),
    ):
        ci_lower, ci_upper = metrics_ci[ci_key][1], metrics_ci[ci_key][2]
        ci_mid = (ci_lower + ci_upper) / 2
        print(f"{ci_label}: {ci_mid:.2f} ± {ci_mid - ci_lower:.2f}")


brightness
Scaled:
	 RMSE: 0.8897936198409858
	 MAE: 0.5285885589696336
	 R2: 0.26402106819229165
	 MAE (med.): 0.30623113480727954
Rescaled:
	 RMSE: 27.43638807541911
	 MAE: 16.298791666666666
	 R2: 0.26402106819229165
	 MAE (med.): 9.442499999999999


Metrics with 95% confidence intervals:
RMSE: 27.16 ± 8.59
MAE: 16.49 ± 3.86
R2: 0.23 ± 0.34
MAE Median: 9.23 ± 3.48
em_max
Scaled:
	 RMSE: 0.5803172595847197
	 MAE: 0.3649787673252432
	 R2: 0.6237802764003194
	 MAE (med.): 0.20854598241252914
Rescaled:
	 RMSE: 37.28794574931144
	 MAE: 23.451497005988028
	 R2: 0.6237802764003193
	 MAE (med.): 13.399999999999977


Metrics with 95% confidence intervals:
RMSE: 36.89 ± 7.01
MAE: 23.65 ± 4.30
R2: 0.61 ± 0.15
MAE Median: 13.30 ± 4.70
ex_max
Scaled:
	 RMSE: 0.6634388019103676
	 MAE: 0.44404636093858496
	 R2: 0.5339449006457315
	 MAE (med.): 0.27435721586562567
Rescaled:
	 RMSE: 44.33287698781457
	 MAE: 29.672447013487478
	 R2: 0.5339449006457314
	 MAE (med.): 18.333333333333314


Metrics with 9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


	 Accuracy: 0.7762237762237763
	 Precision: 0.7624345351618079
	 Recall: 0.7762237762237763
	 F1: 0.7646065758365224

switch_type
	 Accuracy: 0.9378531073446328
	 Precision: 0.9359428930618761
	 Recall: 0.9378531073446328
	 F1: 0.9337233893429385

