## Baseline-решение для предсказания свойств ФБ с помощью KNN

### Подготовка к работе

In [None]:
! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git

In [None]:
from fpgen.prop_prediction.dataset import FPbase
from fpgen.generation.metrics import identity

from fpgen.prop_prediction.metrics import get_regression_metrics, get_classification_metrics

from sklearn.model_selection import train_test_split, KFold

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

import ast

  from .autonotebook import tqdm as notebook_tqdm


### Загрузка датасета

In [2]:
# Build the FPbase dataset wrapper from the local CSV file; the resulting
# `dataset` object is used by the evaluation loop further down.
dataset = FPbase('dataset.csv')
# Disabled: loading of a precomputed sequence distance matrix
# (`df_ident` is not referenced anywhere in the visible cells).
#df_ident = pd.read_csv('../data/sequence_distance_matrix.csv', index_col=0)

### Реализация алгоритма KNN

Реализация KNN для задач регрессии и классификации.

In [32]:
def knn(x_train, y_train, x_test, knn_type):
    """Tune a k-nearest-neighbours model and predict targets for ``x_test``.

    The number of neighbours is chosen by a 5-fold grid search over
    ``n_neighbors`` in 1..29 on the training data. The best configuration is
    refit on the full training set and used for prediction.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.
        x_test: Feature matrix to predict for.
        knn_type: ``'reg'`` for regression (grid search scored by negative
            mean squared error) or ``'class'`` for classification (grid
            search scored by accuracy).

    Returns:
        Predictions of the tuned model for ``x_test``.

    Raises:
        ValueError: If ``knn_type`` is neither ``'reg'`` nor ``'class'``.
    """
    # Validate the mode first so a typo fails with a clear message instead of
    # an UnboundLocalError at the return statement.
    if knn_type == 'reg':
        estimator = KNeighborsRegressor()
        scoring = 'neg_mean_squared_error'  # MSE
    elif knn_type == 'class':
        estimator = KNeighborsClassifier()
        scoring = 'accuracy'
    else:
        raise ValueError(
            f"knn_type must be 'reg' or 'class', got {knn_type!r}"
        )

    param_grid = {'n_neighbors': range(1, 30)}
    search = GridSearchCV(estimator, param_grid, cv=5, scoring=scoring)
    search.fit(x_train, y_train)

    # GridSearchCV refits the best estimator on the whole training set
    # (refit=True is the default), so predicting through the search object
    # is equivalent to fitting a fresh model with the best n_neighbors.
    return search.predict(x_test)

## Метрики

In [4]:
def metrics_reg(metrics):
    """Print regression metrics, one tab-indented line per metric.

    Args:
        metrics: Mapping that provides the keys ``'rmse'``, ``'mae'``,
            ``'r2'`` and ``'mae_median'``.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # Double-quoted f-strings: reusing the outer quote character inside the
    # replacement field is a SyntaxError before Python 3.12.
    print(f"\t RMSE: {metrics['rmse']}")
    print(f"\t MAE: {metrics['mae']}")
    print(f"\t R2: {metrics['r2']}")
    print(f"\t MAE (med.): {metrics['mae_median']}")

def metrics_class(metrics):
    """Print classification metrics, one tab-indented line per metric.

    Args:
        metrics: Mapping that provides the keys ``'accuracy'``,
            ``'precision'``, ``'recall'`` and ``'f1'``.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # Double-quoted f-strings: reusing the outer quote character inside the
    # replacement field is a SyntaxError before Python 3.12.
    print(f"\t Accuracy: {metrics['accuracy']}")
    print(f"\t Precision: {metrics['precision']}")
    print(f"\t Recall: {metrics['recall']}")
    print(f"\t F1: {metrics['f1']}")

In [28]:
def preproc(data):
    """Parse textual numeric vectors into a NumPy array.

    Each element of ``data`` is treated as a string of whitespace-separated
    numbers that may span several lines and may be wrapped in square
    brackets, e.g. ``'[0.1 0.2\\n 0.3]'``.

    Args:
        data: Iterable of such strings.

    Returns:
        ``np.ndarray`` built from the parsed rows (one row per input string).
    """
    rows = [
        # Newlines become spaces and the outer brackets are dropped so that
        # the remainder is a plain space-separated list of numbers.
        np.fromstring(text.replace('\n', ' ').strip('[]'), sep=' ').tolist()
        for text in data
    ]
    return np.array(rows)

In [33]:
# Evaluate the KNN baseline on every target of the dataset: 'agg' and
# 'switch_type' are handled as classification, all other targets as regression.
for item in dataset.targets:
    print(item)

    if item in ('agg', 'switch_type'):
        # Classification targets are requested with is_scaled=False.
        x_train, y_train = dataset.get_train(item, is_scaled=False)
        x_test, y_test = dataset.get_test(item, is_scaled=False)

        x_train_processed = preproc(x_train)
        x_test_processed = preproc(x_test)

        y_pred = knn(x_train_processed, y_train, x_test_processed, 'class')
        metrics_class(get_classification_metrics(y_pred, y_test))
        print('')
        continue

    # Regression targets: fit and predict with the dataset's default target
    # values, then report metrics both as-is and after rescale_targets().
    x_train, y_train = dataset.get_train(item)
    x_test, y_test = dataset.get_test(item)

    x_train_processed = preproc(x_train)
    x_test_processed = preproc(x_test)

    y_pred = knn(x_train_processed, y_train, x_test_processed, 'reg')

    y_test_rescaled = dataset.rescale_targets(y_test, item)
    y_pred_rescaled = dataset.rescale_targets(y_pred, item)

    # NOTE(review): predictions are passed before the ground truth here;
    # confirm this matches the argument order the metric helpers expect.
    print('Scaled:')
    metrics_reg(get_regression_metrics(y_pred, y_test))
    print('Rescaled:')
    metrics_reg(get_regression_metrics(y_pred_rescaled, y_test_rescaled))
    print('')


brightness
Scaled:
	 RMSE: 0.8649779640509915
	 MAE: 0.5486206497799242
	 R2: 0.3045003834261304
	 MAE (med.): 0.3480402924762287
Rescaled:
	 RMSE: 26.671208434412037
	 MAE: 16.916472222222225
	 R2: 0.3045003834261306
	 MAE (med.): 10.731666666666667

em_max
Scaled:
	 RMSE: 0.5803172595847197
	 MAE: 0.3649787673252432
	 R2: 0.6237802764003194
	 MAE (med.): 0.20854598241252914
Rescaled:
	 RMSE: 37.28794574931144
	 MAE: 23.451497005988028
	 R2: 0.6237802764003193
	 MAE (med.): 13.399999999999977

ex_max
Scaled:
	 RMSE: 0.7192895445138199
	 MAE: 0.4948954361019093
	 R2: 0.452173517072031
	 MAE (med.): 0.30179293745218827
Rescaled:
	 RMSE: 48.06498324145432
	 MAE: 33.07032755298651
	 R2: 0.4521735170720308
	 MAE (med.): 20.16666666666663

ext_coeff
Scaled:
	 RMSE: 0.954774649612648
	 MAE: 0.6621336212847821
	 R2: 0.2586937844734908
	 MAE (med.): 0.47946678493033645
Rescaled:
	 RMSE: 36046.98229471582
	 MAE: 24998.484126984127
	 R2: 0.2586937844734908
	 MAE (med.): 18102.0

lifetime
Scaled:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


	 Accuracy: 0.7762237762237763
	 Precision: 0.7624345351618079
	 Recall: 0.7762237762237763
	 F1: 0.7646065758365224

switch_type
	 Accuracy: 0.864406779661017
	 Precision: 0.7981510015408321
	 Recall: 0.864406779661017
	 F1: 0.8291210755917899



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
