## Baseline-решение для предсказания свойств ФБ с помощью KNN

### Подготовка к работе

In [None]:
! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git

In [None]:
from fpgen.prop_prediction.dataset import FPbase
from fpgen.generation.metrics import identity

from fpgen.prop_prediction.metrics import get_regression_metrics, get_classification_metrics

from sklearn.model_selection import train_test_split, KFold

import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

### Загрузка датасета

In [None]:
# Wrap the property dataset stored in CSV with the project's FPbase loader.
dataset = FPbase('../data/dataset.csv')
# Pairwise sequence matrix; the first CSV column is used as the row index.
# NOTE(review): the file is named "distance" matrix while the code below
# treats larger values as "closer" -- confirm what the values represent.
df_ident = pd.read_csv('../data/sequence_distance_matrix.csv', index_col=0)

### Реализация алгоритма KNN

Поиск расстояния между последовательностями аминокислот.

In [None]:
def ident(seq_1, seq_2):
    """Return the precomputed pairwise value for two sequences from ``df_ident``.

    Surrounding whitespace is stripped from both sequences before they are
    used as the row label (``seq_1``) and the column label (``seq_2``).

    NOTE(review): the matrix is read from 'sequence_distance_matrix.csv'
    while this helper is named ``ident`` -- confirm whether the stored values
    are identities (higher = closer) or distances (lower = closer).
    """
    row_label = seq_1.strip()
    col_label = seq_2.strip()
    return df_ident.loc[row_label, col_label]

Реализация KNN для задач регрессии и классификации.

In [None]:
def knn(x_train, y_train, x_test, k, knn_type):
    """Predict a target for every sequence in ``x_test`` from its k nearest
    training sequences.

    For each test sequence, every training sequence is scored with
    ``ident(test_seq, train_seq)``; the ``k`` training items with the highest
    scores are taken as neighbours.

    NOTE(review): neighbours are the items with the *largest* ``ident`` value.
    That is correct if the matrix stores identities/similarities; if it stores
    distances the ordering must be inverted -- confirm.

    Args:
        x_train: iterable of training sequences.
        y_train: iterable of training targets, aligned with ``x_train``.
        x_test: iterable of sequences to predict for.
        k: number of neighbours to use; must be at least 1.
        knn_type: ``'reg'`` to average the neighbours' targets, ``'class'``
            to take the most frequent neighbour target.

    Returns:
        A list with one prediction per element of ``x_test``.

    Raises:
        ValueError: if ``knn_type`` is not ``'reg'``/``'class'`` or ``k < 1``.
    """
    # Previously an unknown mode silently produced an empty prediction list.
    if knn_type not in ('reg', 'class'):
        raise ValueError(
            f"knn_type must be 'reg' or 'class', got {knn_type!r}"
        )
    # k == 0 yields an empty neighbourhood and a negative k would silently
    # drop neighbours from the end of the ranking.
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k!r}')

    predictions = []
    for test_seq in x_test:
        scored = [
            (train_target, ident(test_seq, train_seq))
            for train_seq, train_target in zip(x_train, y_train)
        ]
        # Highest score first; the sort is stable, so equal scores keep
        # their training-set order.
        scored.sort(key=lambda pair: -pair[1])
        neighbor_targets = [target for target, _ in scored[:k]]

        if knn_type == 'reg':
            predictions.append(np.mean(neighbor_targets))
        else:
            # On a tie, most_common returns the class seen first, i.e. the
            # one belonging to the closer neighbour.
            predictions.append(Counter(neighbor_targets).most_common(1)[0][0])
    return predictions

Подбор гиперпараметров методом кросс-валидации.

In [None]:
def cross_validate(x_train, y_train, kf_split, k_max, problem_type='class'):
    """Pick the number of neighbours for ``knn`` by K-fold cross-validation.

    Every ``k`` in ``1..k_max`` is evaluated on ``kf_split`` folds of the
    training data. Predictions that are NaN are excluded from the fold score;
    folds with no valid prediction are skipped, as are values of ``k`` for
    which no fold produced a score.

    Args:
        x_train: training sequences; indexed positionally via ``.iloc``.
        y_train: training targets; indexed positionally via ``.iloc``.
        kf_split: number of folds passed to ``KFold``.
        k_max: largest ``k`` to try (inclusive).
        problem_type: ``'reg'`` selects the ``k`` with the lowest mean RMSE,
            ``'class'`` the ``k`` with the highest mean accuracy.

    Returns:
        The best ``k``. Ties keep the smallest ``k`` (comparisons are strict);
        1 is returned when no ``k`` could be scored.

    Raises:
        ValueError: if ``problem_type`` is not ``'reg'`` or ``'class'``.
    """
    # Fail fast: an unknown mode used to fall through every branch and
    # surface later as a NameError.
    if problem_type not in ('reg', 'class'):
        raise ValueError(
            f"problem_type must be 'reg' or 'class', got {problem_type!r}"
        )

    is_regression = problem_type == 'reg'
    kf = KFold(n_splits=kf_split)

    # RMSE is minimised, accuracy is maximised.
    best_score = np.inf if is_regression else -np.inf
    best_k = 1

    for k in tqdm(range(1, k_max + 1)):
        fold_scores = []

        for train_index, test_index in kf.split(x_train):
            X_train, X_test = x_train.iloc[train_index], x_train.iloc[test_index]
            y_train_fold, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

            predict = knn(X_train, y_train_fold, X_test, k, knn_type=problem_type)

            # Score only the positions where a prediction was produced.
            valid_mask = ~pd.isna(predict)
            if not valid_mask.any():
                continue

            valid_predict = np.array(predict)[valid_mask]
            valid_target = y_test[valid_mask]
            if is_regression:
                metrics = get_regression_metrics(valid_predict, valid_target)
                fold_scores.append(metrics['rmse'])
            else:
                metrics = get_classification_metrics(valid_predict, valid_target)
                fold_scores.append(metrics['accuracy'])

        if not fold_scores:
            continue

        mean_score = np.mean(fold_scores)

        if is_regression:
            improved = mean_score < best_score
        else:
            improved = mean_score > best_score
        if improved:
            best_score = mean_score
            best_k = k

    return best_k

## Метрики

In [None]:
def metrics_reg(metrics):
    """Print the regression metrics, one tab-indented line per metric.

    Args:
        metrics: mapping with the keys ``'rmse'``, ``'mae'``, ``'r2'`` and
            ``'mae_median'``.
    """
    # Inner quotes differ from the outer ones: reusing the same quote inside
    # an f-string is a SyntaxError before Python 3.12.
    print(f'\t RMSE: {metrics["rmse"]}')
    print(f'\t MAE: {metrics["mae"]}')
    print(f'\t R2: {metrics["r2"]}')
    print(f'\t MAE (med.): {metrics["mae_median"]}')

def metrics_class(metrics):
    """Print the classification metrics, one tab-indented line per metric.

    Args:
        metrics: mapping with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'f1'``.
    """
    # Inner quotes differ from the outer ones: reusing the same quote inside
    # an f-string is a SyntaxError before Python 3.12.
    print(f'\t Accuracy: {metrics["accuracy"]}')
    print(f'\t Precision: {metrics["precision"]}')
    print(f'\t Recall: {metrics["recall"]}')
    print(f'\t F1: {metrics["f1"]}')

In [None]:
# Evaluate the KNN baseline on every target exposed by the dataset:
# 'agg' and 'switch_type' are treated as classification, the rest as regression.
for item in dataset.targets:
    print(item)

    if item in ('agg', 'switch_type'):
        # Classification targets are requested unscaled.
        x_train, y_train = dataset.get_train(item, is_scaled=False)
        x_test, y_test = dataset.get_test(item, is_scaled=False)

        k = cross_validate(
            x_train, y_train, kf_split=4, k_max=30, problem_type='class'
        )
        print(f'k: {k}')
        y_pred = knn(x_train, y_train, x_test, k, knn_type='class')
        metrics_class(get_classification_metrics(y_pred, y_test))
        print('')
    else:
        x_train, y_train = dataset.get_train(item)
        x_test, y_test = dataset.get_test(item)

        k = cross_validate(
            x_train, y_train, kf_split=4, k_max=30, problem_type='reg'
        )
        print(f'k: {k}')
        y_pred = knn(x_train, y_train, x_test, k, knn_type='reg')

        # Metrics are reported both on the values as returned by the dataset
        # and after passing them through dataset.rescale_targets.
        y_test_rescaled = dataset.rescale_targets(y_test, item)
        y_pred_rescaled = dataset.rescale_targets(y_pred, item)

        print('Scaled:')
        metrics_reg(get_regression_metrics(y_pred, y_test))
        print('Rescaled:')
        metrics_reg(get_regression_metrics(y_pred_rescaled, y_test_rescaled))
        print('')
