## Baseline-решение предсказания свойств флуоресцентных белков

### Подготовка к работе

In [1]:
! pip install scikit-learn pandas numpy 



In [1]:
from ast import literal_eval
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

### Загрузка данных

Загрузим данные из csv-файла и разделим их на обучающую и тестовую выборки (80/20).

In [2]:
# Load the full dataset from dataset.csv.
df = pd.read_csv("dataset.csv")
# Hold out 20% of the rows as the test split; random_state is fixed at 2009.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=2009) # test_size - size of the test split

In [3]:
df.head()

Unnamed: 0,sequence,brightness,em_max,ex_max,ext_coeff,lifetime,maturation,pka,stokes_shift,qy,agg
0,[-2.26386846e-03 6.68725092e-03 3.58870206e-...,,525.0,513.0,30800.0,,,,12.0,,
1,[-1.83722540e-03 6.83323620e-03 3.56744439e-...,,512.0,502.0,33000.0,,,,10.0,,
2,[-3.97243258e-03 -2.33210856e-03 1.56618573e-...,,,,,,,,,,t
3,[-4.02820995e-03 5.53409010e-03 5.62726986e-...,,514.0,484.0,,,,,30.0,,
4,[-1.98068423e-03 5.59785357e-03 3.69466911e-...,,524.0,512.0,19400.0,,,,12.0,,


### Реализация метрик

In [42]:
def regress_metrics(y_test, y_pred):
    """Return the regression metrics as a one-element list ``[RMSE]``.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        A list holding the root of the mean squared error between
        ``y_test`` and ``y_pred``.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return [rmse]

def classif_metrics(y_test, y_pred):
    """Return the classification metrics for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        ``[accuracy, precision, recall, f1]``; precision, recall and f1 are
        computed with ``average='weighted'``.
    """
    weighted_scorers = (precision_score, recall_score, f1_score)
    scores = [accuracy_score(y_test, y_pred)]
    scores.extend(
        scorer(y_test, y_pred, average='weighted') for scorer in weighted_scorers
    )
    return scores

In [None]:
def preproc(row):
    """Parse the textual embedding stored in ``row['sequence']``.

    The value is first tried as a Python literal (e.g. ``"[1.0, 2.0]"``).
    Whitespace-separated text such as ``"[1.0 2.0]"`` is not a valid literal,
    so on failure the brackets are stripped and the numbers are read with
    ``np.fromstring`` — the same fallback ``get_target`` uses.

    Args:
        row: Mapping-like object (dict or DataFrame row) whose ``'sequence'``
            entry is the embedding as a string.

    Returns:
        The embedding as a ``float32`` NumPy array.
    """
    raw = row['sequence']
    try:
        values = literal_eval(raw)
    except (SyntaxError, ValueError):
        # Space-separated numbers: either a syntax error or, for "a -b",
        # a malformed-node ValueError from literal_eval.
        values = np.fromstring(raw.strip('[]'), sep=' ', dtype=np.float32)
    return np.asarray(values, dtype=np.float32)

# preproc reads row['sequence'], so it must be applied row-wise to the frame
# (axis=1); rows without a stored sequence are skipped first.
embeddings = df.dropna(subset=['sequence']).apply(preproc, axis=1)

### KNN

Извлечем последовательности и целевые значения из датафрейма. Оставляем только те записи, у которых значение целевого свойства не пропущено.

In [43]:
def get_target(df, target):
    """Collect embeddings and target values for rows where ``target`` is set.

    Args:
        df: DataFrame with a ``'sequence'`` column and a ``target`` column.
        target: Name of the property column to extract.

    Returns:
        ``[embeddings, values]`` as two NumPy arrays, row-aligned; rows whose
        ``target`` value is missing are left out.
    """
    def _parse_sequence(raw):
        # Non-string values are taken as already-parsed embeddings.
        if not isinstance(raw, str):
            return raw
        try:
            return literal_eval(raw)
        except (SyntaxError, ValueError):
            # Not a Python literal: read it as space-separated numbers.
            return np.fromstring(raw.strip('[]'), sep=' ', dtype=np.float32)

    embeddings = []
    values = []
    for _, record in df.iterrows():
        value = record[target]
        if pd.isna(value):
            continue
        embeddings.append(_parse_sequence(record['sequence']))
        values.append(value)

    return [np.array(embeddings), np.array(values)]  # [embeddings, properties]

Настроим KNN-регрессию: найдем лучшее количество соседей (k) методом кросс-валидации.

In [44]:
def regressor_knn(X_train, X_test, y_train, y_test):
    """Tune and evaluate a k-nearest-neighbours regressor.

    The number of neighbours is picked from 1..29 by a 5-fold cross-validated
    grid search on the training data, scored with negative mean squared
    error. The selected model is then evaluated on the test data.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.

    Returns:
        A 4-element list ``[metrics, best_k, model, predictions]`` where
        ``metrics`` is the output of ``regress_metrics`` on the test set.
        The length matters: ``metrics`` treats 4-element results as
        regression results.
    """
    # NOTE(review): k up to 29 presumes every CV training fold holds at least
    # 29 samples — confirm for sparsely populated targets.
    params = {'n_neighbors': range(1, 30)}

    grid = GridSearchCV(KNeighborsRegressor(), params, cv=5, scoring='neg_mean_squared_error')  # MSE
    grid.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on the whole training set by
    # default (refit=True), so a second manual fit is unnecessary.
    knn_regressor = grid.best_estimator_
    best_k = grid.best_params_['n_neighbors']
    predictions = knn_regressor.predict(X_test)

    # metrics, hyperparameter, fitted model and predicted values
    return [regress_metrics(y_test, predictions), best_k, knn_regressor, predictions]

Настроим KNN классификацию, найдем лучшее количество соседей (k) методом кросс-валидации.

In [45]:
def classification_knn(X_train, X_test, y_train, y_test):
    """Tune and evaluate a k-nearest-neighbours classifier.

    The number of neighbours is picked from 1..29 by a 5-fold cross-validated
    grid search on the training data, scored with accuracy. The selected
    model is then evaluated on the test data.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A 5-element list ``[metrics, cv_score, best_k, model, predictions]``
        where ``metrics`` is the output of ``classif_metrics`` on the test
        set and ``cv_score`` is the grid search's best cross-validated
        accuracy.
    """
    # Candidate values of k (number of nearest neighbours) to search over.
    # NOTE(review): k up to 29 presumes every CV training fold holds at least
    # 29 samples — confirm for small datasets.
    params = {'n_neighbors': range(1, 30)}

    grid = GridSearchCV(KNeighborsClassifier(), params, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on the whole training set by
    # default (refit=True), so a second manual fit is unnecessary.
    knn_classif = grid.best_estimator_
    best_k = grid.best_params_['n_neighbors']
    predictions = knn_classif.predict(X_test)

    # metrics, CV score, hyperparameter, fitted model and predicted values
    return [classif_metrics(y_test, predictions), grid.best_score_, best_k, knn_classif, predictions]

### Результаты и метрики

In [48]:
def metrics(results, target):
    """Print a one-line metric summary for one target.

    Args:
        results: Result list from a model routine. A list of length 4 is
            reported as regression (RMSE and k); any other length is
            reported as classification (accuracy, precision, recall, f1).
        target: Target name, printed left-aligned in a 12-character field.
    """
    if len(results) != 4:
        # Classification: results[0] holds [accuracy, precision, recall, f1].
        pieces = [
            f'{target:<12} | accuracy: {results[0][0]}',
            f', precision: {results[0][1]}',
            f', recall: {results[0][2]}',
            f', f1: {results[0][3]}',
        ]
        print(''.join(pieces))
        return

    # Regression: results[0] holds [RMSE], results[1] is the chosen k.
    print(f'{target:<12} | RMSE: {results[0][0]}, k: {results[1]}')

Выведем полученные метрики.

In [49]:
# Protein properties to model; 'agg' is the only one treated as a classification target.
target = [
    'brightness', 'em_max', 'ex_max', 'ext_coeff', 'lifetime',
    'maturation', 'pka', 'stokes_shift', 'qy', 'agg',
]

for item in target:
    params_train = get_target(df_train, item)
    params_test = get_target(df_test, item)

    # Choose the model routine by target type, then print its metrics.
    fit_and_score = classification_knn if item == 'agg' else regressor_knn
    metrics(fit_and_score(params_train[0], params_test[0], params_train[1], params_test[1]), item)

brightness   | RMSE: 25.412417005672154, k: 5
em_max       | RMSE: 35.23066413115074, k: 2
ex_max       | RMSE: 50.016655462949764, k: 4
ext_coeff    | RMSE: 33822.692336437874, k: 5
lifetime     | RMSE: 0.7843074616784187, k: 24
maturation   | RMSE: 287.7179410464352, k: 5
pka          | RMSE: 1.0688153072720366, k: 7
stokes_shift | RMSE: 36.83156211636082, k: 18
qy           | RMSE: 0.23116155185839865, k: 3
agg          | accuracy: 0.6230769230769231, precision: 0.587531305903399, recall: 0.6230769230769231, f1: 0.5995106780012441
