# Обучение головы (FFN) на эмбеддингах

Если запускаете, полностью клонировав репозиторий локально

In [2]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

Если используете блокнот независимо от проекта

In [None]:
! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git
clear_output()

Импорт библиотек

In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import KFold

import optuna
from optuna.trial import Trial

from fpgen.prop_prediction.dataset import FPbase
from fpgen.prop_prediction.metrics import (
    get_regression_metrics,
    get_classification_metrics
)

## Подготовка данных

In [24]:
def preproc(line):
    clean_line = line.replace('\n', ' ').strip('[]')
    numbers = np.fromstring(clean_line, sep=' ')

    return numbers

In [31]:
dataset = FPbase('../data/dataset_embedd.csv', preprocess_function=preproc)
dataset.to_train_dataframe().head()

Unnamed: 0,sequence,brightness,em_max,ex_max,ext_coeff,lifetime,maturation,pka,stokes_shift,qy,agg,switch_type
558,"[-0.00530367903, 0.00854283385, 0.00128412445,...",-0.516789,-1.357357,-1.875798,-0.814071,,,0.32354,0.923046,-0.056729,m,b
149,"[-0.00167965586, 0.00413581124, 0.00335621182,...",-0.802832,-0.408006,-0.214689,-1.192834,,,1.878962,-0.403015,-0.465539,,b
184,"[-0.00447917636, -0.00560046919, 0.00733367307...",-1.040228,0.883734,0.758032,-0.257845,,,,0.074367,-1.725418,,b
291,"[-0.00244280021, 0.0043460303, 0.00649108412, ...",,-0.516948,-0.184759,,,,,-0.641706,,m,b
30,"[-0.00111842179, -0.00419556629, 0.00596778374...",,-0.610327,-0.244619,,,,,-0.694749,,t,b


## Обучение модели

In [40]:
TARGET = 'ex_max'
NUM_EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
NUM_FOLDS = 4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
class FNNRegressor(nn.Module):
    def __init__(self, input_size, hidden_sizes, dropout_rate):
        super().__init__()
        layers = []
        last_size = input_size

        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(last_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            last_size = hidden_size
            
        layers.append(nn.Linear(last_size, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x).squeeze(1)

In [42]:
X, y_series = dataset.get_train(TARGET, is_scaled=True)

y = y_series.values.reshape(-1, 1)

# --- Кросс-валидация ---
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=52)
all_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold+1}/{NUM_FOLDS} ---")

    # Подготовка данных
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                  torch.tensor(y_train, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                torch.tensor(y_val, dtype=torch.float32))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Модель, оптимизатор, лосс
    model = FNNRegressor(input_dim=X.shape[1]).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.MSELoss()

    # Обучение
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * xb.size(0)
        avg_loss = total_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1:3d} | Loss: {avg_loss:.4f}")

    # Оценка
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            pred = model(xb).cpu().numpy()
            preds.append(pred)
            targets.append(yb.numpy())
    preds = np.vstack(preds).flatten()
    targets = np.vstack(targets).flatten()

    # Метрики
    metrics = get_regression_metrics(y_pred=preds, y_true=targets)
    all_metrics.append(metrics)
    print(f"Fold {fold+1} Metrics: {metrics}")

# --- Средние метрики по фолдам ---
print("\n=== Average Metrics ===")
avg_metrics = {
    k: np.mean([m[k] for m in all_metrics])
    for k in all_metrics[0]
}
for name, value in avg_metrics.items():
    print(f"{name}: {value:.4f}")


--- Fold 1/4 ---


KeyError: '[2, 5, 16, 37, 40, 42, 46, 47, 69, 87, 92, 94, 107, 109, 114, 117, 128, 133, 136, 143, 144, 145, 155, 158, 167, 168, 172, 175, 179, 181, 191, 193, 208, 212, 220, 230, 232, 234, 241, 242, 243, 248, 249, 250, 251, 254, 255, 257, 260, 262, 270, 274, 278, 283, 285, 288, 289, 304, 316, 320, 338, 340, 352, 358, 359, 361, 389, 395, 403, 405, 406, 409, 411, 418, 419, 433, 436, 445, 455, 456, 466, 470, 471, 472, 475, 476, 482, 483, 485, 486, 488, 489, 491, 492, 500, 502, 503, 504, 506, 507, 510, 512, 514, 520, 523, 528, 536, 546, 553, 556, 557, 561, 572, 577, 583, 588, 590, 593, 594, 596, 601, 603, 609, 616, 624, 625, 630, 635, 642, 648, 657, 668, 669, 673, 674] not in index'