## Предсказания свойств ФБ, с помощью CNN (эмбеддинги (ESM C))

### Подготовка к работе

In [14]:
! pip install torch tqdm
#! pip install --upgrade git+https://github.com/rimgro/biocadprotein.git



In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [23]:
from fpgen.prop_prediction.dataset import FPbase
from fpgen.prop_prediction.metrics import get_regression_metrics

from torch.utils.data import TensorDataset, DataLoader

### Загрузка датасета

In [17]:
dataset = FPbase('../data/dataset.csv')

In [18]:
def load_dataset(target):
    x_train, y_train = dataset.get_train(target)
    x_test, y_test = dataset.get_test(target)

    return x_train, y_train, x_test, y_test

In [20]:
import pickle

with open('../data/sequence.pickle', 'rb') as file:
    seq = pickle.load(file)

### Подготовка данных (padding эмбеддингов)

In [30]:
def preprocessing_dataset(x_tr, y_train, x_t, y_test):
    matrix_tr = []
    for i in range(len(x_tr)):
        matrix_tr.append(seq[x_tr.iloc[i]])

    matrix_t = []
    for i in range(len(x_t)):
        matrix_t.append(seq[x_t.iloc[i]])

    max_h = max(max(t.shape[1] for t in matrix_tr), max(t.shape[1] for t in matrix_t))
    max_w = max(max(t.shape[2] for t in matrix_tr), max(t.shape[2] for t in matrix_t))

    def pad_tensor_list(tensor_list):
        padded = []
        for t in tensor_list:
            c, h, w = t.shape
            pad_h = max_h - h
            pad_w = max_w - w
            padded_tensor = F.pad(t, (0, pad_w, 0, pad_h))
            padded.append(padded_tensor)
        return torch.stack(padded)

    x_train_p = pad_tensor_list(matrix_tr)
    x_test_p = pad_tensor_list(matrix_t)

    y_train_p = torch.tensor(y_train.to_numpy(), dtype=torch.float32).view(-1, 1)
    y_test_p = torch.tensor(y_test.to_numpy(), dtype=torch.float32).view(-1, 1)

    return x_train_p, y_train_p, x_test_p, y_test_p

### Архитектура CNN

In [24]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2)
        self.fc1 = nn.Linear(32 * 184 * 240, 128) 
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

### Обучение модели

In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [43]:
def learning_model(train_loader, test_loader):
    model = CNN()
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = nn.MSELoss()

    for epoch in tqdm(range(100)):
        model.train()
        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            preds = model(xb)
            loss = loss_fn(preds, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch}: loss = {loss.item():.4f}")

    model = model.to('cpu')
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for xb, yb in test_loader:
            preds = model(xb)
            y_true.extend(yb.cpu().numpy().flatten())
            y_pred.extend(preds.cpu().numpy().flatten())

    return y_pred, y_true


### Тестирование и метрики

In [27]:
def get_metrics(y_pred, y_test, target):
    metrics = get_regression_metrics(y_pred, y_test)
    print(f'\t RMSE: {metrics['rmse']}')
    print(f'\t MAE: {metrics['mae']}')
    print(f'\t R2: {metrics['r2']}')
    print(f'\t MAE (med.): {metrics['mae_median']}')


In [31]:
for item in dataset.targets:
    if item != 'agg' and item != 'switch_type':
        print(item)
        x_train, y_train, x_test, y_test = load_dataset(item)

        x_train_p, y_train_p, x_test_p, y_test_p = preprocessing_dataset(x_train, y_train, x_test, y_test)
        
        dataset_train = TensorDataset(x_train_p, y_train_p)
        train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True)

        dataset_test = TensorDataset(x_test_p, y_test_p)
        test_loader = DataLoader(dataset_test, batch_size=32, shuffle=True)

        y_pred, y_true = learning_model(train_loader, test_loader)

        get_metrics(y_pred, y_true)


brightness


AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [None]:
y_true_rescaled = dataset.rescale_targets(y_true, 'em_max')
y_pred_rescaled = dataset.rescale_targets(y_pred, 'em_max')

get_regression_metrics(y_pred_rescaled, y_true_rescaled)

{'rmse': 32.17417,
 'mae': 22.368298,
 'r2': 0.7198959653410872,
 'mae_median': 14.933685}