## Protein Embeddings Manipulation

In [7]:
from protera_stability.data import ProteinStabilityDataset
from pathlib import Path

data_path = Path("../data") 

!ls $data_path

dtm_Thermonuclease.h5	    fireprotdb_results.csv  tm_fireprot.h5
embeddings_tm_fireprot.pkl  stability_fireprot.h5   tm_Thermonuclease.h5


### Get Dataset

In [8]:
# Load the training split from its HDF5 file.
train_path = data_path / "tm_fireprot.h5"
train_set = ProteinStabilityDataset(proteins_path=train_path)

# NOTE(review): held-out set is disabled; "stability_test.h5" does not appear
# in the directory listing printed earlier in this notebook.
# test_set =  ProteinStabilityDataset(proteins_path=data_path / "stability_test.h5")

# Number of examples in the training set.
len(train_set)

4338

## Can a FeedForward Net do better?


In [107]:
from torch import nn
import torch

class ProteinMLP(nn.Module):
    """Feed-forward regressor with two hidden layers and a scalar output.

    Layer sizes are ``n_in -> n_units -> n_units // 2 -> 1``. The same
    activation is applied after each hidden layer; dropout follows the first
    hidden layer always and the second hidden layer only when ``last_drop``
    is true.

    Args:
        n_in: Number of input features per example.
        n_units: Width of the first hidden layer (the second hidden layer has
            ``n_units // 2`` units). Must be at least 2.
        act: Activation module used after each hidden layer. ``None`` selects
            ``nn.ReLU6()``.
        drop_p: Dropout probability.
        last_drop: Whether to apply dropout to the second hidden layer's
            activations before the output layer.

    Raises:
        ValueError: If ``n_units`` is smaller than 2 (the second hidden layer
            would have no units).
    """

    def __init__(self, n_in = 1280, n_units = 100, act = None, drop_p = 0.7, last_drop = True):
        super().__init__()
        if n_units < 2:
            raise ValueError(f"n_units must be >= 2, got {n_units}")

        self.fc1 = nn.Linear(n_in, n_units)
        # fc2 has to emit n_units // 2 features, because that is what fc3
        # consumes; a 1-wide fc2 makes forward() fail with a shape mismatch.
        self.fc2 = nn.Linear(n_units, n_units // 2)
        self.fc3 = nn.Linear(n_units // 2, 1)

        self.drop = nn.Dropout(p=drop_p)

        self.last_drop = last_drop
        self.act = nn.ReLU6() if act is None else act

    def forward(self, x):
        """Return one prediction per row of ``x`` (output has a trailing dim of 1)."""
        out = self.drop(self.act(self.fc1(x)))
        out = self.act(self.fc2(out))

        # Dropout on the last hidden layer is optional and applied at most
        # once; applying it unconditionally and then again under the flag
        # would make `last_drop=False` ineffective and square the keep-rate.
        if self.last_drop:
            out = self.drop(out)
        return self.fc3(out)

In [142]:
from skorch.callbacks import Checkpoint, LRScheduler, EarlyStopping
from skorch import NeuralNetRegressor

# --- Training configuration --------------------------------------------------
SEED = 0
CHECKPOINT_DIR = "models"
# skorch's default epoch budget, written out so the interaction with
# EarlyStopping below is visible.
MAX_EPOCHS = 10

# Prefer the second GPU, but fall back to whatever is available so the cell
# does not crash on machines with fewer than two GPUs.
if torch.cuda.device_count() > 1:
    DEVICE = "cuda:1"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Seed torch so weight init, batch shuffling and dropout are repeatable.
torch.manual_seed(SEED)

# --- Callbacks ---------------------------------------------------------------
cb = Checkpoint(dirname=CHECKPOINT_DIR)
# Scheduler policy is left at skorch's default; it is stepped once per batch.
sched = LRScheduler(
    step_every="batch"
)
# NOTE(review): patience (15) exceeds MAX_EPOCHS (10), so early stopping can
# never trigger with this budget; raise MAX_EPOCHS or lower patience.
stopper = EarlyStopping(patience=15)

# --- Model -------------------------------------------------------------------
net = NeuralNetRegressor(
    ProteinMLP,
    criterion=nn.MSELoss,
    # optimizer=torch.optim.Adam,
    # `momentum` is an SGD argument (skorch's default optimizer); drop it if
    # switching to Adam above, which does not accept it.
    optimizer__momentum=0.1,
    optimizer__weight_decay=1e-2,
    lr=3e-3,
    max_epochs=MAX_EPOCHS,
    module__n_units=512,
    module__drop_p=0.4,
    iterator_train__shuffle=True,
    device=DEVICE,
    callbacks=[cb, sched, stopper],
)

# Targets are reshaped to a column so they line up with the module's
# single-unit output layer.
net.fit(train_set.X, train_set.y.reshape(-1,1))

  epoch    train_loss    valid_loss    cp     dur
-------  ------------  ------------  ----  ------
      1        [36m0.7863[0m        [32m0.6606[0m     +  0.0678
      2        [36m0.6082[0m        0.7344        0.0663
      3        [36m0.5518[0m        0.8096        0.0650
      4        [36m0.5289[0m        0.7126        0.0646
      5        [36m0.4957[0m        [32m0.5123[0m     +  0.0700
      6        0.4999        0.5869        0.0691
      7        0.5003        0.6379        0.0642
      8        [36m0.4757[0m        0.7397        0.0711
      9        [36m0.4542[0m        0.5196        0.0645
     10        [36m0.4441[0m        [32m0.5048[0m     +  0.0665


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=ProteinMLP(
    (fc1): Linear(in_features=1280, out_features=512, bias=True)
    (fc2): Linear(in_features=512, out_features=256, bias=True)
    (fc3): Linear(in_features=256, out_features=1, bias=True)
    (drop): Dropout(p=0.4, inplace=False)
    (act): ReLU6()
  ),
)

In [143]:
# Regressor score (R^2, via scikit-learn's RegressorMixin) computed on the
# same data that was passed to `fit` above -- a training-set figure, not a
# held-out estimate.
net.score(train_set.X, train_set.y)

0.5876939916287223

In [113]:
# net.score(test_set.X, test_set.y)