In [1]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import time

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, else the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Enable cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True

In [3]:
# Mount Google Drive at /content/drive. Later cells read the dataset archive
# and the model checkpoint from paths under drive/MyDrive and copy the best
# checkpoint back there.
# NOTE(review): the google.colab import presumably restricts this notebook to
# a Colab runtime -- confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from zipfile import ZipFile

# Unpack the archive stored on Drive into the current working directory
# (presumably this produces the Data/ folder that get_loaders reads).
with ZipFile('drive/MyDrive/Data.zip', mode='r') as zipObj:
    zipObj.extractall(path='.')

In [5]:
class TrainDataset(Dataset):
    """Dataset that serves the rows of an in-memory NumPy array as float32 tensors."""

    def __init__(self, data):
        # The array is stored as-is; conversion to a tensor happens per item.
        self.data = data

    def __len__(self):
        # Number of samples == size of the first axis.
        n_rows = self.data.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Index the array first, then wrap the result and cast it to float32.
        row = self.data[idx]
        return torch.from_numpy(row).to(torch.float32)

In [6]:
def get_loaders(dataset, batch_size=64, shuffle=True, split=0.8):
    """Build train/validation DataLoaders from the CSV pair of one dataset.

    Reads ``Data/<dataset>_features.csv`` and ``Data/<dataset>_labels.csv``,
    drops the identifier columns, and appends the label column(s) to the right
    of the feature columns so that every sample is one row. The rows are
    optionally shuffled once and then cut into a leading training part and a
    trailing validation part. The sizes of both parts are printed.

    Args:
        dataset: File-name prefix of the CSV pair inside ``Data/``.
        batch_size: Number of rows per batch for both loaders.
        shuffle: If True, the rows are shuffled with NumPy's global RNG before
            splitting and the training loader reshuffles every epoch. The
            validation loader always keeps a fixed order.
        split: Fraction of rows (between 0 and 1) assigned to the training set.

    Returns:
        ``(train_loader, val_loader)``. Each batch is a single tensor whose
        trailing column(s) hold the labels.
    """
    assert 0 <= split <= 1, "split must be a fraction between 0 and 1"

    features = (
        pd.read_csv("Data/" + dataset + "_features.csv")
        .drop(columns=['Id', 'smiles'])
        .to_numpy()
    )
    labels = (
        pd.read_csv("Data/" + dataset + "_labels.csv")
        .drop(columns=['Id'])
        .to_numpy()
    )

    # One row per sample: feature columns first, label column(s) last.
    combined = np.hstack((features, labels))

    if shuffle:
        np.random.shuffle(combined)

    # Keep the fraction and the row index in separate names.
    n_train = int(split * combined.shape[0])
    train = combined[:n_train]
    val = combined[n_train:]

    print(train.shape[0])
    print(val.shape[0])

    train_dataset = TrainDataset(train)
    validation_dataset = TrainDataset(val)

    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only runtime merely produces a warning.
    pin_memory = torch.cuda.is_available()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle,
                              num_workers=0, pin_memory=pin_memory)
    # A sample-weighted validation average does not depend on batch order, so
    # the validation loader is never shuffled.
    val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False,
                            num_workers=0, pin_memory=pin_memory)

    return train_loader, val_loader

In [7]:
class MolecularNet(nn.Module):
    """Container for three sub-networks: an encoder, a decoder and a regressor.

    The sub-networks start out as ``None`` and are not built by this class; the
    ``forward_*`` helpers simply call whatever has been assigned to the
    matching attribute.
    """

    def __init__(self):
        super().__init__()

        # Placeholders only; nothing here constructs the sub-networks.
        self.encoder = self.decoder = self.regressor = None

    def forward_enc(self, x):
        """Return ``self.encoder`` applied to ``x``."""
        return self.encoder(x)

    def forward_dec(self, x):
        """Return ``self.decoder`` applied to ``x``."""
        return self.decoder(x)

    def forward_reg(self, x):
        """Return ``self.regressor`` applied to ``x``."""
        return self.regressor(x)

In [8]:
# Load the saved model object from Drive onto the CPU, then move it to the
# selected device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. Recent PyTorch releases may additionally require
# weights_only=False for whole-object pickles -- confirm against the installed
# version.
reg = torch.load('drive/MyDrive/model_best_edr_1.pt', map_location='cpu')
reg = reg.to(device)

In [9]:
# Objective: mean squared error between predictions and targets.
loss_fn = nn.MSELoss()

# Adam optimizer over every parameter of the loaded model.
optim = torch.optim.Adam(params=reg.parameters(), lr=0.001)

In [10]:
# Seed NumPy (row shuffle inside get_loaders) and PyTorch (per-epoch batch
# shuffling in the training loader) so the train/validation split is the same
# every time this cell is run.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# 80/20 train/validation split with mini-batches of 10 rows.
train_loader, val_loader = get_loaders(dataset="train", batch_size=10, split=0.8)

80
20


In [11]:
def evaluate(model, loss_fn, val_loader, device):
    """Compute the sample-weighted average loss of ``model`` over ``val_loader``.

    Each batch is a single 2-D tensor: the last column is the regression target
    and all preceding columns are the input features. Predictions come from
    ``model.forward_enc`` followed by ``model.forward_reg``, evaluated without
    gradient tracking.

    Args:
        model: Module exposing ``forward_enc`` and ``forward_reg``. It is
            switched to eval mode and left in that mode.
        loss_fn: Callable ``loss_fn(prediction, target)``. Every batch loss is
            weighted by the batch size before averaging, so the result is a
            per-sample mean only if ``loss_fn`` itself averages over the batch
            (assumed -- confirm for custom losses).
        val_loader: Iterable yielding the batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The summed, batch-size-weighted losses divided by the total number of
        samples (a tensor when ``loss_fn`` returns tensors).

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()

    weighted_loss_sum = 0.0
    num_eval_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            # Last column -> targets shaped (batch, 1); the rest -> features.
            targets = batch[:, -1].unsqueeze(1).to(device)
            inputs = batch[:, :-1].to(device)

            predictions = model.forward_reg(model.forward_enc(inputs))
            loss = loss_fn(predictions, targets)

            num_samples_batch = inputs.shape[0]
            num_eval_samples += num_samples_batch
            weighted_loss_sum += loss * num_samples_batch

    # Fail with a clear message instead of an opaque ZeroDivisionError.
    if num_eval_samples == 0:
        raise ValueError("val_loader yielded no samples; cannot average the validation loss")

    return weighted_loss_sum / num_eval_samples


In [12]:
def train_loop(model, train_loader, val_loader, loss_fn, optim, device, show=1, save=40, epochs=200):
    """Train ``model`` on ``train_loader`` and validate after every epoch.

    Every batch is a single 2-D tensor whose last column is the target and
    whose remaining columns are the features. The encoder forward pass runs
    under ``torch.no_grad()``, so gradients only flow through
    ``model.forward_reg``.

    Reported losses are the square root of the sample-weighted average of
    ``loss_fn`` over the epoch (training) or over ``val_loader`` (validation).

    Side effects:
        * Prints statistics every ``show`` epochs.
        * Saves the whole model to ``full_model_epoch_<epoch>.pt`` every
          ``save`` epochs.
        * Saves the whole model to ``full_model_best.pt`` whenever the
          validation loss improves.
        Both kinds of save are skipped during the first two epochs.

    Args:
        model: Module exposing ``forward_enc`` and ``forward_reg``.
        train_loader: Iterable yielding training batches.
        val_loader: Iterable yielding validation batches (passed to ``evaluate``).
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optim: Optimizer stepped once per training batch.
        device: Device the batches are moved to.
        show: Print interval in epochs.
        save: Periodic checkpoint interval in epochs.
        epochs: Number of epochs to run.

    Returns:
        None.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    line = False
    print(f'Start training model')
    best_round = 0
    # A true infinity, so any finite validation loss counts as an improvement.
    cur_low_val_eval = float('inf')

    for epoch in range(1, epochs + 1):
        # reset statistics trackers
        train_loss_cum = 0.0
        num_samples_epoch = 0
        t = time.time()

        # evaluate() leaves the model in eval mode, so switch back every epoch.
        # NOTE(review): the encoder therefore also runs in train mode although
        # it receives no gradients; if it contains dropout or batch-norm layers
        # they would behave accordingly -- confirm this is intended.
        model.train()

        # Go once through the training dataset (-> epoch)
        for x_batch in train_loader:
            # last column -> targets shaped (batch, 1)
            y_batch = x_batch[:, -1]
            y_batch = torch.reshape(y_batch, (y_batch.shape[0], 1))
            y_batch = y_batch.to(device)

            # remaining columns -> features, moved to the target device
            x_batch = x_batch[:, :-1].to(device)

            optim.zero_grad()

            # forward pass through the encoder without gradient tracking
            with torch.no_grad():
                x_enc = model.forward_enc(x_batch)

            # forward pass through the regressor (the only part that is trained)
            x_reg = model.forward_reg(x_enc)

            loss = loss_fn(x_reg, y_batch)

            # backward pass and gradient step
            loss.backward()
            optim.step()

            # keep track of train stats; detach so the running sum does not
            # hold on to the autograd history of every batch
            num_samples_batch = x_batch.shape[0]
            num_samples_epoch += num_samples_batch
            train_loss_cum += loss.detach() * num_samples_batch

        # Fail with a clear message instead of an opaque ZeroDivisionError.
        if num_samples_epoch == 0:
            raise ValueError("train_loader yielded no samples; nothing to train on")

        # average the accumulated statistics and report their square root
        avg_train_loss = torch.sqrt(train_loss_cum / num_samples_epoch).item()

        val_loss = torch.sqrt(evaluate(model, loss_fn, val_loader, device)).item()
        epoch_duration = time.time() - t

        # print some infos
        if epoch % show == 0:
            line = True
            print(f'Epoch {epoch} | Duration {epoch_duration:.2f} sec')
            print(f'Train loss:      {avg_train_loss:.4f}')
            print(f'Validation loss: {val_loss:.4f}')

        # save checkpoint of model
        if epoch % save == 0 and epoch > 2:
            line = True
            save_path = f'full_model_epoch_{epoch}.pt'
            torch.save(model, save_path)
            print(f'Saved model checkpoint to {save_path}')

        # keep the model with the lowest validation loss seen so far
        if cur_low_val_eval > val_loss and epoch > 2:
            cur_low_val_eval = val_loss
            best_round = epoch
            save_path = f'full_model_best.pt'
            torch.save(model, save_path)

        # blank separator line after any epoch that printed something
        if line:
            print()
            line = False

    print(f'Lowess validation loss: {cur_low_val_eval:.4f} in Round {best_round}')

In [15]:
# Training run: 10000 epochs, statistics printed every 500 epochs and a
# periodic checkpoint written every 1000 epochs.
train_loop(
    model=reg,
    train_loader=train_loader,
    val_loader=val_loader,
    loss_fn=loss_fn,
    optim=optim,
    device=device,
    show=500,
    save=1000,
    epochs=10000,
)

Start training model
Epoch 500 | Duration 0.02 sec
Train loss:      0.2822
Validation loss: 0.2036

Epoch 1000 | Duration 0.03 sec
Train loss:      0.1826
Validation loss: 0.1859
Saved model checkpoint to full_model_epoch_1000.pt

Epoch 1500 | Duration 0.02 sec
Train loss:      0.1460
Validation loss: 0.1844

Epoch 2000 | Duration 0.02 sec
Train loss:      0.0867
Validation loss: 0.1848
Saved model checkpoint to full_model_epoch_2000.pt

Epoch 2500 | Duration 0.03 sec
Train loss:      0.0681
Validation loss: 0.2007

Epoch 3000 | Duration 0.03 sec
Train loss:      0.0706
Validation loss: 0.2107
Saved model checkpoint to full_model_epoch_3000.pt

Epoch 3500 | Duration 0.03 sec
Train loss:      0.0635
Validation loss: 0.2063

Epoch 4000 | Duration 0.03 sec
Train loss:      0.0577
Validation loss: 0.2056
Saved model checkpoint to full_model_epoch_4000.pt

Epoch 4500 | Duration 0.03 sec
Train loss:      0.0564
Validation loss: 0.1996

Epoch 5000 | Duration 0.02 sec
Train loss:      0.0655
V

In [16]:
# Lowest validation loss noted from earlier runs (presumably the value printed by train_loop):
#   0.1947 with an 80/20 train/validation split
#   0.1652 with a 90/10 train/validation split
# Copy the best checkpoint written by train_loop back to Google Drive.
!cp full_model_best.pt drive/MyDrive/full_model_best.pt