In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch
import time
import random
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# fix seeds
# One shared value seeds every random number generator used in this notebook
# (Python's stdlib, PyTorch and NumPy) so repeated runs draw the same numbers.
SEED = 13
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Run on the first CUDA GPU when one is usable, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Enable the cuDNN auto-tuner flag.
# NOTE(review): auto-tuned algorithm selection can vary between runs, which may
# work against the fixed seeds - confirm this trade-off is intended.
torch.backends.cudnn.benchmark = True

In [None]:
# mount drive to access data
# Requires the google.colab package (i.e. a Colab runtime); the drive is
# mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# move data to working directory and import our library
# Data.zip is unpacked into the current working directory. The relative
# 'drive/MyDrive/...' paths assume the notebook runs from the directory that
# contains the mounted 'drive' folder (presumably /content on Colab).
from zipfile import ZipFile
with ZipFile('drive/MyDrive/Data.zip','r') as zipObj:
  zipObj.extractall('.')

# copy the helper module into the working directory so it can be imported
!cp drive/MyDrive/task4_lib.py .

# NOTE(review): the star import hides which names come from task4_lib. The
# cells visible here appear to use get_loaders, pre_train_loop and train_loop
# from it - consider importing those explicitly.
from task4_lib import *

In [None]:
# model definition for pretrain: autoencoder and regressor in one model
# optimize for both reconstruction loss and regression loss
# at the same time to get good compressed feature representation
class MolecularNet(nn.Module):
    """Autoencoder with a linear regression head on the latent code.

    The encoder compresses a 1000-dimensional input to 20 features, the
    decoder reconstructs the input from those features and the regressor
    predicts a single value from them. ``forward`` returns the training
    objective (weighted reconstruction MSE + regression MSE), not a
    prediction.

    Args:
        reconstruction_weight: factor applied to the reconstruction MSE
            before it is added to the regression MSE. Defaults to 1.5, the
            value that used to be hard-coded.

    Attributes:
        reconstruction_loss: weighted reconstruction loss of the most recent
            ``forward`` call (``None`` before the first call).
        regression_loss: regression loss of the most recent ``forward`` call
            (``None`` before the first call).
    """

    # Class-level fallback: instances that were pickled before this attribute
    # existed still resolve to the original factor after unpickling.
    reconstruction_weight = 1.5

    def __init__(self, reconstruction_weight=1.5):
        super().__init__()

        self.reconstruction_weight = reconstruction_weight

        # 1000 -> 900 -> 500 -> 20
        self.encoder = nn.Sequential(
            nn.Linear(1000, 900),
            nn.LeakyReLU(),
            nn.Linear(900, 500),
            nn.LeakyReLU(),
            nn.Linear(500, 20),
        )

        # mirror image of the encoder: 20 -> 500 -> 900 -> 1000
        self.decoder = nn.Sequential(
            nn.Linear(20, 500),
            nn.LeakyReLU(),
            nn.Linear(500, 900),
            nn.LeakyReLU(),
            nn.Linear(900, 1000),
        )

        # single linear layer on top of the latent features
        self.regressor = nn.Sequential(
            nn.Linear(20, 1)
        )

        self.loss_fn = nn.MSELoss()

        self.reconstruction_loss = None
        self.regression_loss = None

    def forward(self, x, y):
        """Compute the combined pretraining loss for one batch.

        Args:
            x: input batch whose last dimension is 1000.
            y: regression targets, one value per sample. Any shape with the
                same number of elements as the regressor output is accepted
                (e.g. ``(batch,)`` or ``(batch, 1)``).

        Returns:
            Scalar tensor: ``reconstruction_loss + regression_loss``. Both
            terms are also stored on the module so a caller can read them
            separately.

        Raises:
            RuntimeError: if ``y`` cannot be reshaped to the shape of the
                regressor output.
        """

        # calculate loss for autoencoder

        f = self.encoder(x)
        d = self.decoder(f)

        # prediction first, target second (the nn.MSELoss argument order)
        self.reconstruction_loss = self.reconstruction_weight * self.loss_fn(d, x)

        # calculate loss for regressor

        r = self.regressor(f)

        # The regressor output is (batch, 1). A (batch,) target would be
        # broadcast against it to (batch, batch) and silently give a wrong
        # loss, so bring the target to the prediction's shape first.
        if y.shape != r.shape:
            y = y.reshape(r.shape)

        self.regression_loss = self.loss_fn(r, y)

        # return sum of both losses

        return self.reconstruction_loss + self.regression_loss

In [None]:
# get dataloaders for pretrain dataset
# function definition in task4_lib.py
# pretrain_loader and preval_loader split the dataset 80/20
# prefull_loader goes over the whole pretrain dataset
# no batch_size is passed here, so the default of get_loaders applies
# NOTE(review): the two numbers printed by this cell are presumably the sizes
# of the train and validation split - confirm in task4_lib.py
pretrain_loader, preval_loader, prefull_loader = get_loaders(dataset="pretrain")

40000
10000


In [None]:
# Build the joint autoencoder/regressor, place it on the selected device and
# create the Adam optimizer that updates all of its parameters.
moc = MolecularNet()
moc = moc.to(device)
optim = torch.optim.Adam(params=moc.parameters(), lr=1e-3)

In [None]:
# train loop for the pretrain dataset, definition in task4_lib.py
# keeps track of reconstruction and regression loss
# to train on the whole pretrain dataset, pass prefull_loader as both the
# train loader and the validation loader (the validation scores are then
# meaningless)
# NOTE(review): show/save/epochs presumably set the logging interval, the
# checkpoint interval and the number of epochs - confirm in task4_lib.py.
# A later cell loads "model_epoch_12.pt", so this loop has to run at least
# until that checkpoint has been written.
pre_train_loop(moc, pretrain_loader, preval_loader, optim, device, show=1, save=6, epochs=30)

Epoch 0 | Train reconstruction loss: 0.0463 |  Train regression loss: 0.3160 |  Validation reconstruction loss: 0.0371 |  Validation regression loss: 0.1127 |  Duration 4.18 sec
Epoch 1 | Train reconstruction loss: 0.0314 |  Train regression loss: 0.0947 |  Validation reconstruction loss: 0.0269 |  Validation regression loss: 0.0959 |  Duration 5.28 sec
Epoch 2 | Train reconstruction loss: 0.0239 |  Train regression loss: 0.0849 |  Validation reconstruction loss: 0.0219 |  Validation regression loss: 0.0974 |  Duration 4.93 sec
Epoch 3 | Train reconstruction loss: 0.0203 |  Train regression loss: 0.0813 |  Validation reconstruction loss: 0.0193 |  Validation regression loss: 0.0813 |  Duration 4.03 sec
Epoch 4 | Train reconstruction loss: 0.0183 |  Train regression loss: 0.0758 |  Validation reconstruction loss: 0.0178 |  Validation regression loss: 0.0866 |  Duration 4.02 sec
Epoch 5 | Train reconstruction loss: 0.0169 |  Train regression loss: 0.0697 |  Validation reconstruction loss

KeyboardInterrupt: ignored

In [None]:
# model definition for the regression of HOMO-LUMO gap
class GapRegressor(nn.Module):
    """Encoder followed by a single linear layer that predicts one value.

    Args:
        encoder: optional module that maps an input batch to ``latent_dim``
            features. It may also be assigned after construction via
            ``model.encoder = ...``; it must be set before ``forward`` is
            called.
        latent_dim: number of features produced by the encoder. Defaults to
            20, the value that used to be hard-coded.
    """

    def __init__(self, encoder=None, latent_dim=20):
        super().__init__()

        # stays None until an encoder is supplied (here or by assignment)
        self.encoder = encoder

        self.regressor = nn.Sequential(
            nn.Linear(latent_dim, 1)
        )

    def forward(self, x):
        """Encode ``x`` and return the regressor output, shape ``(..., 1)``.

        Raises:
            RuntimeError: if no encoder has been set yet.
        """

        # fail with a clear message instead of "'NoneType' object is not callable"
        if self.encoder is None:
            raise RuntimeError(
                "GapRegressor.encoder is not set; pass an encoder to the "
                "constructor or assign `model.encoder` before calling forward()"
            )

        x = self.encoder(x)
        x = self.regressor(x)

        return x

In [None]:
# load encoder from pretrained model and freeze the weights
# map_location places the stored tensors on the current device while loading,
# so a checkpoint written on a GPU can also be opened on a CPU-only runtime.
# NOTE(review): this unpickles a complete model object, so only load files you
# trust. Recent PyTorch releases default torch.load to weights_only=True, in
# which case this call may additionally need weights_only=False.
combined = torch.load("model_epoch_12.pt", map_location=device)
encoder = combined.encoder
for param in encoder.parameters():
    param.requires_grad = False

# gap regressor declaration, optimizer and loss function
reg = GapRegressor()
reg.encoder = encoder
reg = reg.to(device)

# the encoder is frozen, so only the still-trainable parameters (the
# regression head) are handed to the optimizer
optim = torch.optim.Adam(
    (param for param in reg.parameters() if param.requires_grad),
    lr=1e-3,
)

loss_fn = nn.MSELoss()

In [None]:
# get dataloaders for train dataset
# function definition in task4_lib.py
# train_loader and val_loader split the dataset 80/20
# full_loader goes over the whole train dataset
# NOTE(review): the two numbers printed by this cell are presumably the sizes
# of the train and validation split - confirm in task4_lib.py
train_loader, val_loader, full_loader = get_loaders(dataset="train",batch_size=10)

80
20


In [None]:
# second train loop for train dataset, definition in task4_lib.py
# keeps track of regression loss
# full_loader is passed as both the train loader and the validation loader
# here, so training goes over the whole train dataset and the reported
# validation scores are meaningless (they are computed on the training data)
# to validate on held-out data, pass train_loader and val_loader instead
train_loop(reg, full_loader, full_loader, loss_fn, optim, device, show=100, save=5000, epochs=10000)

Epoch 0 | Train loss: 1.1409 |  Validation loss: 1.0306 |  Duration 0.03 sec
Epoch 100 | Train loss: 0.4547 |  Validation loss: 0.4541 |  Duration 0.03 sec
Epoch 200 | Train loss: 0.4049 |  Validation loss: 0.4036 |  Duration 0.02 sec
Epoch 300 | Train loss: 0.3677 |  Validation loss: 0.3674 |  Duration 0.02 sec
Epoch 400 | Train loss: 0.3399 |  Validation loss: 0.3379 |  Duration 0.02 sec
Epoch 500 | Train loss: 0.3148 |  Validation loss: 0.3137 |  Duration 0.02 sec
Epoch 600 | Train loss: 0.2927 |  Validation loss: 0.2924 |  Duration 0.02 sec
Epoch 700 | Train loss: 0.2752 |  Validation loss: 0.2738 |  Duration 0.02 sec
Epoch 800 | Train loss: 0.2581 |  Validation loss: 0.2572 |  Duration 0.03 sec
Epoch 900 | Train loss: 0.2422 |  Validation loss: 0.2414 |  Duration 0.03 sec
Epoch 1000 | Train loss: 0.2284 |  Validation loss: 0.2264 |  Duration 0.02 sec
Epoch 1100 | Train loss: 0.2138 |  Validation loss: 0.2131 |  Duration 0.02 sec
Epoch 1200 | Train loss: 0.2031 |  Validation loss: 

KeyboardInterrupt: ignored

In [None]:
# store model
# copies the checkpoint file model_epoch_5000.pt from the working directory
# to Google Drive under the name molecular_epoch_5000.pt
# NOTE(review): the file is presumably written by train_loop (save=5000), so
# that loop has to reach the checkpoint before this cell is run - confirm
!cp model_epoch_5000.pt drive/MyDrive/molecular_epoch_5000.pt