# DeepLDA: Alanine dipeptide and aldol reaction

Reference paper: _Bonati, Rizzi and Parrinello, [JCPL](https://pubs.acs.org/doi/10.1021/acs.jpclett.0c00535) (2020)_ [[arXiv]](https://arxiv.org/abs/2002.06562).

Prerequisite: DeepLDA tutorial.

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/luigibonati/mlcolvar/blob/main/docs/notebooks/examples/ex_DeepLDA.ipynb)

## Setup

In [4]:
# Colab setup: when the Colab release tag is present in the environment,
# download the mlcolvar setup script and run it; otherwise do nothing.
import os

if os.environ.get("COLAB_RELEASE_TAG"):
    import subprocess

    # Fetch the setup script from the mlcolvar repository
    subprocess.run('wget https://raw.githubusercontent.com/luigibonati/mlcolvar/main/colab_setup.sh', shell=True)

    # Execute it and echo its captured stdout into the cell output
    cmd = subprocess.run('bash colab_setup.sh EXAMPLE', shell=True, stdout=subprocess.PIPE)
    print(cmd.stdout.decode('utf-8'))

# IMPORT PACKAGES
import torch
import lightning
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset
from mlcolvar.data import DictDataset

# Set seed for reproducibility.
# Only PyTorch's random number generator is seeded here; NumPy is not.
torch.manual_seed(41)

<torch._C.Generator at 0x7f9dbdb791f0>

## Alanine dipeptide

### Load data

We use the alanine dipeptide simulation data from the [PLUMED-MASTERCLASS](https://github.com/luigibonati/masterclass-plumed/) repository.

In [5]:
from mlcolvar.utils.io import create_dataset_from_files, load_dataframe
from mlcolvar.data import DictModule
from mlcolvar.utils.timelagged import create_timelagged_dataset

# One COLVAR file per state (n_states is simply the number of files)
filenames = [
    "https://raw.githubusercontent.com/luigibonati/masterclass-plumed/main/1_DeepLDA/0_unbiased-sA/COLVAR",
    "https://raw.githubusercontent.com/luigibonati/masterclass-plumed/main/1_DeepLDA/0_unbiased-sB/COLVAR",
]
n_states = len(filenames)

# Load the files; the labelled dataset returned here is replaced below by the
# time-lagged one, so only the dataframe is actually reused.
dataset, df = create_dataset_from_files(
    filenames,
    filter_args={'regex': 'd_'},  # select distances between heavy atoms
    create_labels=True,
    return_dataframe=True,
)

# Build the time-lagged dataset from the distance columns and split it 80/20
X = df.filter(regex='d_').values
dataset = create_timelagged_dataset(X, lag_time=1)
datamodule = DictModule(dataset, lengths=[0.8, 0.2])

Class 0 dataframe shape:  (5001, 53)
Class 1 dataframe shape:  (5001, 53)

 - Loaded dataframe (10002, 53): ['time', 'phi', 'psi', 'theta', 'xi', 'ene', 'd_2_5', 'd_2_6', 'd_2_7', 'd_2_9', 'd_2_11', 'd_2_15', 'd_2_16', 'd_2_17', 'd_2_19', 'd_5_6', 'd_5_7', 'd_5_9', 'd_5_11', 'd_5_15', 'd_5_16', 'd_5_17', 'd_5_19', 'd_6_7', 'd_6_9', 'd_6_11', 'd_6_15', 'd_6_16', 'd_6_17', 'd_6_19', 'd_7_9', 'd_7_11', 'd_7_15', 'd_7_16', 'd_7_17', 'd_7_19', 'd_9_11', 'd_9_15', 'd_9_16', 'd_9_17', 'd_9_19', 'd_11_15', 'd_11_16', 'd_11_17', 'd_11_19', 'd_15_16', 'd_15_17', 'd_15_19', 'd_16_17', 'd_16_19', 'd_17_19', 'walker', 'labels']
 - Descriptors (10002, 45): ['d_2_5', 'd_2_6', 'd_2_7', 'd_2_9', 'd_2_11', 'd_2_15', 'd_2_16', 'd_2_17', 'd_2_19', 'd_5_6', 'd_5_7', 'd_5_9', 'd_5_11', 'd_5_15', 'd_5_16', 'd_5_17', 'd_5_19', 'd_6_7', 'd_6_9', 'd_6_11', 'd_6_15', 'd_6_16', 'd_6_17', 'd_6_19', 'd_7_9', 'd_7_11', 'd_7_15', 'd_7_16', 'd_7_17', 'd_7_19', 'd_9_11', 'd_9_15', 'd_9_16', 'd_9_17', 'd_9_19', 'd_11_15



In [2]:
# Inspect the time-lagged dataset: its summary, then the weights stored for
# the samples ("weights") and for their lagged counterparts ("weights_lag").
print(dataset)
print(dataset["weights"])
print(dataset["weights_lag"])

DictDataset( "data": [9998, 45], "data_lag": [9998, 45], "weights": [9998], "weights_lag": [9998] )
tensor([1., 1., 1.,  ..., 1., 1., 1.])
tensor([1., 1., 1.,  ..., 1., 1., 1.])


### Train DeepTICA

In [6]:
from mlcolvar.cvs import DeepTICA

# Number of collective variables to keep after TICA
n_components = 1

# Network sizes: 45 input descriptors -> 30 -> 30 -> 3 outputs
nn_layers = [45, 30, 30, 3]

# Use tanh activations in the feed-forward network
options = {
    'nn': {'activation': 'tanh'},
}

model = DeepTICA(nn_layers, n_cvs=n_components, options=options)

# Show the model architecture
model

DeepTICA(
  (loss_fn): ReduceEigenvaluesLoss()
  (norm_in): Normalization(in_features=45, out_features=45, mode=mean_std)
  (nn): FeedForward(
    (nn): Sequential(
      (0): Linear(in_features=45, out_features=30, bias=True)
      (1): Tanh()
      (2): Linear(in_features=30, out_features=30, bias=True)
      (3): Tanh()
      (4): Linear(in_features=30, out_features=3, bias=True)
    )
  )
  (tica): TICA(in_features=3, out_features=1)
)

In [7]:
class CL_dataset(Dataset):
    """Dataset pairing each sample with two augmented views and a temperature.

    NOTE: the class name and attribute names are kept as-is because the
    dataset file loaded later in this notebook is presumably a pickled
    instance of this class, and unpickling needs matching names.

    Args:
        data_list: indexable container of samples; must expose ``.shape``
            (its first dimension is used as the dataset length).
        data_augmented_list: indexable container of augmented samples.
        data_augmented_hard_list: indexable container of "hard" augmented samples.
        temperature_list: indexable container with one temperature per sample.
    """

    def __init__(
        self,
        data_list,
        data_augmented_list,
        data_augmented_hard_list,
        temperature_list,
    ):
        super().__init__()
        self.x = data_list
        self.x_augmented = data_augmented_list
        self.x_augmented_hard = data_augmented_hard_list
        self.temperature = temperature_list

    def __getitem__(self, index):
        """Return ``(x, x_augmented, x_augmented_hard, temperature)`` at ``index``."""
        return (
            self.x[index],
            self.x_augmented[index],
            self.x_augmented_hard[index],
            self.temperature[index],
        )

    def __len__(self):
        """Return the number of samples (first dimension of ``x``)."""
        return self.x.shape[0]
 

In [9]:
# Load the pre-built dataset object (presumably a pickled CL_dataset, so that
# class must already be defined in this kernel).
# weights_only=False is passed explicitly: PyTorch >= 2.6 defaults to
# weights_only=True, which refuses to unpickle custom classes.
# NOTE(security): a full pickle load can execute arbitrary code; only load
# files that you created yourself or otherwise trust.
custom_dataset = torch.load(
    "../../data/dataset/alanine/300.0/v3/cl-distance.pt",
    weights_only=False,
)

# Unit weights, one per sample, for the data and for the paired "lagged" data
custom_weights = torch.ones(custom_dataset.x.shape[0], dtype=torch.float32)
custom_weights_lag = torch.ones(custom_dataset.x.shape[0], dtype=torch.float32)

In [10]:
# Same keys as the time-lagged dataset above, with the augmented samples
# stored under "data_lag" in place of time-lagged configurations.
new_dataset = DictDataset(
    {
        "data": custom_dataset.x,
        "data_lag": custom_dataset.x_augmented,
        "weights": custom_weights,
        "weights_lag": custom_weights_lag,
    }
)

# 80/20 split; this replaces the datamodule built from the COLVAR files
datamodule = DictModule(new_dataset, lengths=[0.8, 0.2])

#### Train DeepTICA

Define trainer and fit

In [11]:
# Display the datamodule summary (dataset shapes and train/validation loaders)
datamodule

DictModule(dataset -> DictDataset( "data": [400000, 45], "data_lag": [400000, 45], "weights": [400000], "weights_lag": [400000] ),
		     train_loader -> DictLoader(length=0.8, batch_size=0, shuffle=True),
		     valid_loader -> DictLoader(length=0.2, batch_size=0, shuffle=True))

In [None]:
import lightning

from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from mlcolvar.utils.trainer import MetricsCallback

# Callbacks: collect metrics during training, and stop once "valid_loss" has
# not improved by at least min_delta for `patience` consecutive checks.
metrics = MetricsCallback()
early_stopping = EarlyStopping(monitor="valid_loss", min_delta=1e-5, patience=20)

# Trainer with checkpointing disabled.
# NOTE(review): max_epochs=None and logger=None presumably fall back to
# Lightning's defaults; confirm that is the intended behaviour.
trainer = lightning.Trainer(
    callbacks=[metrics, early_stopping],
    max_epochs=None,
    logger=None,
    enable_checkpointing=False,
)

# Fit the model on the data provided by the datamodule
trainer.fit(model, datamodule)

In [13]:
# Save only the model's state dict (parameters), not the full module object
torch.save(model.state_dict(), 'deeptica-v4.pt')

v1: original dataset

v2: ?? (TODO: document what changed in this version)

v3: hard augmented as time lag

v4: augmented as time lag, POSITIVE_SAMPLES