# Generate Approximate Hessians
Use the random displacement data to approximate the full Hessian matrix. We assume that many entries of the Hessian matrix are zero, and therefore that we can fit the Hessian matrix with a reduced number of points using [sparse linear regression](https://scikit-learn.org/stable/modules/linear_model.html#lasso).

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from jitterbug.model.linear import HarmonicModel
from jitterbug.model.linear import get_model_inputs
from sklearn.linear_model import LinearRegression, ElasticNetCV
from ase.vibrations import VibrationsData
from ase.db import connect
from pathlib import Path
from tqdm import tqdm
import numpy as np
import warnings

Configuration

In [2]:
# Run identifiers; they are joined into the run name used to locate the data files
molecule_name: str = 'water'
method: str = 'hf'
basis: str = 'def2-svpd'
step_size: float = 0.005 # Perturbation amount, used as maximum L2 norm
# Regressor class handed to HarmonicModel when the Hessian is fitted
regressor = LinearRegression

Derived

In [3]:
# Label shared by every file that belongs to this molecule/method/basis combination
run_name = f'{molecule_name}_{method}_{basis}'

# Database holding the randomly displaced structures for the chosen step size
db_path = Path('data', 'approx', f'{run_name}-random-d={step_size:.2e}.db')
out_dir = db_path.parent

## Read in the Data
Get all computations for the desired calculation and the exact solution

In [4]:
# Convert every row of the database into an Atoms object
with connect(db_path) as db:
    rows = db.select('')
    data = [row.toatoms() for row in rows]
print(f'Loaded {len(data)} structures')

# Preview the model inputs of the second structure relative to the first one
get_model_inputs(data[1], data[0])

Loaded 46 structures


array([ 1.35768557e+03,  5.13074732e+02, -1.78611307e+03, -3.68960372e+02,
        4.06940665e+02, -5.02709874e+02,  9.81422728e+02, -7.49646372e+02,
        2.31747899e+03,  9.21655058e+05,  6.96594162e+05, -2.42497995e+06,
       -5.00932174e+05,  5.52497470e+05, -6.82521944e+05,  1.33246348e+06,
       -1.01778406e+06,  3.14640779e+06,  1.31622840e+05, -9.16409487e+05,
       -1.89304244e+05,  2.08790973e+05, -2.57927734e+05,  5.03543203e+05,
       -3.84624612e+05,  1.18903991e+06,  1.59509996e+06,  6.59004944e+05,
       -7.26842042e+05,  8.97896679e+05, -1.75293196e+06,  1.33895319e+06,
       -4.13927952e+06,  6.80658780e+04, -1.50144979e+05,  1.85480022e+05,
       -3.62106095e+05,  2.76589804e+05, -8.55057910e+05,  8.28003524e+04,
       -2.04573091e+05,  3.99380817e+05, -3.05061593e+05,  9.43076441e+05,
        1.26358609e+05, -4.93370896e+05,  3.76854634e+05, -1.16501957e+06,
        4.81595285e+05, -7.35719987e+05,  2.27442655e+06,  2.80984842e+05,
       -1.73728972e+06,  

Read in the exact Hessian

In [5]:
# Reference vibrational data saved by the exact calculation for this run
exact_path = Path(f'data/exact/{run_name}-ase.json')
with exact_path.open() as fp:
    exact_vibs = VibrationsData.read(fp)

In [6]:
# Reference quantities taken from the exact vibrational data; the approximate
# results computed below are compared against these
exact_hess = exact_vibs.get_hessian_2d()
exact_zpe = exact_vibs.get_zero_point_energy()
exact_freqs = exact_vibs.get_frequencies()

## Fit a Hessian with All Data
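Fit a model which explains the energy data by fitting a Hessian matrix with the regressor selected in the configuration cell. Using a sparse regressor (e.g., Lasso) corresponds to compressed sensing; the current setting, `LinearRegression`, is ordinary least squares.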

In [7]:
# Build a harmonic model around the first structure and fit it to every loaded structure
model = HarmonicModel(reference=data[0], regressor=regressor)
# NOTE(review): the recorded output of this cell is a ValueError ("Input X contains
# infinity ...") shown next to an np.reciprocal of (atoms.positions - reference.positions).
# `data` still contains data[0], the reference itself, whose displacement from the
# reference is exactly zero -- presumably the source of the infinity; confirm before
# relying on any cell below.
hess_model = model.train(data)

  disp_matrix = np.reciprocal((atoms.positions - reference.positions).flatten())


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# Count the fitted coefficients and how many exceed the 1e-7 magnitude threshold
n_terms = len(hess_model.coef_)
n_nonzero = (np.abs(hess_model.coef_) > 1e-7).sum()
print(f'Trained a model with {n_terms} terms. {n_nonzero} are nonzero')

Compare the forces estimated at a zero displacement to the true value

In [None]:
# Forces stored with the first structure, which is the model's reference
actual_forces = data[0].get_forces()

# Number of structures available (shown as the cell output)
len(data)

In [None]:
# Take the leading actual_forces.size coefficients (presumably the linear, i.e. gradient,
# terms of the fit -- TODO confirm), flip the sign and arrange them as rows of three
pred_forces = -hess_model.coef_[:actual_forces.size].reshape((-1, 3))

In [None]:
# Largest absolute component among the predicted forces
print(f'Maximum force: {np.abs(pred_forces).max():.2e} eV/Angstrom')

In [None]:
# Side-by-side heat maps of the actual and the estimated forces on a shared color scale
fig, axs = plt.subplots(1, 2, figsize=(4, 2))

panels = [('Actual', actual_forces), ('Estimated', pred_forces)]
for ax, (title, forces) in zip(axs, panels):
    ax.matshow(forces, vmin=-0.05, vmax=0.05, aspect='auto', cmap='RdBu')
    ax.set_title(title, fontsize=10)

    # Hide the tick labels on both axes
    ax.set_xticklabels([])
    ax.set_yticklabels([])

fig.tight_layout()

Get the mean Hessian

In [None]:
# Hessian matrix derived from the fitted regression model
approx_hessian = model.mean_hessian(hess_model)

Compare to exact answer

In [None]:
# Top-left 3x3 block of the exact Hessian
exact_hess[:3, :3]

In [None]:
# Top-left 3x3 block of the approximate Hessian, for comparison with the exact one
approx_hessian[:3, :3]

In [None]:
# Side-by-side heat maps of the exact and the approximate Hessian on a shared color scale
fig, axs = plt.subplots(1, 2, figsize=(4, 2))

panels = [('Exact', exact_hess), ('Approximate', approx_hessian)]
for ax, (title, hessian) in zip(axs, panels):
    ax.matshow(hessian, vmin=-100, vmax=100, cmap='RdBu')
    ax.set_title(title, fontsize=10)

    # Hide the tick labels on both axes
    ax.set_xticklabels([])
    ax.set_yticklabels([])

fig.tight_layout()

Get the zero point energy

In [None]:
# Pair the approximate Hessian with the reference structure so that vibrational
# properties can be queried from it
approx_vibs = VibrationsData.from_2d(data[0], approx_hessian)

In [None]:
# Zero point energy implied by the approximate Hessian
approx_vibs.get_zero_point_energy()

In [None]:
# Zero point energy from the exact vibrational data, for comparison
exact_zpe

The two differ, but I'm not sure how important the difference is.

In [None]:
# Difference between the real parts of each approximate frequency and its exact counterpart
freq_diff = []
for approx_value, exact_value in zip(approx_vibs.get_frequencies(), exact_freqs):
    freq_diff.append(approx_value.real - exact_value.real)


In [None]:
# Frequency error as a function of the exact frequency
fig, ax = plt.subplots(figsize=(3.5, 2))

# The frequencies are handled as complex numbers (the differences above use `.real`),
# so plot the real part explicitly rather than relying on an implicit cast
ax.plot(np.real(exact_freqs), freq_diff)

ax.set_xlabel('Exact frequency')
ax.set_ylabel('Frequency difference (cm-1)')

fig.tight_layout()

## Plot as a Function of Data
See what happens as we add more data to the training set

In [None]:
# Sixteen training-set sizes, evenly spaced from 5 up to the full data set
steps = np.linspace(5, len(data), num=16, dtype=int)
preview = ", ".join(str(step) for step in steps[:5])
print(f'Plotting at {len(steps)} steps: {preview}, ...')

In [None]:
# Conversion factor applied to each zero point energy: 1 eV expressed in kcal/mol
ev_to_kcal_mol = 23.060541945329334

# Refit the model on growing subsets of the data and record, for each subset size,
# the zero point energy, the frequencies and their deviation from the exact ones
zpes = []
freqs = []
freq_diffs = []
for count in tqdm(steps):
    # Warnings raised while fitting are suppressed for the fit only
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hess_model = model.train(data[:count])

    approx_hessian = model.mean_hessian(hess_model)
    approx_vibs = VibrationsData.from_2d(data[0], approx_hessian)
    zpes.append(approx_vibs.get_zero_point_energy() * ev_to_kcal_mol)

    # Compute the frequencies once and reuse them for both records
    approx_freqs = approx_vibs.get_frequencies()
    freqs.append(approx_freqs)
    freq_diffs.append([
        app_freq.real - ex_freq.real
        for app_freq, ex_freq in zip(approx_freqs, exact_freqs)
    ])

Plot it

In [None]:
# Zero point energy versus the number of structures used for the fit
fig, ax = plt.subplots(figsize=(3.5, 2))

ax.plot(steps, zpes)

# Dashed line marks the exact value, scaled by the same eV -> kcal/mol factor as `zpes`
ax.set_xlim([0, steps.max()])
ax.plot(ax.get_xlim(), [exact_zpe*23.060541945329334]*2, 'k--')

ax.set_xlabel('Energies')
# The plotted values have been multiplied by 23.06 (kcal/mol per eV), so they are
# no longer in eV
ax.set_ylabel('ZPE (kcal/mol)')

fig.tight_layout()

We consistently underestimate the ZPE. Is it because we have too few oscillators?

In [None]:
# Frequency error curves, one per training-set size, colored from the first
# (smallest) to the last (largest) step along the jet colormap
fig, ax = plt.subplots(figsize=(5, 5))
colors = plt.cm.jet(np.linspace(0, 1, len(freq_diffs)))

# The frequencies are handled as complex numbers; use the real part as the x data
exact_real = np.real(exact_freqs)
for color, step_diff in zip(colors, freq_diffs):
    ax.plot(exact_real, step_diff, color=color)

ax.set_xlabel('Exact frequency')
ax.set_ylabel('Frequency difference (cm-1)')

fig.tight_layout()