In [8]:
# from engjax_fuctions import *
import jax.numpy as jnp
from jax.scipy.linalg import eigh
from jax import grad, jacobian
import MDAnalysis as mda
import re
import jax


In [9]:


# Extract LJ parameters from topology file
def extract_lj_parameters(path):
    with open(path, 'r') as file:
        data = file.read()
    lj_sr_pattern = r"type=(\S+).*?c6=\s*([0-9\.\-eE+]+).*?c12=\s*([0-9\.\-eE+]+)"
    lj_sr_matches = re.findall(lj_sr_pattern, data)
    lj_params = {atom_type: {"c6": float(c6), "c12": float(c12)} for atom_type, c6, c12 in lj_sr_matches}
    return lj_params

In [10]:
tpr_file = "/Users/sss/Documents/EnergyGap_project/ENGgromax_output_files/md.tpr"
xtc_file = "/Users/sss/Documents/EnergyGap_project/ENGgromax_output_files/md.xtc"
lj_contents = "/Users/sss/Documents/EnergyGap_project/ENGgromax_output_files/tpr_contents.txt"
lj_params = extract_lj_parameters(lj_contents)

In [11]:
u = mda.Universe(tpr_file, xtc_file )
print(f"{u}, unlike in the paper it reports 39768 atoms\n  However the protein atoms is {len(u.select_atoms('protein'))} same as the paper" )
time_step = u.trajectory.dt 
print(f"The time step for the trajectories is : {time_step} ps" )
print(f"The system has {u.trajectory.n_frames} frames")
print(f"The total simulation time is 81 x 5 = {len(u.trajectory)*u.trajectory.dt} ps\n   The paper reports saving a frame every 400ps which mean in this work we can only reproduce the first frame , equivilant to frame number 200.\n   However to test the code, more frames are needed, therefore, a frame every 40ps will be selected. ")
protein = u.select_atoms("protein")

protein_residues = protein.residues
print(f"The system has {len(protein_residues)} residues")
n_residues = len(protein_residues)
for residue in protein_residues:
    print(f"{ residue.resid}, {residue.resname}")

<Universe with 8339 atoms>, unlike in the paper it reports 39768 atoms
  However the protein atoms is 304 same as the paper
The time step for the trajectories is : 5.0 ps
The system has 81 frames
The total simulation time is 81 x 5 = 405.0 ps
   The paper reports saving a frame every 400ps which mean in this work we can only reproduce the first frame , equivilant to frame number 200.
   However to test the code, more frames are needed, therefore, a frame every 40ps will be selected. 
The system has 20 residues
1, ASN
2, LEU
3, TYR
4, ILE
5, GLN
6, TRP
7, LEU
8, LYS
9, ASP
10, GLY
11, GLY
12, PRO
13, SER
14, SER
15, GLY
16, ARG
17, PRO
18, PRO
19, PRO
20, SER


In [14]:


# Compute the distance matrix using residue centers of mass
def compute_distance_matrix(coords, residues):
    start_idx = 0
    coms = [jnp.mean(coords[start_idx: start_idx + len(residue.atoms)], axis=0)
            for residue in residues]
    coms = jnp.array(coms)
    distance_matrix = jax.vmap(lambda x: jax.vmap(lambda y: jnp.linalg.norm(x - y))(coms))(coms)
    return distance_matrix

# Compute interaction energy matrix
def compute_interaction_energy_matrix(distance_matrix, residues, lj_params):
    n_residues = len(residues)
    energy_matrix = jnp.zeros((n_residues, n_residues))
    conversion_factor = 138.935485

    for i in range(n_residues):
        for j in range(i + 1, n_residues):
            E_vdw = 0.0
            E_coul = 0.0
            for atom_i in residues[i].atoms:
                for atom_j in residues[j].atoms:
                    r = jnp.linalg.norm(atom_i.position - atom_j.position)
                    if r > 0:
                        type_i = atom_i.type
                        type_j = atom_j.type
                        if type_i in lj_params and type_j in lj_params:
                            c6_i = lj_params[type_i]["c6"]
                            c12_i = lj_params[type_i]["c12"]
                            c6_j = lj_params[type_j]["c6"]
                            c12_j = lj_params[type_j]["c12"]

                            sigma_i = (c12_i / c6_i)**(1/6)
                            epsilon_i = c6_i**2 / (4 * c12_i)
                            sigma_j = (c12_j / c6_j)**(1/6)
                            epsilon_j = c6_j**2 / (4 * c12_j)

                            sigma_ij = (sigma_i + sigma_j) / 2
                            epsilon_ij = jnp.sqrt(epsilon_i * epsilon_j)

                            E_vdw += 4 * epsilon_ij * ((sigma_ij / r)**12 - (sigma_ij / r)**6)

                        E_coul += conversion_factor * (atom_i.charge * atom_j.charge) / r

            total_energy = E_vdw + E_coul
            energy_matrix = energy_matrix.at[i, j].set(total_energy)
            energy_matrix = energy_matrix.at[j, i].set(total_energy)

    return energy_matrix

# Compute eigenvalues and eigenvectors
def compute_eigen(energy_matrix):
    eigenvalues, eigenvectors = eigh(energy_matrix)
    return eigenvalues, eigenvectors

# Compute ENG and SDENG from eigenvalues
def compute_eng_sdeng(eigenvalues):
    spectral_gap = eigenvalues[1] - eigenvalues[0]
    avg_separation = jnp.mean(jnp.diff(eigenvalues))
    eng_t = spectral_gap / avg_separation if avg_separation > 0 else 0.0
    sdeng_t = jnp.std(eigenvalues)
    return eng_t, sdeng_t

# Compute the CV
def compute_cv(eng_t, sdeng_t, alpha, beta):
    return alpha * eng_t - beta * sdeng_t

# Modulate weights based on probabilities
def modulate_weights_with_probability(eng_time_series, sdeng_time_series, frame_index, alpha, beta, percentages):
    eng_max = jnp.max(eng_time_series)
    sdeng_min = jnp.min(sdeng_time_series)
    msd = jnp.std(eng_time_series)

    n = percentages[frame_index % len(percentages)]
    eng_threshold = eng_max - (n / 100) * msd
    sdeng_threshold = sdeng_min + (n / 100) * msd

    eng_prob = eng_time_series[frame_index] > eng_threshold
    sdeng_prob = sdeng_time_series[frame_index] < sdeng_threshold

    if eng_prob:
        alpha *= 1.1
    if sdeng_prob:
        beta *= 0.9

    return alpha, beta

# Compute CV gradient with respect to coordinates
def compute_cv_gradient(coords, protein_residues, lj_params, alpha, beta):
    def cv_function(coords):
        distance_matrix = compute_distance_matrix(coords, protein_residues)
        energy_matrix = compute_interaction_energy_matrix(distance_matrix, protein_residues, lj_params)
        eigenvalues, _ = compute_eigen(energy_matrix)
        eng_t, sdeng_t = compute_eng_sdeng(eigenvalues)
        return compute_cv(eng_t, sdeng_t, alpha, beta)

    gradient = grad(cv_function)(coords)
    return gradient


time_step = u.trajectory.dt
selected_frames = range(0, u.trajectory.n_frames, int(40 / time_step))
percentages = list(range(0, 101, 1))

alpha, beta = 1.0, 1.0
eng_time_series = jnp.zeros(len(selected_frames))
sdeng_time_series = jnp.zeros(len(selected_frames))

for frame_idx, frame in enumerate(selected_frames):
    u.trajectory[frame]
    coords = jnp.array([atom.position for atom in protein.atoms])
    distance_matrix = compute_distance_matrix(coords, protein_residues)
    energy_matrix = compute_interaction_energy_matrix(distance_matrix, protein_residues, lj_params)
    eigenvalues, _ = compute_eigen(energy_matrix)
    eng_t, sdeng_t = compute_eng_sdeng(eigenvalues)

    eng_time_series = eng_time_series.at[frame_idx].set(eng_t)
    sdeng_time_series = sdeng_time_series.at[frame_idx].set(sdeng_t)
    cv_t = compute_cv(eng_t, sdeng_t, alpha, beta)

    gradient = compute_cv_gradient(coords, protein_residues, lj_params, alpha, beta)

    print(f"Frame {frame_idx}: ENG(T): {eng_t}, SDENG(T): {sdeng_t}, CV(T): {cv_t}, Gradient Norm: {jnp.linalg.norm(gradient)}")  #Gradient: {gradient}

    alpha, beta = modulate_weights_with_probability(
        eng_time_series, sdeng_time_series, frame_idx, alpha, beta, percentages
    )
    


Frame 0: ENG(T): 2.725583076477051, SDENG(T): 17.3066463470459, CV(T): -14.581063270568848, Gradient Norm: 0.0


KeyboardInterrupt: 

### Validation and Debugging

Next Steps:  

Folding Probability & Coarse Tuning Loop:
We need to compute the folding probability for each frame. 
This involves comparing the computed ENG(t) and SDENG(t) values to dynamic thresholds based on percentages.
Fine Tuning Loop:
Use the correlation distance (CD) to refine the parameters 
α and β, controlling ENG(t) and SDENG(t). 
This requires:  
Residue time series contributions from the eigenvector of the smallest eigenvalue.  

Computing correlation coefficients and distances between the time series of ENG(t) and residue contributions.  





### Coarse tuning Loop  
Inputs  

1. ENG(t) time series  
2. SDENG(t) time series  
3. Percentages 

Steps  

1. Compute thresholds  
2. Count frames satisfying the threshold, above eng and below sdeng  
3. calculate Pfold for each percentage  

Outputs  

1. Folded Probabilities for each percentage.  
2. Counts of frames satisfying the thresholds


# Fine Tuning Loop

1. Extract Residue Contributions  
Iterate over the frames, sum the eigenvector contributions per residue, and build the residue time series.
2. Calculate Correlation Coefficients  
Compute the correlation of each residue's time series with ENG(t).
3. Calculate Correlation Distance (CD)  
Use the formula above to compute CD for each residue.
4. Debug and Output the Results  
Verify that CD values make sense and are in the expected range [0, 2].