# File to test gap_modules - simon 

## Importing modules and classes

In [None]:
# Using the model with the simplest descriptor, 2b - two body distance
from gap_models2 import GAPModel
from gap_models2 import distance_2b

## Creating instances of models and descriptors

In [None]:
# Create the two-body distance descriptor; all arguments are positional and
# are forwarded to distance_2b in the order shown.
descriptor_2b = distance_2b(
    4,
    'ard_se',
    0.5,
    1.0,
    'uniform',
    'T',
    10,
)

In [None]:
# Create the GAPModel instance; its train(), predict(), get_potential_file()
# and get_prediction_file() methods are used in the cells below.
model = GAPModel()


## Training the model

In [None]:
# Training data file
t_data = "/Users/simon/simon_ml/GAP/hydrogen_md.xyz"

# Parameter string of the 2b descriptor, passed to model.train below
d2b_parameters = descriptor_2b.get_parameter_string()

# Train the model, passing 'testGAP.xml' as the GAP_potential argument
model.train(
    d2b_parameters,
    training_data=t_data,
    GAP_potential='testGAP.xml',
)

# Show the file in which the potential is stored (last expression of the cell)
model.get_potential_file()





## Making predictions 

In [None]:
# File name in which the predictions should be stored
output = 'quip_prediction.xyz'

# Potential file reported by the model
GAP_potential = model.get_potential_file()

# Configurations for which the energy should be predicted
test_data = "/Users/simon/simon_ml/GAP/hydrogen_md.xyz"

# Run the prediction with the three values selected above
model.predict(
    Test_Data=test_data,
    GAP_potential=GAP_potential,
    QUIP_Prediction=output,
)



In [None]:
# Show the value returned by get_prediction_file(); the plotting cells below
# pass this same value to ase.io.read as the file to load.
model.get_prediction_file()

## Plot real values against predicted values to see quality of the fit

In [None]:
import ase.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read all configurations (':' selects every frame) from the reference data
# and from the file returned by model.get_prediction_file().
train_atoms = ase.io.read(t_data, ':')
test_atoms = ase.io.read(model.get_prediction_file(), ':')

# Real energies go on the x axis, predicted energies on the y axis.
train_energy = [at.get_potential_energy() for at in train_atoms]
predicted_energy = [at.get_potential_energy() for at in test_atoms]

# `import matplotlib.pyplot as plt` binds only the name `plt`; the bare name
# `matplotlib` is not defined here, so the scatter call has to go through `plt`.
# NOTE(review): the last configuration is dropped from both series, exactly as
# in the original cell -- confirm that this is intentional.
energy_plot = plt.scatter(train_energy[:-1], predicted_energy[:-1])



In [None]:
# Plot real forces on the x axis and predicted forces on the y axis
train_atoms = ase.io.read(t_data, ':')
test_atoms = ase.io.read(model.get_prediction_file(), ':')

# Collect the force vector of every atom in every configuration (no filtering
# by species is applied). Real forces come from get_forces(); predicted forces
# are read from the 'force' array of the matching predicted configuration.
train_force, test_force = [], []
for at_train, at_test in zip(train_atoms, test_atoms):
    # get_forces() is called once per configuration rather than once per atom;
    # extend() appends the per-atom rows in order.
    train_force.extend(at_train.get_forces())
    test_force.extend(at_test.arrays['force'])

# pyplot is available under the alias `plt` (imported in the energy-plot cell);
# the bare name `matplotlib` is never bound, so `matplotlib.pyplot.scatter`
# would raise NameError.
force_plot = plt.scatter(train_force, test_force)


## Splitting up the Data into training data and test data


In [None]:
from gap_models2 import Split

In [None]:
# Fraction of the data used for training, and the data file that is to be split
train_percentage = 0.8
data_file = "/Users/simon/simon_ml/GAP/hydrogen_md.xyz"

# Create the split object from the data file and the training percentage
split = Split(data_file, train_percentage)

In [None]:
# Show the percentages held by the split object (last expression of the cell,
# so the notebook displays the returned value)
split.get_percentages()

In [None]:
# Split up the data. The output file names are passed explicitly in order to
# avoid overwriting files later.
test_data_file = 'test_data_file.xyz'
train_data_file = 'train_data_file.xyz'
split.split(train_data_file, test_data_file)

In [None]:
# Get the split data: the first returned value is bound to `train`, the
# second to `test`
train,test = split.get_splitted_data()

In [None]:
# Get the names of the data files containing the split data. These names are
# what model.train (training_data=...) and model.predict (Test_Data=...)
# take as input.
train_data_file, test_data_file = split.get_splitted_data_files()