<!-- # CNN autoencoder and Clustering from MTRX data

Use this notebook to load Scienta Omicron Matrix format SPM data and create standardised images for machine learning training and analysis. The code can generate both JPG image data, useful for manually checking the data, and windowed numpy data that can be loaded into ML models. 

The notebook then creates an autoencoder for training on a large dataset, followed by KMEANS clustering. 

**Author**: Steven R. Schofield  
**Created**: November, 2024 -->

# CASTEP output analysis
## Steven R. Schofield (University College London) May 2025

### Load required modules

In [None]:
import sys
from pathlib import Path

# Candidate locations for the custom modules and for the CASTEP data, listed
# in order of preference; the first one that exists on this machine is used.
module_path_list = [
    Path('/Users/steven/academic-iCloud/Python/modules'),
    Path('/hpc/srs/Python/modules')
]

data_path_list = [
    Path('/Users/steven/academic-iCloud/Calculations/castep/Hydrogen_Bridge/Structure'),
    Path('/hpc/srs/Python-data')
]

# Resolve actual paths (None when none of the candidates exists)
module_path = next((p for p in module_path_list if p.exists()), None)
data_path = next((p for p in data_path_list if p.exists()), None)

# Gather every unresolved location so that both problems are reported together
_unresolved = []
if module_path is None:
    _unresolved.append(("module", module_path_list))
if data_path is None:
    _unresolved.append(("data", data_path_list))

if _unresolved:
    # Raise a descriptive exception instead of calling sys.exit(1): the error
    # names the candidates that were tried, and it is an ordinary exception
    # rather than an interpreter-exit request (SystemExit).
    _problems = []
    for _label, _candidates in _unresolved:
        _tried = ", ".join(str(p) for p in _candidates)
        _problems.append(f"Could not locate a valid {_label} path (tried: {_tried})")
    raise FileNotFoundError("; ".join(_problems))

# Add module_path to sys.path if needed, so the custom modules can be imported
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

# Print resolved paths
print(f"module_path = {module_path}")
print(f"data_path = {data_path}")

module_path = /Users/steven/academic-iCloud/Python/modules
data_path = /Users/steven/academic-iCloud/Calculations/castep/Hydrogen_Bridge/Structure


In [2]:
# Automatically reload edited modules before each cell is executed
%load_ext autoreload
%autoreload 2

# Import standard modules
import numpy as np
import pandas as pd

# Import custom module
import SRSCALCUTILS.castep_tools as ct

from IPython.display import display, Image as StaticImage



### Programme variable setup

In [11]:
# Job selection and reporting options
job_folder = '1x6x8'        # name of the job sub-folder inside data_path
verbose = False             # switch to True for more detailed printout

# Full path to the selected job folder
job_path = data_path.joinpath(job_folder)

# Report the resolved job path
print(f"job_path = {job_path}")

job_path = /Users/steven/academic-iCloud/Calculations/castep/Hydrogen_Bridge/Structure/1x6x8


In [10]:
# Collect the paths of the files under job_path that have the 'castep' extension.
# NOTE(review): whether the search is recursive, and the ordering of the result,
# are decided inside castep_tools — confirm there.
castep_paths = ct.find_all_files_by_extension(job_path,'castep')
# Disabled call, kept for reference: pass the collected paths to
# ct.optimisation_summaries (uncomment to run it).
#ct.optimisation_summaries(castep_paths)


In [13]:
# Build the summary table for the jobs found under job_path
# (presumably a pandas DataFrame — confirm against castep_tools).
data_summary = ct.collect_summary_table(job_path)
# Temporarily lift the row/column display limits so the whole table is shown;
# the previous pandas display options are restored when the block exits.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_summary)

Unnamed: 0,File,RelPath,nx,ny,nz,kx,ky,kz,Cut-off (eV),Net Charge,Net Spin,Final Enthalpy (eV)
0,si168_Hb_n,neu/hb,1,6,8,8,2,1,750.0000 eV,0.0,1.0,-8479.53533
1,si168_di_n,neu/di,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8558.88014
2,si168_1di_n,neu/1di,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8496.75935
3,si168_Ob_n,neu/ob,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8905.88682
4,si168_Hbb_n,neu/hbb,1,6,8,8,2,1,750.0000 eV,0.0,1.0,-8479.64395
5,si168_clean_n,neu/clean,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8363.3547
6,si168_int_n,neu/int,1,6,8,8,2,1,750.0000 eV,0.0,1.0,-8479.19578
7,si168_mono_n,neu/mono,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8464.8638
8,si168_DB_n,neu/db,1,6,8,8,2,1,750.0000 eV,0.0,1.0,-8447.88638
9,si168_3x1_n,neu/3x1,1,6,8,8,2,1,750.0000 eV,0.0,0.0,-8496.83079


task                  : geomopt
xc_functional         : PBE
cut_off_energy        : 750
spin_polarised        : true
write_cell_structure  : true
charge                : 0
max_scf_cycles        : 300


In [18]:
# Extract the atomic positions from the first file in castep_paths.
# NOTE(review): which job this is depends on the order returned by
# find_all_files_by_extension, and an empty list raises IndexError here.
atoms = ct.extract_final_fractional_positions(castep_paths[0])

In [19]:
# Show the extracted positions: one (element, x, y, z) tuple per atom
atoms

[('H', 0.0, 0.090772, 0.417637),
 ('H', 0.0, 0.241662, 0.416745),
 ('H', 0.0, 0.389988, 0.415838),
 ('H', 0.0, 0.610012, 0.415838),
 ('H', 0.0, 0.758338, 0.416745),
 ('H', 0.0, 0.909228, 0.417637),
 ('H', 0.0, 0.5, 0.381127),
 ('Si', 0.0, 0.113965, 0.363497),
 ('Si', 0.0, 0.218002, 0.36282),
 ('Si', 0.0, 0.425661, 0.367508),
 ('Si', 0.0, 0.574339, 0.367508),
 ('Si', 0.0, 0.781998, 0.36282),
 ('Si', 0.0, 0.886035, 0.363497),
 ('Si', 0.5, 0.0877, 0.316442),
 ('Si', 0.5, 0.243942, 0.315732),
 ('Si', 0.5, 0.416585, 0.317238),
 ('Si', 0.5, 0.583415, 0.317238),
 ('Si', 0.5, 0.756058, 0.315732),
 ('Si', 0.5, 0.9123, 0.316442),
 ('Si', 0.5, 0.0, 0.268355),
 ('Si', 0.5, 0.165614, 0.257967),
 ('Si', 0.5, 0.33108, 0.266759),
 ('Si', 0.5, 0.5, 0.263454),
 ('Si', 0.5, 0.66892, 0.266759),
 ('Si', 0.5, 0.834386, 0.257967),
 ('Si', 0.0, 0.0, 0.214144),
 ('Si', 0.0, 0.166034, 0.206877),
 ('Si', 0.0, 0.332189, 0.213084),
 ('Si', 0.0, 0.5, 0.21085),
 ('Si', 0.0, 0.667811, 0.213084),
 ('Si', 0.0, 0.833966