<!-- # CNN autoencoder and Clustering from MTRX data

Use this notebook to load Scienta Omicron Matrix format SPM data and create standardised images for machine learning training and analysis. The code can generate both JPG image data, useful for manually checking the data, and windowed numpy data that can be loaded into ML models. 

The notebook then creates an autoencoder for training on a large dataset, followed by KMEANS clustering. 

**Author**: Steven R. Schofield  
**Created**: November, 2024 -->

# CASTEP - Si 001 surface
## Steven R. Schofield (University College London) May 2025

### Determine appropriate paths whether we are working on macbook or the cluster

In [None]:
import sys
from pathlib import Path

# Candidate locations of the custom Python modules (laptop first, then cluster)
module_path_list = [
    Path('/Users/steven/academic-iCloud/Python/modules'),
    Path('/hpc/srs/Python/modules')
]

# Candidate locations of the CASTEP calculation data, in the same order
data_path_list = [
    Path('/Users/steven/academic-iCloud/Calculations/castep/'),
    Path('/hpc/srs/castep')
]

# Keep the first candidate that exists on this machine (None if none does)
module_path = next(filter(Path.exists, module_path_list), None)
data_path = next(filter(Path.exists, data_path_list), None)

# Report every missing location before giving up, so both problems are visible
if module_path is None:
    print("Error: Could not locate a valid module path.")
if data_path is None:
    print("Error: Could not locate a valid data path.")

# Nothing below can work without both paths
if None in (module_path, data_path):
    sys.exit(1)

# Make the custom modules importable, without duplicating the entry on re-runs
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

# Show what was resolved
print(f"module_path = {module_path}")
print(f"data_path = {data_path}")

### Import modules

In [None]:
# # Ensure modules are reloaded 
%load_ext autoreload
%autoreload 2

# Import standard modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt

# Import custom module
import SRSCALCUTILS.castep_tools as ct

from IPython.display import display

### Paths for the calculation

In [None]:
# General settings for this job
job_folder = 'work-2025/si001_surf'
verbose = False             # Set this True to print out more information
job_path = data_path / job_folder

# Show the resolved job directory
print(f"job_path = {job_path}")

# Create the job directory (and any missing parents); a no-op if it already exists
os.makedirs(job_path, exist_ok=True)

# Work from inside the job directory from here on
os.chdir(job_path)

# List whatever is already in the job directory
print('Current files in this directory:')
for file in os.listdir():
    print(file)

### Base parameters of calculation and unit cell

In [None]:
# Inputs for the param and cell files of the calculation

filename = "si001_surf"
title = "silicon 001 surface tests"

# Settings handed to ct.write_param_file
params = dict(
    task='geomopt',
    xc_functional='PBE',
    cut_off_energy=400,
    spin_polarised='false',
    write_cell_structure='true',
    charge=0,
    nextra_bands=8,
    geom_energy_tol=0.00005,        # default: 0.00005 eV
    geom_disp_tol=0.002,            # default: 0.002 Ang.
    geom_max_iter=100,              # default: 30
    geom_convergence_win=2,         # default: 2
    max_scf_cycles=300,             # default: 30
)

# Lattice vectors as rows; this cell is diagonal (a = b, c along the third axis)
lattice_cart = np.diag([3.8641976, 3.8641976, 5.4648012])

# One row per atom: [element symbol, frac_x, frac_y, frac_z]
positions_frac = np.array([
    ['Si', 0.0, 0.0, 0.0],
    ['Si', 0.5, 0.0, 0.25],
    ['Si', 0.5, 0.5, 0.5],
    ['Si', 0.0, 0.5, 0.75],
])

# 2 x 3 block of zeros, passed to ct.write_cell_file as cell_constraints
cell_constraints = np.zeros((2, 3), dtype=int)

# Monkhorst-Pack grid, passed to ct.write_cell_file as kpoints_mp_grid
kpoints_mp_grid = np.full(3, 2)

# Remaining switches forwarded to ct.write_cell_file
fix_all_ions = False
symmetry_generate = False
symmetry_tol = 0.01


### Build supercell

In [None]:
# Repeat counts handed to the supercell builder, one entry per lattice direction
n = [2, 2, 1]

# Expand the unit cell; returns the new fractional positions and lattice vectors
super_positions_frac, super_lattice_cart = ct.create_supercell_from_fractional_coords(
    positions_frac,
    lattice_cart,
    n,
)

In [None]:
import numpy as np

def select_atoms_by_plane(positions_frac, lattice_cart, axis, ref_value,
                          tolerance=0.0, include=None, exclude=None):
    """
    Select atoms near a specific plane perpendicular to a given axis.

    An atom is selected when the Cartesian component of its position along
    ``axis`` lies within ``tolerance`` of ``ref_value``. Explicit ``include``
    and ``exclude`` indices override that test; ``exclude`` wins if an atom
    appears in both.

    Parameters
    ----------
    positions_frac : array-like, shape (N, 4)
        Rows of [element_symbol, frac_x, frac_y, frac_z].
    lattice_cart : array-like, shape (3, 3)
        Cartesian lattice vectors (rows are unit cell vectors in Å).
    axis : {'x', 'y', 'z'}
        Axis perpendicular to the plane (e.g. 'z' selects x-y planes).
        Case-insensitive.
    ref_value : float or numeric string
        Cartesian coordinate along the given axis in Å defining the plane.
    tolerance : float or numeric string, optional
        Distance tolerance in Å (default=0.0). Atoms with
        |coord - ref_value| <= tolerance are selected.
    include : iterable of index specifiers, optional
        Atoms (1-based) to force include. Each item may be:

        * an integer (Python or NumPy) -- a single atom index;
        * a ``range`` -- its values are taken as 1-based indices;
        * a ``slice`` -- 1-based with an inclusive stop; a missing start
          means 1, a missing stop means the last atom, and a positive step
          is honoured;
        * a 2-tuple ``(start, end)`` -- 1-based, both ends inclusive.

        A single integer or a single ``slice`` may also be passed directly
        instead of a list. Indices outside 1..N are ignored.
    exclude : iterable of index specifiers, optional
        Atoms (1-based) to force exclude; same forms as ``include``.

    Returns
    -------
    result : list of lists, shape (N, 6)
        Each entry: [index (1-based), is_selected (bool), element_symbol,
                     frac_x, frac_y, frac_z].

    Raises
    ------
    ValueError
        If ``axis`` is not 'x', 'y' or 'z', if ``ref_value`` or ``tolerance``
        cannot be converted to float, or if an index specifier has an
        unsupported form.
    """
    # Local import keeps this notebook cell self-contained
    import numbers

    # Split the rows into symbols and numeric fractional coordinates
    arr = np.array(positions_frac, dtype=object)
    symbols = arr[:, 0]
    frac = arr[:, 1:].astype(float)
    lattice = np.array(lattice_cart, dtype=float)
    n_atoms = len(arr)

    def build_index_set(spec):
        """Expand a 1-based index specification into a set of 0-based indices."""
        # Compare against None explicitly: `spec or []` fails on NumPy arrays
        if spec is None:
            return set()
        # Allow a lone integer or slice without wrapping it in a list
        if isinstance(spec, (slice, numbers.Integral)):
            spec = [spec]

        indices = set()
        for item in spec:
            # numbers.Integral also covers NumPy integers (e.g. from np.where)
            if isinstance(item, numbers.Integral):
                indices.add(int(item) - 1)
            elif isinstance(item, range):
                indices.update(i - 1 for i in item)
            elif isinstance(item, slice):
                start = item.start or 1
                stop = item.stop or n_atoms
                step = item.step or 1
                # 1-based inclusive stop == 0-based exclusive stop
                indices.update(range(start - 1, stop, step))
            elif isinstance(item, tuple) and len(item) == 2:
                first, last = item
                indices.update(range(first - 1, last))
            else:
                raise ValueError(f"Invalid index specifier: {item}")
        return indices

    include_set = build_index_set(include)
    exclude_set = build_index_set(exclude)

    # Convert fractional to Cartesian coordinates (rows of lattice are vectors)
    cart = frac.dot(lattice)

    # Map axis to column
    axis_map = {'x': 0, 'y': 1, 'z': 2}
    try:
        ai = axis_map[axis.lower()]
    except KeyError:
        raise ValueError("Axis must be one of 'x', 'y', or 'z'") from None

    # Reference and tolerance as floats (numeric strings are accepted)
    try:
        ref = float(ref_value)
        tol = float(tolerance)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(
            "ref_value and tolerance must be numeric or numeric strings."
        ) from None

    # Distance test along the chosen axis
    near_plane = np.abs(cart[:, ai] - ref) <= tol

    def is_selected(idx):
        """Apply the overrides: exclude beats include beats the distance test."""
        if idx in exclude_set:
            return False
        if idx in include_set:
            return True
        return bool(near_plane[idx])

    return [
        [idx + 1, is_selected(idx), atom,
         float(fcoords[0]), float(fcoords[1]), float(fcoords[2])]
        for idx, (atom, fcoords) in enumerate(zip(symbols, frac))
    ]

In [41]:
def find_plane_value(positions_frac, lattice_cart, axis, criteria):
    """
    Determine the plane coordinate along an axis based on criteria.

    The Cartesian value is computed from the atoms' true Cartesian components
    (``frac @ lattice``), the same measure ``select_atoms_by_plane`` compares
    against, so the result is a valid ``ref_value`` for any cell shape. For a
    cell whose lattice vectors lie along the Cartesian axes this equals
    ``plane_frac`` times the cell length.

    Parameters
    ----------
    positions_frac : array-like, shape (N, 4)
        Rows of [element_symbol, frac_x, frac_y, frac_z]; N must be >= 1.
    lattice_cart : array-like, shape (3, 3)
        Cartesian lattice vectors (rows are unit cell vectors in Å).
    axis : {'x', 'y', 'z'}
        Axis perpendicular to the plane. Case-insensitive.
    criteria : {'minimum', 'maximum', 'centre'}
        'minimum' returns the smallest coordinate, 'maximum' the largest,
        'centre' the midpoint between min and max.

    Returns
    -------
    plane_cart : float
        Cartesian coordinate in Å of the plane along ``axis``.
    plane_frac : float
        The same criterion applied to the atoms' fractional coordinates
        along the matching lattice direction.

    Raises
    ------
    ValueError
        If ``positions_frac`` has no rows or is not two-dimensional, if
        ``axis`` is not 'x', 'y' or 'z', or if ``criteria`` is not one of the
        supported names.
    """
    arr = np.array(positions_frac, dtype=object)
    # Fail early with a clear message instead of an IndexError further down
    if arr.ndim != 2 or arr.shape[0] == 0:
        raise ValueError(
            "positions_frac must contain at least one [symbol, x, y, z] row"
        )
    frac = arr[:, 1:].astype(float)
    lattice = np.array(lattice_cart, dtype=float)

    # Map axis to index
    axis_map = {'x': 0, 'y': 1, 'z': 2}
    try:
        ai = axis_map[axis.lower()]
    except KeyError:
        raise ValueError("Axis must be one of 'x', 'y', or 'z'") from None

    # One reducer per criterion, applied to both coordinate systems below
    reducers = {
        'minimum': lambda values: values.min(),
        'maximum': lambda values: values.max(),
        'centre': lambda values: (values.min() + values.max()) / 2.0,
    }
    try:
        reduce_coords = reducers[criteria]
    except (KeyError, TypeError):
        # TypeError covers unhashable criteria such as a list
        raise ValueError(
            "Criteria must be 'minimum', 'maximum', or 'centre'"
        ) from None

    # Fractional value along the lattice direction
    plane_frac = reduce_coords(frac[:, ai])

    # Cartesian value from the real Cartesian components, not frac * |vector|,
    # which is wrong whenever the lattice vector is tilted or reversed
    plane_cart = reduce_coords(frac.dot(lattice)[:, ai])

    return float(plane_cart), float(plane_frac)


(4.0986009, 0.2650277133856722)

In [46]:
# Plane definition: mid-plane of the slab along z, with a 0.5 Å window
axis = 'z'
criteria = "centre"
tolerance = 0.5

# Only the Cartesian plane position is needed as the reference value
ref_value, _ = find_plane_value(super_positions_frac, super_lattice_cart, axis, criteria)

# Last expression of the cell, so the notebook displays the selection table
select_atoms_by_plane(
    super_positions_frac,
    super_lattice_cart,
    axis,
    ref_value,
    tolerance=tolerance,
    include=None,
    exclude=None,
)

[[1, False, 'Si', 0.5, 0.75, 0.2650277133856722],
 [2, False, 'Si', 0.0, 0.75, 0.2650277133856722],
 [3, False, 'Si', 0.5, 0.25, 0.2650277133856722],
 [4, False, 'Si', 0.0, 0.25, 0.2650277133856722],
 [5, False, 'Si', 0.75, 0.75, 0.17668514225711482],
 [6, False, 'Si', 0.25, 0.75, 0.17668514225711482],
 [7, False, 'Si', 0.75, 0.25, 0.17668514225711482],
 [8, False, 'Si', 0.25, 0.25, 0.17668514225711482],
 [9, False, 'Si', 0.75, 0.5, 0.08834257112855741],
 [10, False, 'Si', 0.25, 0.5, 0.08834257112855741],
 [11, False, 'Si', 0.75, 0.0, 0.08834257112855741],
 [12, False, 'Si', 0.25, 0.0, 0.08834257112855741],
 [13, False, 'Si', 0.5, 0.5, 0.0],
 [14, False, 'Si', 0.0, 0.5, 0.0],
 [15, False, 'Si', 0.5, 0.0, 0.0],
 [16, False, 'Si', 0.0, 0.0, 0.0]]

### Add vacuum spacing

In [None]:
# Vacuum amount handed to the helper (presumably in Å -- confirm in castep_tools)
vac = 10

# Replace the supercell positions and lattice with the vacuum-padded versions
super_positions_frac, super_lattice_cart = ct.create_vacuum_spacing(
    super_positions_frac,
    super_lattice_cart,
    vac,
)

### Add ionic constraints

In [None]:
# Build the fixed-ion constraints from a region condition
conditions = "z < 2.5"

# No manual overrides of the region-based selection
include = None
exclude = None

ionic_constraints = ct.select_atoms_by_region(
    super_positions_frac,
    super_lattice_cart,
    conditions,
    exclude=exclude,
    include=include,
)
print(ionic_constraints)

In [None]:
# Seed name for this run; the .castep output path is derived from it
filename = "si001_test"
castep_path = job_path / f"{filename}.castep"

# Banner announcing which CASTEP file this run refers to
print('=' * 80 + f'\nCASTEP file: {castep_path}\n' + '=' * 80 + '\n')

# Write the .param file (display_file=True is requested so the file is shown)
param_filename = ct.write_param_file(
    params,
    title=title,
    filename=filename,
    path=job_path,
    display_file=True,
)

# Write the .cell file for the supercell built in the cells above
cell_filename = ct.write_cell_file(
    title=title,
    path=job_path,
    filename=filename,
    lattice_cart=super_lattice_cart,
    positions_frac=super_positions_frac,
    cell_constraints=cell_constraints,
    ionic_constraints=ionic_constraints,
    fix_all_ions=fix_all_ions,
    symmetry_generate=symmetry_generate,
    symmetry_tol=symmetry_tol,
    kpoints_mp_grid=kpoints_mp_grid,
    display_file=True,
)

In [11]:
# Run CASTEP on the input files written for `filename`.
# The resolved module path is used to tell the two machines apart:
# '/hpc/srs/Python/modules' is the cluster candidate from the path-setup cell.
if str(module_path) == '/hpc/srs/Python/modules':
    # Cluster: MPI launch with 62 processes from a bash login shell
    !bash -l -c "mpirun -np 62 castep.mpi {filename}"
else:
    # Otherwise: zsh login shell; `castepmpi` is presumably a local
    # alias/wrapper defined in the shell profile -- TODO confirm
    !zsh -l -c "castepmpi {filename}"


^C
Abort is in progress...hit ctrl-c again to forcibly terminate



In [None]:

# unit_cell, a, b, c, alpha, beta, gamma = ct.get_final_lattice_parameters(castep_path)
# energy_optimisation = ct.get_LBFGS_energies(castep_path)
# energies = [val for _, val in energy_optimisation]
# energy = energies[-1]

# print('Optimised energy {} eV'.format(energy))
# print('Optimised lattice constants a,b = {} Ang., c = {} Ang.'.format(a,c))
# print('Unit cell:')
# for line in unit_cell:
#     print(' '*2,np.abs(line))
