<!-- # CNN autoencoder and Clustering from MTRX data

Use this notebook to load Scienta Omicron Matrix format SPM data and create standardised images for machine learning training and analysis. The code can generate both JPG image data, useful for manually checking the data, and windowed numpy data that can be loaded into ML models. 

The notebook then creates an autoencoder for training on a large dataset, followed by KMEANS clustering. 

**Author**: Steven R. Schofield  
**Created**: November, 2024 -->

# CASTEP - Si 001 surface
## Steven R. Schofield (University College London) May 2025

### Determine appropriate paths whether we are working on macbook or the cluster

In [44]:
import sys
from pathlib import Path

# Candidate locations in priority order: the local macbook first, then the
# cluster. The first entry that exists on the current machine is used.
module_path_list = [
    Path('/Users/steven/academic-iCloud/Python/modules'),
    Path('/hpc/srs/Python/modules'),
]
data_path_list = [
    Path('/Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates'),
    Path('/hpc/srs/castep'),
]

# Take the first existing candidate from each list (None if none exists)
module_path = next(filter(Path.exists, module_path_list), None)
data_path = next(filter(Path.exists, data_path_list), None)

# Report every unresolved path before stopping, so both problems are visible
if module_path is None:
    print("Error: Could not locate a valid module path.")
if data_path is None:
    print("Error: Could not locate a valid data path.")
if None in (module_path, data_path):
    sys.exit(1)

# Put the custom modules first on sys.path, without adding a duplicate entry
if sys.path.count(str(module_path)) == 0:
    sys.path.insert(0, str(module_path))

# Show which locations were picked up
print(f"module_path = {module_path}")
print(f"data_path = {data_path}")

module_path = /Users/steven/academic-iCloud/Python/modules
data_path = /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates


### Import modules

In [45]:
# # Ensure modules are reloaded 
%load_ext autoreload
%autoreload 2

# Import standard modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt

# Import custom module
import SRSCALCUTILS.castep_tools as ct

from IPython.display import display

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Paths for the calculation

In [46]:
# Job identifiers: the CASTEP files are named after the job folder
job_folder = "si001_templates"
filename = job_folder
verbose = False             # Set this True to print out more information

title = "Job folder: {}. Filename: {}.".format(job_folder, filename)
job_path = data_path.joinpath(job_folder)

# Show where the job will be written
print(f"job_path = {job_path}")

# Create the job folder together with any missing parents; nothing happens
# when the directory is already there
os.makedirs(job_path, exist_ok=True)

# Work from inside the job folder from here on
os.chdir(job_path)

# CAUTION: going by its name and logged output, this clears the files in the
# current working directory, which is job_path after the chdir above
ct.delete_all_files_in_cwd(force=True)

job_path = /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates/si001_templates
No files found in /Users/steven/Library/Mobile Documents/com~apple~CloudDocs/academic/Calculations/castep/work-2025/si001_templates/si001_templates. Nothing to delete.


### Base parameters of calculation and unit cell

In [47]:
# Settings handed to ct.write_param_file for the CASTEP .param file
params = dict(
    task='geomopt',
    xc_functional='PBE',
    cut_off_energy=300,
    spin_polarised='true',
    write_cell_structure='true',
    charge=0,
    nextra_bands=12,
    geom_energy_tol=0.00005,    # default: 0.00005 eV
    geom_disp_tol=0.002,        # default: 0.002 Ang.
    geom_max_iter=100,          # default: 30
    geom_convergence_win=2,     # default: 2
    max_scf_cycles=300,         # default: 30
)

# 2 x 3 block of zeros handed to ct.write_cell_file as cell_constraints
cell_constraints = np.zeros((2, 3), dtype=int)

# Monkhorst-Pack k-point grid
kpoints_mp_grid = np.array((2, 2, 2))

# Ion fixing and symmetry options for the .cell file
fix_all_ions = False
symmetry_generate = False
symmetry_tol = 0.01

In [48]:
# Bulk cell: a diagonal lattice, one row per lattice vector
lattice_cart = np.diag([3.8641976, 7.7283952, 5.4648012])

# Eight Si atoms as rows of [label, x, y, z] in fractional coordinates.
# dtype=object keeps the string label and the float coordinates in one array.
positions_frac = np.array(
    [
        ['Si', x, y, z]
        for x, y, z in (
            (0.0, 0.75, 0.75),
            (0.0, 0.25, 0.75),
            (0.5, 0.75, 0.5),
            (0.5, 0.25, 0.5),
            (0.5, 0.5, 0.25),
            (0.5, 0.0, 0.25),
            (0.0, 0.5, 0.0),
            (0.0, 0.0, 0.0),
        )
    ],
    dtype=object,
)



### Load positions_frac_surface from template

In [49]:
def read_positions_frac_from_template(
        path=".",
        filename="filename", 
        lattice_cart_bulk=np.array([
            [3.8641976,     0.0,     0.0],
            [0.0,     7.7283952,     0.0],
            [0.0,     0.0,     5.4648012]
        ]),
        positions_frac_bulk = np.array([   
            ['Si',       0.0000000000,    0.7500000000,    0.7500000000],
            ['Si',       0.0000000000,    0.2500000000,    0.7500000000],
            ['Si',       0.5000000000,    0.7500000000,    0.5000000000],
            ['Si',       0.5000000000,    0.2500000000,    0.5000000000],
            ['Si',       0.5000000000,    0.5000000000,    0.2500000000],
            ['Si',       0.5000000000,    0.0000000000,    0.2500000000],
            ['Si',       0.0000000000,    0.5000000000,    0.0000000000],
            ['Si',       0.0000000000,    0.0000000000,    0.0000000000]
            ], dtype=object),
        surface_unit_cell_dims = np.array([1,1,2]), 
        sort_order=['z', 'y', 'x', 'atom']
    ):
    """
    Read a surface motif from a CASTEP template and express it in a bulk-derived cell.

    The template is loaded and sorted, its first
    ``len(positions_frac_bulk) * surface_unit_cell_dims[2]`` atoms are kept,
    and those atoms are re-expressed in fractional coordinates of a cell equal
    to the bulk lattice with its last lattice vector multiplied by
    ``surface_unit_cell_dims[2]``.

    Parameters
    ----------
    path : str or path-like
        Folder containing the template file.
    filename : str
        Name of the template file, passed to ``ct.load_atoms_from_castep``.
    lattice_cart_bulk : (3, 3) array-like
        Bulk lattice vectors, one per row. Not modified.
    positions_frac_bulk : array-like
        Bulk atoms; only the number of rows is used.
    surface_unit_cell_dims : sequence of 3 ints
        Only the third entry (the z-multiple) is used.
    sort_order : list of str
        Passed as ``order`` to ``ct.sort_positions_frac``.
        NOTE(review): presumably sorts on z first so that the slice keeps one
        end of the slab -- confirm the direction against castep_tools.

    Returns
    -------
    positions_frac_surf
        Selected atoms in fractional coordinates of ``lattice_cart_surf``.
    lattice_cart_surf : (3, 3) float array
        Bulk lattice with the last row scaled by the z-multiple.

    Raises
    ------
    ValueError
        If the z-multiple is smaller than 1, or the template holds fewer atoms
        than the number requested.
    """
    z_multiple = int(surface_unit_cell_dims[2])
    if z_multiple < 1:
        raise ValueError(
            f"surface_unit_cell_dims[2] must be at least 1, got {surface_unit_cell_dims[2]}."
        )

    # Load the template file and make sure it is sorted
    positions_frac_template, lattice_cart_template = ct.load_atoms_from_castep(path, filename)
    positions_frac_template = ct.sort_positions_frac(positions_frac_template, order=sort_order)

    # Convert the template to cartesian coordinates using the template's OWN lattice
    positions_cart_surf = ct.frac_to_cart(lattice_cart_template, positions_frac_template)

    # Number of atoms to take from the template: one bulk motif per z-repeat
    number_of_atoms_surf = len(positions_frac_bulk) * z_multiple

    # A short template would otherwise be truncated silently by the slice
    # below and give a cell with too few atoms for its lattice
    atoms_available = len(positions_cart_surf)
    if atoms_available < number_of_atoms_surf:
        raise ValueError(
            f"Template '{filename}' contains {atoms_available} atoms but "
            f"{number_of_atoms_surf} are required "
            f"({len(positions_frac_bulk)} bulk atoms x {z_multiple})."
        )

    # Select the correct number of atoms from the (sorted) template
    positions_cart_surf = positions_cart_surf[:number_of_atoms_surf, :]

    # Build the surface lattice from a fresh float copy of the bulk lattice,
    # stretching only the last lattice vector by the z-multiple
    lattice_cart_surf = np.array(lattice_cart_bulk, dtype=float)
    lattice_cart_surf[-1, :] *= z_multiple

    # Remove the z-offset from the cartesian coordinates
    positions_cart_surf = ct.remove_z_offset(positions_cart_surf)

    # Convert the selected positions back to fractional coordinates of the new cell
    positions_frac_surf = ct.cart_to_frac(lattice_cart_surf, positions_cart_surf)

    return positions_frac_surf, lattice_cart_surf

In [50]:
# Relaxed clean-surface template, stored in the 'templates' folder under data_path
template_filename = "si001_clean-out.cell"
template_path = data_path.joinpath('templates')

# Take two bulk cells' worth of atoms from the template (z-multiple of 2)
positions_frac_surf, lattice_cart_surf = read_positions_frac_from_template(
    template_path,
    template_filename,
    lattice_cart_bulk=lattice_cart,
    positions_frac_bulk=positions_frac,
    surface_unit_cell_dims=np.array([1, 1, 2]),
)

In [51]:
lattice_cart_surf

array([[ 3.8641976,  0.       ,  0.       ],
       [ 0.       ,  7.7283952,  0.       ],
       [ 0.       ,  0.       , 10.9296024]])

In [52]:
# Convert the template-derived surface motif to cartesian coordinates and
# write it out as an XYZ file. `filename` is the job name, so the file is
# written into template_path under the job name.
positions_cart_surf = ct.frac_to_cart(lattice_cart_surf, positions_frac_surf)
_ = ct.write_xyz(positions_cart_surf, template_path, filename)

Wrote XYZ file to: /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates/templates/si001_templates.xyz


In [68]:
import numpy as np

def create_surface_supercell(
    lattice_cart_bulk: np.ndarray,
    positions_frac_bulk: np.ndarray,
    lattice_cart_surf: np.ndarray,
    positions_frac_surf: np.ndarray,
    n: tuple[int,int,int],
    tol: float = 1e-6
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the lattice of an (na x nb x nc) surface supercell.

    WORK IN PROGRESS: only the supercell lattice is computed. The atoms are
    not yet tiled; ``positions_frac_surf`` is returned unchanged.

    TODO: build the atom list so that the bottom of the supercell is the bulk
    motif and the top is the surface motif. The intended layer count is
    ``nc - atoms_surf // atoms_bulk`` bulk layers when nc exceeds that ratio,
    otherwise a single bulk layer with the surface motif trimmed to fit.

    Parameters
    ----------
    lattice_cart_bulk : (3, 3) array-like
        Bulk lattice vectors, one per row. Not modified.
    positions_frac_bulk : (M, 4) object-array
        Bulk atoms as [label, x, y, z]; only the row count is used for now.
    lattice_cart_surf : (3, 3) array-like
        Surface-motif lattice. Currently unused.
    positions_frac_surf : (K, 4) object-array
        Surface atoms as [label, x, y, z]; K must be a multiple of M.
    n : tuple of 3 ints
        Repeats (na, nb, nc) along the three lattice vectors; nc must be >= 2.
    tol : float
        Currently unused.

    Returns
    -------
    lattice_cart_super : (3, 3) float-array
        The bulk lattice with row i multiplied by n[i].
    positions_frac_surf
        The input surface positions, unchanged (see the note above).

    Raises
    ------
    ValueError
        If nc < 2, if the bulk motif is empty, or if the number of surface
        atoms is not a multiple of the number of bulk atoms.
    """
    na, nb, nc = n

    # Check user input
    if nc < 2:
        raise ValueError("n[2] (z-multiple) must be at least 2 for a surface supercell.")

    atoms_bulk = positions_frac_bulk.shape[0]
    atoms_surf = positions_frac_surf.shape[0]

    if atoms_bulk == 0:
        raise ValueError("The bulk motif must contain at least one atom.")

    # The surface motif must be made of whole bulk-cell repeats
    if atoms_surf % atoms_bulk != 0:
        raise ValueError("The number of atoms in the surface motif must be a multiple of the number of atoms in the bulk motif.")

    # Scale each lattice vector (row) by its repeat count. This works from the
    # lattice_cart_bulk argument, not a notebook global, and builds a new
    # array so the caller's lattice is left untouched.
    repeats = np.array([na, nb, nc], dtype=float)
    lattice_cart_super = np.array(lattice_cart_bulk, dtype=float) * repeats[:, None]

    return lattice_cart_super, positions_frac_surf

# Requested supercell repeats (na, nb, nc) along the three lattice vectors
n=(1,1,5)
# Returns the supercell lattice first, then the positions
lattice_cart_super, positions_frac_super = create_surface_supercell(
    lattice_cart_bulk=lattice_cart,
    positions_frac_bulk=positions_frac,
    lattice_cart_surf=lattice_cart_surf,
    positions_frac_surf=positions_frac_surf,
    n=n
)

In [67]:
lattice_cart_super

NameError: name 'lattice_cart_super' is not defined

In [56]:
positions_frac

array([['Si', 0.0, 0.75, 0.75],
       ['Si', 0.0, 0.25, 0.75],
       ['Si', 0.5, 0.75, 0.5],
       ['Si', 0.5, 0.25, 0.5],
       ['Si', 0.5, 0.5, 0.25],
       ['Si', 0.5, 0.0, 0.25],
       ['Si', 0.0, 0.5, 0.0],
       ['Si', 0.0, 0.0, 0.0]], dtype=object)

In [56]:
# n = [1,2,1] 
# positions_frac_surf = np.array([
#             ['Si', 0.0,  0.0,  0.0],
#             ['Si', 0.5,  0.0,  0.25],
#             ['Si', 0.5,  0.5,  0.5],
#             ['O', 0.0,  0.7,  0.75],
#         ])
# positions_frac, lattice_cart = create_supercell_from_fractional_coord_with_surf(positions_frac,lattice_cart,n,positions_frac_surf)

### Build supercell

In [57]:
# n = [1,2,3] 
# positions_frac, lattice_cart = ct.create_supercell_from_fractional_coords(positions_frac,lattice_cart,n)
# positions_frac = ct.sort_positions_frac(positions_frac)

### Add hydrogen termination to cell bottom 

In [58]:
# # First select the atoms on the bottom layer 
# criteria = ('min','min','min')
# selected_atom_frac, selected_atom_cart = ct.select_atom_by_conditions(positions_frac, lattice_cart, criteria)
# reference_position = selected_atom_cart[1:]
# plane = (0,0,1)
# labelled_positions_frac = ct.select_atoms_by_plane(
#     positions_frac,
#     lattice_cart,
#     plane,
#     reference_position,
#     tolerance=1
# )

# # Next calculate the relative bond vectors for the two hydrogen atoms (per Si)
# phi = np.deg2rad(90)    
# si_h_bond = 1.5                         
# theta_h1 = np.deg2rad(180) - np.arccos(-1/3) / 2                
# theta_h2 = np.deg2rad(180) + np.arccos(-1/3) / 2            
# bond_h1  = ct.bond_vector_from_spherical(theta_h1, phi, si_h_bond)                   
# bond_h2  = ct.bond_vector_from_spherical(theta_h2, phi, si_h_bond)

# # Now add the hydrogen atoms to the silicon atoms selected above using the calculated bond vectors
# positions_frac, lattice_cart = ct.add_atoms_to_positions_frac(
#                                     labelled_positions_frac,
#                                     lattice_cart,
#                                     [bond_h1,bond_h2],
#                                     extend_unit_cell=(0, 0, 1),
#                                     atom="H")
# positions_frac = ct.sort_positions_frac(positions_frac)

### Dimerise surface (2x1)

In [59]:
# # Find a surface atom to define the surface plane
# criteria = ('min','min','max')
# selected_atom_frac, selected_atom_cart = ct.select_atom_by_conditions(positions_frac, lattice_cart, criteria)
# reference_position = selected_atom_cart[1:]
# plane = (0,0,1)

# # Label the atoms at the 001 surface
# labelled_positions_frac_surf = ct.select_atoms_by_plane(
#     positions_frac,
#     lattice_cart,
#     plane,
#     reference_position,
#     tolerance=1
# )

# # Sublabel the two atoms of each dimer p(2x1)
# labelled_positions_frac_dim1, labelled_positions_frac_dim2 = ct.selected_toggle_plane_selection(
#     labelled_positions_frac_surf, 
#     fast='y', 
#     slow='x', 
#     alternate=False
# )

# # Dimerise surface
# disp_y = 0.734
# disp_z = 0.242
# v1 = (0,-disp_y,-disp_z)
# v2 = (0,disp_y,-disp_z)
# positions_frac = ct.selected_translate(labelled_positions_frac_dim1, lattice_cart, v1, return_labelled=False)

# labelled_positions_frac_dim2 = ct.update_labelled_positions_frac(labelled_positions_frac_dim2,positions_frac)
# positions_frac = ct.selected_translate(labelled_positions_frac_dim2, lattice_cart, v2, return_labelled=False)



### Dimerise surface (add c(4x2) buckling)

In [60]:
# # Sublabel the two atoms of each dimer c(4x2)
# labelled_positions_frac_buc1, labelled_positions_frac_buc2 = ct.selected_toggle_plane_selection(
#     labelled_positions_frac_surf, 
#     fast='y', 
#     slow='x', 
#     alternate=True
# )

# disp_y = 0.0
# disp_z = 0.4
# v1 = (0,-disp_y,-disp_z)
# v2 = (0,disp_y,disp_z)

# labelled_positions_frac_buc1 = ct.update_labelled_positions_frac(labelled_positions_frac_buc1,positions_frac)
# positions_frac = ct.selected_translate(labelled_positions_frac_buc1, lattice_cart, v1, return_labelled=False)

# labelled_positions_frac_buc2 = ct.update_labelled_positions_frac(labelled_positions_frac_buc2,positions_frac)
# positions_frac = ct.selected_translate(labelled_positions_frac_buc2, lattice_cart, v2, return_labelled=False)


### Add hydrogen to the top

In [61]:
# # Updated surface labels with current surface positions after dimerisation
# labelled_positions_frac_surf = ct.update_labelled_positions_frac(labelled_positions_frac_surf,positions_frac)
# labelled_positions_frac_dim1 = ct.update_labelled_positions_frac(labelled_positions_frac_dim1,positions_frac)
# labelled_positions_frac_dim2 = ct.update_labelled_positions_frac(labelled_positions_frac_dim2,positions_frac)

# # Next calculate the relative bond vectors for the two hydrogen atoms (per Si)
# phi = np.deg2rad(90)    
# si_h_bond = 1.5                         
# # theta_h1 = np.deg2rad(0) + np.arccos(-1/3) / 2                
# # theta_h2 = np.deg2rad(0) - np.arccos(-1/3) / 2            
# theta = 20 # degrees
# theta_h1 = np.deg2rad(theta)
# theta_h2 = np.deg2rad(360 - theta)
# bond_h1  = ct.bond_vector_from_spherical(theta_h1, phi, si_h_bond)                   
# bond_h2  = ct.bond_vector_from_spherical(theta_h2, phi, si_h_bond)

# # Now add the hydrogen atoms to the silicon atoms selected above using the calculated bond vectors
# positions_frac_h1, lattice_cart = ct.add_atoms_to_positions_frac(
#                                     labelled_positions_frac_dim1,
#                                     lattice_cart,
#                                     bond_h1,
#                                     extend_unit_cell=(0, 0, 1),
#                                     atom="H")

# # Now add the hydrogen atoms to the silicon atoms selected above using the calculated bond vectors
# positions_frac_h2, lattice_cart = ct.add_atoms_to_positions_frac(
#                                     labelled_positions_frac_dim2,
#                                     lattice_cart,
#                                     bond_h2,
#                                     extend_unit_cell=(0, 0, 1),
#                                     atom="H")

# # Merge the two sets of positions_frac for the two sets of surface hydrogen atoms and resort. 
# positions_frac = ct.merge_posfrac_or_labelled_posfrac(positions_frac_h1, positions_frac_h2)
# positions_frac = ct.sort_positions_frac(positions_frac)

### Add vacuum spacing

In [62]:
# vac = 8
# positions_frac, lattice_cart = ct.create_vacuum_spacing(positions_frac, lattice_cart, vac)

### Add ionic constraints

In [63]:
# Create fixed ion constraints
# Selection rule passed to ct.select_atoms_by_region
# (presumably a cartesian z threshold in Angstrom -- TODO confirm).
# NOTE(review): this selects from the bulk positions_frac / lattice_cart, not
# from the surface cell loaded from the template -- confirm this is intended.
conditions = "z < 2.5"
ionic_constraints = ct.select_atoms_by_region(positions_frac, lattice_cart, conditions)

In [64]:
# Path of the CASTEP output file that belongs to this job
castep_path = job_path.joinpath(f"{filename}.castep")

# Banner naming that file, framed by two 80-character rules
print('=' * 80, f'\nCASTEP file: {castep_path}\n', '=' * 80, '\n', sep='')

# .param file: calculation settings
param_filename = ct.write_param_file(
    params,
    path=job_path,
    filename=filename,
    title=title,
    display_file=True,
)

# .cell file: lattice, atoms, constraints, symmetry and k-point grid
cell_filename = ct.write_cell_file(
    path=job_path,
    filename=filename,
    title=title,
    lattice_cart=lattice_cart,
    positions_frac=positions_frac,
    cell_constraints=cell_constraints,
    ionic_constraints=ionic_constraints,
    fix_all_ions=fix_all_ions,
    symmetry_generate=symmetry_generate,
    symmetry_tol=symmetry_tol,
    kpoints_mp_grid=kpoints_mp_grid,
    display_file=True,
)

# Job submission script with the requested scheduler resources
job_filename = ct.write_job_script(
    path=job_path,
    filename=filename,
    wall_time='72:00:00',
    mem_per_slot='5500M',
    threads=4,
    total_slots=192,
    display_file=False,
)

# # Convert to cart and write xyz
# positions_cart = ct.frac_to_cart(lattice_cart, positions_frac)
# xyz = ct.write_xyz(positions_cart, path=job_path, filename=filename, comment='Converted by SRSCALCUTILS')


CASTEP file: /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates/si001_templates/si001_templates.castep

Wrote param file to: /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates/si001_templates/si001_templates.param
!TITLE: Job folder: si001_templates. Filename: si001_templates.

TASK                 : geomopt
XC_FUNCTIONAL        : PBE
CUT_OFF_ENERGY       : 300
SPIN_POLARISED       : true
WRITE_CELL_STRUCTURE : true
CHARGE               : 0
NEXTRA_BANDS         : 12
GEOM_ENERGY_TOL      : 5e-05
GEOM_DISP_TOL        : 0.002
GEOM_MAX_ITER        : 100
GEOM_CONVERGENCE_WIN : 2
MAX_SCF_CYCLES       : 300


Wrote cell file to: /Users/steven/academic-iCloud/Calculations/castep/work-2025/si001_templates/si001_templates/si001_templates.cell
! Job folder: si001_templates. Filename: si001_templates.

%BLOCK lattice_cart
   ANG
       3.8641976000    0.0000000000    0.0000000000
       0.0000000000    7.7283952000    0.0000000000
       0.00000000

In [65]:
# if str(module_path) == '/hpc/srs/Python/modules':
#     !bash -l -c "mpirun -np 62 castep.mpi {filename}"
# else:
#     !zsh -l -c "castepmpi {filename}"


In [66]:

# unit_cell, a, b, c, alpha, beta, gamma = ct.get_final_lattice_parameters(castep_path)
# energy_optimisation = ct.get_LBFGS_energies(castep_path)
# energies = [val for _, val in energy_optimisation]
# energy = energies[-1]

# print('Optimised energy {} eV'.format(energy))
# print('Optimised lattice constants a,b = {} Ang., c = {} Ang.'.format(a,c))
# print('Unit cell:')
# for line in unit_cell:
#     print(' '*2,np.abs(line))
