## Check pySPM version

In [1]:
# Print the installed pySPM version so the notebook records which release it ran against.
import pySPM
print(pySPM.__version__)

0.6.2


In [None]:
# Open one raw .spm scan with spmpy and show its parsed header; the bare
# expression on the last line is what the notebook renders as cell output.
# NOTE(review): the path is hard-coded to a single user's OneDrive folder.
from spmpy import SPMFile
spm_file = SPMFile('C:/Users/cobia/OneDrive - University of Cambridge/HF_Database/AFM/raw/short_end_01.0_00000.spm')
spm_file.header

In [1]:
import json
import numpy as np
from pint import Quantity
from spmpy.ciaoparams import ScaleParameter, ValueParameter, SelectParameter
from spmpy import SPMFile
import os

def custom_serializer(obj):
    """
    Convert objects the ``json`` module cannot encode into serializable ones.

    Intended to be passed as ``default=`` to ``json.dump`` / ``json.dumps``,
    which call it only for objects they cannot encode themselves.

    Conversions, checked in this order:
      * NumPy scalar (``np.generic``) -> native Python scalar via ``.item()``
      * NumPy array                   -> nested list via ``.tolist()``
      * pint ``Quantity``             -> ``{"value": magnitude, "unit": str(units)}``
      * spmpy Scale/Value/Select parameter -> dict holding ``_type`` plus
        whichever of ``value``, ``unit`` and ``choices`` the object exposes
      * anything else                 -> ``str(obj)``, or a placeholder string
        if even ``str()`` raises

    Returns:
        A value the JSON encoder can process further. Nested values (for
        example an array magnitude inside a Quantity) are passed back through
        this function by the encoder.
    """

    # --- NumPy scalars ---
    # np.int64 / np.float32 / np.bool_ are not instances of the built-in
    # int / float / bool, so without this branch they would hit the str()
    # fallback and be written as JSON strings instead of numbers.
    if isinstance(obj, np.generic):
        return obj.item()

    # --- NumPy arrays ---
    if isinstance(obj, np.ndarray):
        return obj.tolist()

    # --- Pint Quantity ---
    if isinstance(obj, Quantity):
        return {"value": obj.magnitude, "unit": str(obj.units)}

    # --- spmpy parameters ---
    if isinstance(obj, (ScaleParameter, ValueParameter, SelectParameter)):
        # Capture only the core fields that this particular object exposes
        d = {"_type": obj.__class__.__name__}
        if hasattr(obj, "value"):
            d["value"] = obj.value
        if hasattr(obj, "unit"):
            d["unit"] = str(obj.unit)
        if hasattr(obj, "choices"):  # for SelectParameter
            d["choices"] = obj.choices
        return d

    # --- Fallback for other objects ---
    # Best effort: never let one odd header entry abort the whole export.
    try:
        return str(obj)
    except Exception:
        return f"<<unserializable: {type(obj).__name__}>>"

# Folder containing the raw .spm files, and the folder that receives one JSON
# metadata file per scan.
folder_path = r"C:\Users\cobia\OneDrive - University of Cambridge\HF_Database\AFM\raw"
output_path = r"C:\Users\cobia\OneDrive - University of Cambridge\HF_Database\AFM\metadata"

# Create the destination up front so open() below cannot fail on a missing folder.
os.makedirs(output_path, exist_ok=True)

spm_files = [f for f in os.listdir(folder_path) if f.endswith('.spm')]

for spm_filename in spm_files:
    basename = os.path.splitext(spm_filename)[0]
    spm_file = SPMFile(os.path.join(folder_path, spm_filename))

    # Write the header straight to <basename>.json. custom_serializer handles
    # the entries json cannot encode natively. The header is serialized once,
    # directly into the file, instead of also building an unused string copy.
    with open(os.path.join(output_path, basename + '.json'), "w", encoding="utf-8") as f:
        json.dump(spm_file.header, f, default=custom_serializer, indent=2)




## Get list of SPM files in folder

In [None]:
import os

# Folder holding the raw .spm scans
folder_path = r"C:\Users\cobia\OneDrive - University of Cambridge\HF_Database\AFM\raw"

# Keep only the directory entries whose name ends in '.spm'; the order is
# whatever os.listdir returns.
spm_files = list(filter(lambda entry: entry.endswith('.spm'), os.listdir(folder_path)))

## Import all .spm files and preprocess them as follows (skip if the data are already corrected and you are using the Lumispy kernel):
- Align rows by median of differences
- Filter: scar removal
- Level data by plane subtraction

In [None]:
import pySPM
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
%matplotlib inline

import os
from IPython import display

from skimage.morphology import binary_erosion, disk

import copy

import imageio.v2 as imageio
from skimage import exposure



# Folder containing the raw .spm files, and the folder that receives the
# processed height maps.
folder_path = r"C:\Users\cobia\OneDrive - University of Cambridge\HF_Database\AFM\raw"
output_path = r"C:\Users\cobia\OneDrive - University of Cambridge\HF_Database\AFM\processed"
os.makedirs(output_path, exist_ok=True)

spm_files = [f for f in os.listdir(folder_path) if f.endswith('.spm')]

# One panel per scan, three per row. squeeze=False keeps `ax` two-dimensional
# even when there is a single row, so the [row, col] indexing below always works.
n_cols = 3
n_rows = max(1, math.ceil(len(spm_files) / n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(10, 4 * n_rows), squeeze=False)

# Loop through each file
for idx, filename in enumerate(spm_files):
    filepath = os.path.join(folder_path, filename)
    corrected_csv = os.path.join(output_path, f"{filename}_corrected.csv")
    panel = ax[idx // n_cols, idx % n_cols]

    # Reuse a previously processed result instead of recomputing it.
    if os.path.exists(corrected_csv):
        print(f"Processed file already exists for {filename}")
        topoD = np.loadtxt(corrected_csv, delimiter=',')
        panel.imshow(topoD, cmap='afmhot')
        panel.set_title(filename)
        continue

    print(f"Processing file: {filename}")

    ScanB = pySPM.Bruker(filepath)
    data_channel = ScanB.get_channel("Height")

    # Work on a copy so the channel object returned by pySPM stays untouched.
    topo2 = copy.deepcopy(data_channel)

    # Row-wise levelling: fit a 2nd-order polynomial to every scan line and
    # subtract it from that line.
    pixels = topo2.pixels.copy()
    columns = np.arange(pixels.shape[1])
    for row in range(pixels.shape[0]):
        p = np.polyfit(columns, pixels[row, :], deg=2)
        pixels[row, :] -= np.polyval(p, columns)

    topo2.pixels = pixels

    # Scar removal on the levelled image; inline=False returns a new object.
    topoD = topo2.filter_scars_removal(.7, inline=False)

    topoD.show(ax=panel, cmap='afmhot', title=filename)

    np.save(os.path.join(output_path, f"{filename}_corrected.npy"), topoD.pixels)
    # Also write the CSV that the existence check above looks for; without it
    # the cache never hits and every scan is reprocessed on each run.
    # NOTE(review): later cells read "<name>_corrected.csv" from folder_path,
    # not output_path - confirm which location is intended.
    np.savetxt(corrected_csv, topoD.pixels, delimiter=',')

plt.show()

## Import manual labels for each AFM scan and visualise

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Patch
from math import sqrt
import os
import math

# Prepare figure layout: one panel per scan, three per row.
# squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
# list of panels whatever the number of images (including a single image).
n_images = len(spm_files)
n_cols = 3
n_rows = max(1, math.ceil(n_images / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 14 / n_cols * n_rows), squeeze=False)
axes = axes.flatten()

for i, filename in enumerate(spm_files):
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")
    label_file = os.path.join(folder_path, f"{os.path.splitext(filename)[0]}_categorised.csv")

    afm_data = pd.read_csv(afm_file, header=None).values
    labels = pd.read_csv(label_file)

    ax = axes[i]
    im = ax.imshow(afm_data, cmap='afmhot', vmin=-7, vmax=3)
    ax.set_title(filename, fontsize=10)
    ax.axis('off')

    # Add colorbar to each subplot
    cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label('Height (nm)', fontsize=8)

    # Mean size per category, used as a stand-in for labels without a size
    avg_large = labels[labels['category'] == 'large']['size'].mean()
    avg_small = labels[labels['category'] == 'small']['size'].mean()

    # Add labeled circles (only 'large' and 'small' categories are drawn)
    for _, row in labels.iterrows():
        if row['category'] not in ['large', 'small']:
            continue
        color = 'red' if row['category'] == 'large' else 'orange'
        if pd.isna(row['size']):
            # No size recorded: fall back to the category average
            radius = sqrt(avg_large) if row['category'] == 'large' else sqrt(avg_small)
        else:
            radius = sqrt(row['size'])
        # The label y coordinate is flipped relative to the image row index
        circle = Circle((row['x'], afm_data.shape[0] - 1 - row['y']), radius, color=color, fill=False, linewidth=1.5)
        ax.add_patch(circle)

# Hide unused axes (slicing also works when there are no images at all)
for unused_ax in axes[n_images:]:
    unused_ax.axis('off')

# Add global legend
legend_elements = [
    Patch(edgecolor='red', facecolor='none', label='Large', linewidth=2),
    Patch(edgecolor='orange', facecolor='none', label='Small', linewidth=2)
]
fig.legend(handles=legend_elements, loc='upper center', ncol=2, frameon=False)

plt.tight_layout(rect=[0, 0, 1, 0.99])  # space for legend
plt.show()


In [None]:
# Close every open matplotlib figure.
plt.close('all')

## Display AFM data without manual labels

In [None]:
# Prepare figure layout: one panel per scan, three per row.
# squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
# list of panels whatever the number of images (including a single image).
n_images = len(spm_files)
n_cols = 3
n_rows = max(1, math.ceil(n_images / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 14 / n_cols * n_rows), squeeze=False)
axes = axes.flatten()

for i, filename in enumerate(spm_files):
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")

    # Only the height map is needed here. The label CSV is deliberately not
    # read, so this cell also works for scans that have no manual labels.
    afm_data = pd.read_csv(afm_file, header=None).values

    ax = axes[i]
    im = ax.imshow(afm_data, cmap='afmhot', vmin=-7, vmax=3)
    ax.set_title(filename, fontsize=10)
    ax.axis('off')

# Hide panels that received no image
for unused_ax in axes[n_images:]:
    unused_ax.axis('off')

plt.show()

## Use local thresholding to find spots and visualise:

In [None]:
import os
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from skimage import filters, measure

# Prepare figure layout: one panel per scan, three per row.
# squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
# list of panels whatever the number of images (including a single image).
n_images = len(spm_files)
n_cols = 3
n_rows = max(1, math.ceil(n_images / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 10 / n_cols * n_rows), squeeze=False)
axes = axes.flatten()

# best_offsets comes from the "Automatic Offset Detection" cell and is indexed
# by position in spm_files. A length mismatch means the offsets would be
# applied to the wrong scans, so stop early with a clear message.
if len(best_offsets) != n_images:
    raise ValueError(
        f"best_offsets has {len(best_offsets)} entries but there are {n_images} scans; "
        "re-run the automatic offset detection cell first"
    )

block_size = 19  # px, neighbourhood size for the local threshold

for i, filename in enumerate(spm_files):
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")

    # Load AFM height data (in nm)
    afm_data = pd.read_csv(afm_file, header=None).values

    # Perform local thresholding with the per-scan offset
    offset_nm = best_offsets[i]
    local_thresh = filters.threshold_local(afm_data, block_size=block_size, offset=offset_nm)
    dark_spots = afm_data < local_thresh

    # Find contours of dark spots (0.5 separates False from True in the mask)
    contours = measure.find_contours(dark_spots, level=0.5)

    ax = axes[i]
    im = ax.imshow(afm_data, cmap='afmhot', vmin=-7, vmax=3)
    ax.set_title(filename, fontsize=10)
    ax.axis('off')

    # Draw contours; find_contours yields (row, col) pairs, so swap for plotting
    for contour in contours:
        ax.plot(contour[:, 1], contour[:, 0], color='blue', linewidth=0.8)

# Hide panels that received no image
for unused_ax in axes[n_images:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()


## Plot thresholding offset vs number of spots for each file


In [None]:
from scipy import ndimage as ndi

plt.close('all')

# Prepare figure layout: one panel per scan, three per row.
# squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
# list of panels whatever the number of images (including a single image).
n_images = len(spm_files)
n_cols = 3
n_rows = max(1, math.ceil(n_images / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 14 / n_cols * n_rows), squeeze=False)
axes = axes.flatten()

block_size = 19  # px, neighbourhood size for the local threshold
offsets = [x / 10 for x in range(1, 30)]  # 0.1 to 2.9 nm in 0.1 nm steps

# Per-file label tables, concatenated once after the loop
label_frames = []

for i, filename in enumerate(spm_files):
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")
    label_file = os.path.join(folder_path, f"{os.path.splitext(filename)[0]}_categorised.csv")

    label_data = pd.read_csv(label_file)
    label_frames.append(label_data)
    num_labels = len(label_data)

    # Load AFM height data (in nm)
    afm_data = pd.read_csv(afm_file, header=None).values

    num_spots_list = []

    for offset_nm in offsets:
        local_thresh = filters.threshold_local(afm_data, block_size=block_size, offset=offset_nm)
        dark_spots = afm_data < local_thresh

        # Label connected dark regions and measure each region's pixel count
        # (index 0 of the bincount is the background, hence [1:])
        labeled_spots, _ = ndi.label(dark_spots)
        spot_sizes = np.bincount(labeled_spots.flat)[1:]

        # Count spots that are between 6 and 199 pixels (inclusive)
        num_spots = np.sum((spot_sizes >= 6) & (spot_sizes <= 199))

        # Cap at 200 so very noisy thresholds do not dominate the plot scale
        num_spots_list.append(min(num_spots, 200))

    axes[i].plot(offsets, num_spots_list, label=filename)
    axes[i].hlines(y=num_labels, xmin=0, xmax=offsets[-1], linestyles='dashed', colors='red', label=f'Labels: {num_labels}')
    axes[i].set_title(filename, fontsize=10)
    # Show the labels assigned above (they are invisible without a legend)
    axes[i].legend(fontsize=6)

# All manual labels from every scan in one table (used by later cells)
all_labels = pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()

# Hide panels that received no image
for unused_ax in axes[n_images:]:
    unused_ax.axis('off')

plt.show()

## Automatic Offset Detection

In [None]:
from scipy import ndimage as ndi
from skimage import filters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

from scipy.signal import find_peaks

def find_best_offset_first_derivative_peak(offsets, num_spots_list, peak_prominence=1.0, drop_threshold=20):
    """
    Pick an offset from the first peak of the smoothed derivative of the
    spot-count curve.

    Points whose spot count is NaN or not positive are ignored. The derivative
    d(num_spots)/d(offset) of the remaining points is smoothed with a moving
    average (5 samples, or fewer when the curve is shorter) and searched for
    peaks. Starting at the first peak, the chosen offset is the first
    following sample where the smoothed derivative has fallen below
    ``peak_height - drop_threshold``; if that never happens, the offset at the
    peak itself is used.

    Args:
        offsets: sequence of offset values.
        num_spots_list: spot count per offset (same length as ``offsets``).
        peak_prominence: minimum prominence passed to ``find_peaks``.
        drop_threshold: absolute amount (in derivative units, not a fraction)
            the smoothed derivative must drop below the peak height.

    Returns:
        ``(None, None, None)`` when no point is valid; otherwise
        ``(offsets_valid, num_spots_valid, (method, best_offset, deriv_smooth))``
        where ``method`` is ``"derivative_peak"`` or, when no peak exists (or
        fewer than two valid points remain), ``"no_peak"`` with ``best_offset``
        set to the offset of the maximum spot count. ``deriv_smooth`` has
        ``len(offsets_valid) - 1`` entries.
    """
    offsets = np.array(offsets)
    num_spots = np.array(num_spots_list, dtype=float)

    # Mask invalid
    valid_mask = (~np.isnan(num_spots)) & (num_spots > 0)
    if not np.any(valid_mask):
        return None, None, None

    offsets_valid = offsets[valid_mask]
    num_spots_valid = num_spots[valid_mask]

    # Used whenever no derivative peak can be identified
    fallback_offset = offsets_valid[np.argmax(num_spots_valid)]

    # A derivative needs at least two points; np.convolve rejects empty input
    if len(offsets_valid) < 2:
        return offsets_valid, num_spots_valid, ("no_peak", fallback_offset, np.array([]))

    # Compute and smooth derivative. With mode='same' np.convolve returns
    # max(len(a), len(v)) samples, so the window is capped at len(deriv) to
    # keep deriv_smooth aligned with offsets_valid[:-1] on short curves.
    deriv = np.diff(num_spots_valid) / np.diff(offsets_valid)
    window = min(5, len(deriv))
    deriv_smooth = np.convolve(deriv, np.ones(window) / window, mode='same')

    # Find peaks in the derivative
    peaks, _ = find_peaks(deriv_smooth, prominence=peak_prominence)
    if len(peaks) == 0:
        return offsets_valid, num_spots_valid, ("no_peak", fallback_offset, deriv_smooth)

    first_peak_idx = peaks[0]
    peak_height = deriv_smooth[first_peak_idx]

    print('First peak height:', peak_height)

    # Find the right edge: first sample after the peak where the derivative
    # has dropped by more than drop_threshold below the peak height
    right_edge_idx = first_peak_idx
    print('Looking for next point where derivative < ', -drop_threshold + peak_height)
    for j in range(first_peak_idx + 1, len(deriv_smooth)):

        if deriv_smooth[j] < -drop_threshold + peak_height:
            right_edge_idx = j
            print('Found the point at: ', deriv_smooth[j])
            break

    best_offset = offsets_valid[right_edge_idx]
    return offsets_valid, num_spots_valid, ("derivative_peak", best_offset, deriv_smooth)


def find_best_offset_plateau(offsets, num_spots_list, deriv_threshold=1.0):
    """
    Pick an offset at the right edge of the longest plateau of the spot-count
    curve, falling back to a knee point when no plateau exists.

    Points whose spot count is NaN or not positive are ignored. A plateau is
    a run of consecutive samples where the absolute smoothed derivative
    (moving average over 10 samples, or fewer when the curve is shorter) is
    below ``deriv_threshold``. The knee is the point furthest from the straight
    line joining the first and last point of the normalised curve.

    Args:
        offsets: sequence of offset values.
        num_spots_list: spot count per offset (same length as ``offsets``).
        deriv_threshold: absolute smoothed-derivative value below which a
            sample counts as part of a plateau.

    Returns:
        ``(None, None, None)`` when no point is valid; otherwise
        ``(offsets_valid, num_spots_valid, (method, best_offset, deriv_smooth))``
        with ``method`` one of ``"plateau"``, ``"knee"`` or ``"single_point"``
        (only one valid point, returned as-is with an empty derivative).
    """
    offsets = np.array(offsets)
    num_spots = np.array(num_spots_list, dtype=float)

    # Mask invalid (NaN or 0)
    valid_mask = (~np.isnan(num_spots)) & (num_spots > 0)
    if not np.any(valid_mask):
        return None, None, None  # No valid points

    offsets_valid = offsets[valid_mask]
    num_spots_valid = num_spots[valid_mask]

    # A derivative needs at least two points; np.convolve rejects empty input
    if len(offsets_valid) < 2:
        return offsets_valid, num_spots_valid, ("single_point", offsets_valid[0], np.array([]))

    # Compute discrete derivative
    deriv = np.diff(num_spots_valid) / np.diff(offsets_valid)

    # Smooth derivative. With mode='same' np.convolve returns
    # max(len(a), len(v)) samples, so the window is capped at len(deriv) to
    # keep every index of deriv_smooth valid for offsets_valid.
    window = min(10, len(deriv))
    deriv_smooth = np.convolve(deriv, np.ones(window) / window, mode='same')

    # Plateau detection
    plateau_idx = np.where(np.abs(deriv_smooth) < deriv_threshold)[0]

    if len(plateau_idx) > 0:
        # Group consecutive indices
        groups = np.split(plateau_idx, np.where(np.diff(plateau_idx) != 1)[0] + 1)
        # Pick the longest low-derivative segment and take its last sample
        longest = max(groups, key=len)
        best_idx = longest[-1]
        return offsets_valid, num_spots_valid, ("plateau", offsets_valid[best_idx], deriv_smooth)

    # Fallback: Knee detection
    y = num_spots_valid
    x = offsets_valid
    x_span = x.max() - x.min()
    y_span = y.max() - y.min()
    if x_span == 0 or y_span == 0:
        # Degenerate curve: normalising would divide by zero, so there is no
        # knee to find. Return the first valid offset.
        return offsets_valid, num_spots_valid, ("knee", offsets_valid[0], deriv_smooth)

    x_norm = (x - x.min()) / x_span
    y_norm = (y - y.min()) / y_span
    # Vertical distance of each point from the chord between the end points
    distances = np.abs(y_norm - (y_norm[0] + (y_norm[-1] - y_norm[0]) * x_norm))
    knee_idx = np.argmax(distances)
    return offsets_valid, num_spots_valid, ("knee", offsets_valid[knee_idx], deriv_smooth)

# ---- Main loop ----
# For every scan: sweep the local-threshold offset, count the detected spots,
# choose an offset automatically and plot the curve with its derivative.
plt.close('all')
n_images = len(spm_files)
n_cols = 3
n_rows = max(1, math.ceil(n_images / n_cols))
# squeeze=False always yields a 2-D array of Axes, so flatten() gives a flat
# list of panels whatever the number of images (including a single image).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 14 / n_cols * n_rows), squeeze=False)
axes = axes.flatten()

block_size = 19  # px
offsets = [x / 40 for x in range(4, 120)]

# One entry per scan, in spm_files order. Scans without a usable curve get
# NaN so that best_offsets[i] always belongs to spm_files[i].
best_offsets = []

for i, filename in enumerate(spm_files):
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")
    label_file = os.path.join(folder_path, f"{os.path.splitext(filename)[0]}_categorised.csv")

    label_data = pd.read_csv(label_file)
    num_labels = len(label_data)

    afm_data = pd.read_csv(afm_file, header=None).values

    num_spots_list = []

    for offset_nm in offsets:
        local_thresh = filters.threshold_local(afm_data, block_size=block_size, offset=offset_nm)
        dark_spots = afm_data < local_thresh
        labeled_spots, _ = ndi.label(dark_spots)
        # Pixel count per labelled region (index 0 is the background)
        spot_sizes = np.bincount(labeled_spots.flat)[1:]
        num_spots = np.sum((spot_sizes >= 6) & (spot_sizes <= 199))
        # Counts of zero or above 200 are treated as invalid
        num_spots_list.append(num_spots if 0 < num_spots <= 200 else np.nan)

    ax = axes[i]
    # offsets_valid, num_spots_valid, result = find_best_offset_plateau(offsets, num_spots_list)
    offsets_valid, num_spots_valid, result = find_best_offset_first_derivative_peak(offsets, num_spots_list)

    if result is None:
        # Keep the list aligned with spm_files even when nothing was found
        best_offsets.append(np.nan)
        ax.set_title(f"{filename}\nNo valid points")
        continue

    method, best_offset, deriv_smooth = result

    best_offsets.append(best_offset)

    # Plot number of spots
    ax.plot(offsets, num_spots_list, label='Num spots', color='blue')
    ax.axvline(best_offset, color='orange', linestyle='--', label=f'{method} ({best_offset:.2f} nm)')
    ax.hlines(y=num_labels, xmin=0, xmax=offsets[-1], linestyles='dashed', colors='red', label=f'Labels: {num_labels}')

    # Plot derivative on secondary y-axis. Both arrays are trimmed to a
    # common length so that a short curve cannot cause a shape mismatch.
    n_deriv = min(len(offsets_valid) - 1, len(deriv_smooth))
    ax2 = ax.twinx()
    ax2.plot(offsets_valid[:n_deriv], deriv_smooth[:n_deriv], color='green', linestyle='-', label='Derivative (smoothed)')
    ax2.set_ylabel('Δspots / Δoffset', color='green')

    ax.set_title(filename, fontsize=10)
    ax.set_xlabel("Offset (nm)")
    ax.set_ylabel("Spot count")
    ax.legend(fontsize=6, loc='upper left')
    ax2.legend(fontsize=6, loc='upper right')

plt.tight_layout()
plt.show()


## Plot labelled AFM images individually (Viridis colormap)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from math import sqrt


# Plot every scan on its own figure, first plain and then with the manual
# labels overlaid.
for filename in spm_files:
    basename = os.path.splitext(filename)[0]
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")
    label_file = os.path.join(folder_path, f"{basename}_categorised.csv")

    # header=None: the corrected CSV holds only pixel values, so the first
    # line is image data and must not be consumed as column names.
    afm_data = pd.read_csv(afm_file, header=None).values
    labels = pd.read_csv(label_file)

    # Display the shape of the loaded data
    print(f"AFM data shape: {afm_data.shape}")
    print(f"Labels data shape: {labels.shape}")

    # Visualize the AFM data and save it under a per-scan file name so that
    # successive scans do not overwrite each other's image
    plt.figure(figsize=(8, 8))
    plt.imshow(afm_data, cmap='viridis')
    plt.title('AFM Topography')
    plt.colorbar(label='Height (nm)')
    plt.savefig(f'{basename}_afm_topography.png')
    plt.close()

    # Visualize the AFM data with labels overlaid
    plt.figure(figsize=(8, 8))
    plt.imshow(afm_data, cmap='viridis')
    plt.title('AFM Topography with Labeled Pits')
    plt.colorbar(label='Height (nm)')

    # Add circles for each labeled pit
    for index, row in labels.iterrows():
        # Red for 'large', orange for every other category
        color = 'r' if row['category'] == 'large' else 'orange'
        # Flip y using the actual image height instead of assuming 512 rows
        circle = Circle((row['x'], afm_data.shape[0] - 1 - row['y']), sqrt(row['size'])*2, color=color, fill=False, linewidth=2)
        plt.gca().add_patch(circle)

    plt.show()

# plt.savefig('afm_topography_with_labels.png')
# plt.close()

## Create overlapping histograms for 'size' grouped by 'category'


In [None]:
# Overlay one semi-transparent histogram of pit size per category.
categories = ['large', 'small']
plt.figure(figsize=(10, 6))

for category in categories:
    # Labels without a recorded size are NaN; drop them, because plt.hist
    # cannot derive a bin range from data containing NaN.
    sizes = all_labels.loc[all_labels['category'] == category, 'size'].dropna()
    # NOTE(review): 61 presumably converts a size in pixels to nm^2 - confirm
    plt.hist(sizes * 61, bins=20, alpha=0.5, label=category)

plt.xlabel('Pit Size ($nm^2$)')
plt.ylabel('Frequency')
plt.title('Overlapping Histograms of Pit Size by Category')
plt.legend()
plt.show()

## CNN Training Time

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
from tensorflow.keras import layers, models

# -------------------------
# Parameters
# -------------------------
cutout_size = 19               # side length of each square cutout, in pixels
half_size = cutout_size // 2   # pixels on each side of the centre pixel

X = []
y = []

# -------------------------
# Data extraction
# -------------------------
# For every scan: cut a patch around each labelled 'large'/'small' spot
# (label 1), then an equal number of patches centred away from any label
# (label 0).
for filename in spm_files:
    afm_file = os.path.join(folder_path, f"{filename}_corrected.csv")
    label_file = os.path.join(folder_path, f"{os.path.splitext(filename)[0]}_categorised.csv")

    afm_data = pd.read_csv(afm_file, header=None).values
    labels = pd.read_csv(label_file)
    img_rows, img_cols = afm_data.shape

    # Spot cutouts (label=1)
    n_pos_samples = 0
    for _, row in labels.iterrows():
        if row['category'] not in ['large', 'small']:
            continue
        cx, cy = int(row['x']), int(row['y'])
        # Flip y for correct orientation
        cy = img_rows - 1 - cy
        if cx - half_size < 0 or cx + half_size >= img_cols or cy - half_size < 0 or cy + half_size >= img_rows:
            continue  # skip if too close to border
        patch = afm_data[cy - half_size:cy + half_size + 1, cx - half_size:cx + half_size + 1]
        X.append(patch)
        y.append(1)
        n_pos_samples += 1

        # Alternative large-vs-small labelling (disable the no-spot section
        # below when using it):
        # if row['category'] == 'large':
        #     y.append(1)
        # else:
        #     y.append(0)

    ## ---- spot/no-spot section

    # No-spot cutouts (label=0)
    # `mask` marks every pixel that may NOT be used as a negative centre:
    # pixels within half_size of any label (all categories) ...
    mask = np.zeros_like(afm_data, dtype=bool)
    yy, xx = np.ogrid[:img_rows, :img_cols]  # built once per image
    for _, row in labels.iterrows():
        rr, cc = int(row['y']), int(row['x'])
        rr = img_rows - 1 - rr
        mask |= (xx - cc)**2 + (yy - rr)**2 <= half_size**2

    # ... and pixels so close to the border that a full cutout does not fit.
    # Excluding them before sampling means no drawn candidate is discarded.
    mask[:half_size, :] = True
    mask[img_rows - half_size:, :] = True
    mask[:, :half_size] = True
    mask[:, img_cols - half_size:] = True

    # Draw as many negatives as positives were actually extracted from this
    # scan, so the two classes stay balanced.
    neg_coords = np.argwhere(~mask)
    np.random.shuffle(neg_coords)
    for ny, nx in neg_coords[:n_pos_samples]:
        patch = afm_data[ny - half_size:ny + half_size + 1, nx - half_size:nx + half_size + 1]
        X.append(patch)
        y.append(0)

    ## ----- end spot/no-spot section

# Convert to arrays
X = np.array(X)
y = np.array(y)

# Normalise height values (global mean and standard deviation over all patches)
X = (X - np.mean(X)) / np.std(X)
X = X[..., np.newaxis]  # add channel dimension


In [None]:

# -------------------------
# Train/test split
# -------------------------
# 80/20 split, stratified on the class label, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# -------------------------
# CNN model
# -------------------------
# Deeper variant, kept for reference (not used):
# model = models.Sequential([
#     layers.Conv2D(32, (3, 3), activation='relu', input_shape=(cutout_size, cutout_size, 1)),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(64, (3, 3), activation='relu'),
#     layers.MaxPooling2D((2, 2)),
#     layers.Flatten(),
#     layers.Dense(64, activation='relu'),
#     layers.Dense(1, activation='sigmoid')  # binary classification
# ])
#
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# Active model: one convolution + pooling stage, a dense layer and a single
# sigmoid output. The input is one single-channel cutout of
# cutout_size x cutout_size pixels.
patch_shape = (cutout_size, cutout_size, 1)
network_layers = [
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=patch_shape),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
]
model = models.Sequential(network_layers)

# Binary cross-entropy with Adam; accuracy is tracked during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# -------------------------
# Training
# -------------------------
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
)

# -------------------------
# Plot loss curves
# -------------------------
plt.figure(figsize=(6, 4))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# -------------------------
# Predictions
# -------------------------
# The sigmoid output is thresholded at 0.5 to obtain hard 0/1 predictions
y_pred_probs = model.predict(X_val)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# -------------------------
# Confusion matrix
# -------------------------
# The dataset encodes 0 = no-spot cutout and 1 = spot cutout, so the display
# labels follow that encoding. labels=[0, 1] forces a 2x2 matrix even if one
# class is missing from the validation data.
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm, display_labels=['No Spot', 'Spot'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

# -------------------------
# Metrics
# -------------------------
# Precision, recall and F1 refer to the positive class (1 = spot)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, zero_division=0)
recall = recall_score(y_val, y_pred, zero_division=0)
f1 = f1_score(y_val, y_pred, zero_division=0)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


In [None]:
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)
import matplotlib.pyplot as plt
import numpy as np

# -------------------------
# Predictions
# -------------------------
# The sigmoid output is thresholded at 0.5 to obtain hard 0/1 predictions
y_pred_probs = model.predict(X_val)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# -------------------------
# Confusion matrix
# -------------------------
# The dataset encodes 0 = no-spot cutout and 1 = spot cutout, so the display
# labels follow that encoding. class_ids forces two classes everywhere, even
# if one of them is missing from the validation data.
labels = ['No Spot', 'Spot']
class_ids = [0, 1]
cm = confusion_matrix(y_val, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=labels)

fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax, colorbar=False)

# -------------------------
# Metrics
# -------------------------
# Overall precision/recall/F1 refer to the positive class (1 = spot)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='binary', zero_division=0)
recall = recall_score(y_val, y_pred, average='binary', zero_division=0)
f1 = f1_score(y_val, y_pred, average='binary', zero_division=0)

# Per-class metrics (index 0 = no spot, index 1 = spot)
class_prec = precision_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)
class_rec = recall_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)
class_f1 = f1_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)

# Build annotation text
textstr = f"Overall:\n" \
          f"Acc: {accuracy:.3f}\n" \
          f"Prec: {precision:.3f}\n" \
          f"Rec: {recall:.3f}\n" \
          f"F1: {f1:.3f}\n\n" \
          f"Per-class:\n" \
          f"{labels[0]} - P:{class_prec[0]:.2f}, R:{class_rec[0]:.2f}, F1:{class_f1[0]:.2f}\n" \
          f"{labels[1]} - P:{class_prec[1]:.2f}, R:{class_rec[1]:.2f}, F1:{class_f1[1]:.2f}"

# Add text box to the right of the matrix (x = 1.05 in axes coordinates)
ax.text(1.05, 0.5, textstr, transform=ax.transAxes, fontsize=10,
        verticalalignment='center', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score
)
import matplotlib.pyplot as plt
import numpy as np

# -------------------------
# Predictions
# -------------------------
# The sigmoid output is thresholded at 0.5 to obtain hard 0/1 predictions
y_pred_probs = model.predict(X_val)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# -------------------------
# Confusion matrix
# -------------------------
# The dataset encodes 0 = no-spot cutout and 1 = spot cutout, so the display
# labels follow that encoding. class_ids forces two classes everywhere, even
# if one of them is missing from the validation data.
labels = ['No Spot', 'Spot']
class_ids = [0, 1]
cm = confusion_matrix(y_val, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=labels)

fig, ax = plt.subplots(figsize=(7, 7))
disp.plot(cmap=plt.cm.Blues, ax=ax, colorbar=False)

# -------------------------
# Metrics
# -------------------------
# Overall precision/recall/F1 refer to the positive class (1 = spot)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='binary', zero_division=0)
recall = recall_score(y_val, y_pred, average='binary', zero_division=0)
f1 = f1_score(y_val, y_pred, average='binary', zero_division=0)

# Per-class metrics (index 0 = no spot, index 1 = spot)
class_prec = precision_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)
class_rec = recall_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)
class_f1 = f1_score(y_val, y_pred, labels=class_ids, average=None, zero_division=0)

# -------------------------
# Create metrics table data
# -------------------------
# First row is the header; accuracy has no per-class value
table_data = [
    ["Metric", "Overall", labels[0], labels[1]],
    ["Accuracy", f"{accuracy:.3f}", "", ""],
    ["Precision", f"{precision:.3f}", f"{class_prec[0]:.3f}", f"{class_prec[1]:.3f}"],
    ["Recall", f"{recall:.3f}", f"{class_rec[0]:.3f}", f"{class_rec[1]:.3f}"],
    ["F1 Score", f"{f1:.3f}", f"{class_f1[0]:.3f}", f"{class_f1[1]:.3f}"]
]

# -------------------------
# Add table below the confusion matrix
# -------------------------
ax_table = plt.gcf().add_axes([0.15, -0.35, 0.7, 0.25])  # position: [left, bottom, width, height]
ax_table.axis('off')

table = ax_table.table(
    cellText=table_data,
    cellLoc='center',
    colLabels=None,
    loc='center'
)

# Style table
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

# Color header row
for col in range(len(table_data[0])):
    table[(0, col)].set_facecolor('#1f77b4')
    table[(0, col)].set_text_props(color='white', weight='bold')

# Color first column
for row in range(len(table_data)):
    table[(row, 0)].set_facecolor('#c6dcee')
    table[(row, 0)].set_text_props(weight='bold')

plt.subplots_adjust(bottom=0.1)  # make space for table
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Get predictions (probabilities)
y_pred_probs = model.predict(X_val)

# Convert to binary labels
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# Split validation samples into misclassified and correctly classified
incorrect_indices = np.where(y_pred != y_val)[0]
correct_indices = np.where(y_pred == y_val)[0]

print(f"Number of misclassified images: {len(incorrect_indices)}")

labels = ['No Spot', 'Spot']

# Show up to four correctly classified cutouts: positions 0, 1, 3 and 4 of the
# correct list. Positions that do not exist are skipped instead of raising an
# IndexError when fewer than five predictions are correct.
# NOTE(review): the printed count is for misclassified images, but the grid
# shows correct ones - swap in incorrect_indices if errors should be displayed.
example_positions = [0, 1, 3, 4]
example_indices = [correct_indices[k] for k in example_positions if k < len(correct_indices)]

plt.figure(figsize=(5, 5))
for i, idx in enumerate(example_indices):
    plt.subplot(2, 2, i+1)  # 2x2 grid
    plt.imshow(X_val[idx].squeeze(), cmap='gray')
    plt.title(f"True: {labels[y_val[idx]]}, Pred: {labels[y_pred[idx]]}", fontsize=10)
    plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# Quick visual check: display the first entry of X in grayscale.
plt.imshow(X[0,:,:,:], cmap='gray')

In [None]:
# Class balance check: the distinct values in y and how often each occurs.
np.unique(y, return_counts=True)