# 03B_Threshold_Merging_SSA

### Developed by SSA 

Used on the 1D pattern data files of a single run to identify the best overall threshold value for that run, and then to merge all patterns in that set using this threshold.

### Setup and define file structure 

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import re 

In [None]:
# Point to the folder of thresholded .xy files.
# Every file in this folder whose name ends in ".xy" is listed, and those named
# <data>_bg_<background>_threshold_<value>.xy are grouped for merging.
# The commented-out string at the end of the line is a placeholder example path.
datasets_directory = "D:/I11 Beamtime July/RAW_2D/Run_3_X3_0.5VF/03_pyFAI_Thresh_1D_Plot/" #"C:/path/to/your/individual/xy/files/for/merging/"

# Helper function to extract info from filenames
def parse_filename(filename):
    """
    Extract the data number, background number and threshold from a file name.

    The expected form is ``<data>_bg_<background>_threshold_<value>.xy``,
    for example ``108045_bg_108069_threshold_25.xy``.

    Parameters
    ----------
    filename : str
        Bare file name (without any directory part).

    Returns
    -------
    tuple
        ``(data_num, bg_num, threshold)`` as integers, or
        ``(None, None, None)`` when the name does not follow the pattern.
    """
    # fullmatch (rather than match) also rejects names carrying extra text
    # after ".xy", e.g. "1_bg_2_threshold_3.xy.bak".
    match = re.fullmatch(r"(\d+)_bg_(\d+)_threshold_(\d+)\.xy", filename)
    if match is None:
        return None, None, None
    data_num, bg_num, threshold = (int(part) for part in match.groups())
    return data_num, bg_num, threshold


In [None]:
# Parse filenames and group by dataset
# Analyses each name and separates it into its logical components

files = [name for name in os.listdir(datasets_directory) if name.endswith(".xy")]

# Dictionary: { data_num: { threshold_value: filepath } }
datasets = {}

for file in files:
    data_num, bg_num, threshold = parse_filename(file)
    if data_num is None:
        # Name does not follow the expected pattern - leave the file out
        continue
    datasets.setdefault(data_num, {})[threshold] = os.path.join(datasets_directory, file)

print(f"Found {len(datasets)} unique data sets.")
print("Example entry:")
# Show at most the first three data sets as a quick sanity check
for example_num, example_thresholds in list(datasets.items())[:3]:
    print(f"Data {example_num}: thresholds = {list(example_thresholds)}")

# NOT NEEDED BUT CAN BE USEFUL - Prints all data sets with all thresholds used:
    
#for data_num in sorted(datasets.keys()):
    # Sort thresholds numerically if possible
 #   thresholds_sorted = sorted(datasets[data_num].keys(), key=lambda x: float(x))
  #  print(f"Data {data_num}: thresholds = {thresholds_sorted}")

### Visualise all thresholds for each data set 

In [None]:
# One figure per data set, overlaying the pattern obtained at every threshold
for data_num, thresholds in datasets.items():
    plt.figure(figsize=(8, 5))

    # The thresholds are the dict keys, so sorting the keys gives ascending order
    for threshold in sorted(thresholds):
        path = thresholds[threshold]
        try:
            # The first row is skipped as a header; '#' lines are comments
            pattern = np.loadtxt(path, skiprows=1, comments='#')
            two_theta, intensity = pattern[:, 0], pattern[:, 1]
            plt.plot(two_theta, intensity, label=f"threshold {threshold}")
        except Exception as e:
            # Report the unreadable file and carry on with the next threshold
            print(f"⚠️ Could not read {path}: {e}")

    plt.xlabel(r"$2\theta$ (°)")
    plt.ylabel("Intensity (a.u.)")
    plt.title(f"Data {data_num} — Comparison of Threshold Levels", fontsize=14)
    plt.legend()
    plt.show()


### Determine best threshold value 

In [None]:
# Automatically select a single balanced threshold
#
# Every file gets a "peak preservation" score. Scores are averaged per
# threshold over all data sets and the threshold with the highest mean score
# is proposed. A manual override can replace the automatic choice.


def _peak_preservation_score(intensity):
    """
    Return the balanced "peak preservation" score of one intensity trace.

    score = (std / mean) * (1 - noise_factor), where noise_factor is the mean
    absolute point-to-point difference divided by the maximum intensity.
    The result is not finite (nan/inf) when the mean or the maximum is zero.
    """
    noise_factor = np.mean(np.abs(np.diff(intensity))) / np.max(intensity)
    # Score high for sharp peaks but penalise excessive noise
    return (np.std(intensity) / np.mean(intensity)) * (1 - noise_factor)


threshold_scores = {}

for data_num, thresholds in datasets.items():
    for threshold, path in thresholds.items():
        try:
            data = np.loadtxt(path, skiprows=1)
            two_theta, intensity = data[:, 0], data[:, 1]
            score = _peak_preservation_score(intensity)
        except Exception as e:
            print(f"Could not process {path}: {e}")
            continue

        # A nan/inf score would poison the per-threshold mean and make the
        # max() selection below unreliable, so such files are left out.
        if not np.isfinite(score):
            print(f"Could not process {path}: score is not finite ({score})")
            continue

        threshold_scores.setdefault(threshold, []).append(score)

# Average scores per threshold
mean_scores = {t: np.mean(s) for t, s in threshold_scores.items()}

if not mean_scores:
    raise ValueError(
        "No threshold scores could be computed - check the datasets and their .xy files."
    )

threshold_values = sorted(mean_scores)

print("\n Balanced 'peak preservation' scores per threshold:")
for t in threshold_values:
    print(f"Threshold {t}: mean score = {mean_scores[t]:.4f}")

# Pick best threshold
best_threshold_auto = max(mean_scores, key=mean_scores.get)
print(f"\n Automatically selected balanced global threshold = {best_threshold_auto}")

# Manual override option
# Set manual_override = None to use the automatically selected threshold
manual_override = 65  # e.g. set manual_override = 25 to force a value

if manual_override is not None:
    best_threshold = manual_override
    print(f"✏️ Manual override applied: using threshold = {best_threshold}")
else:
    best_threshold = best_threshold_auto

# An override that matches no scored threshold would select no files later on
if best_threshold not in mean_scores:
    print(f"⚠️ Threshold {best_threshold} is not among the scored thresholds: {threshold_values}")

print(f"\n✅ Final chosen threshold for all datasets = {best_threshold}")

# Plot: score vs threshold
plt.figure(figsize=(7, 5))
scores = [mean_scores[t] for t in threshold_values]

plt.plot(threshold_values, scores, marker='o', linestyle='-', color='royalblue')
plt.axvline(best_threshold, color='red', linestyle='--', label=f"Selected = {best_threshold}")
plt.title("Balanced 'Peak Preservation' Score vs Threshold", fontsize=14)
plt.xlabel("Threshold Value")
plt.ylabel("Mean Score (higher = better)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


In [None]:
# Extract all data files using the chosen best threshold
#
# Patterns are later averaged point by point, which is only meaningful when
# they share the same 2-theta grid. The first successfully loaded file defines
# the reference grid; any file that does not match it is reported and left out.

selected_intensities = []
selected_files = []
reference_two_theta = None

for data_num, thresholds in datasets.items():
    path = thresholds.get(best_threshold)
    if path is None:
        print(f"No file found for Data {data_num} with threshold {best_threshold}")
        continue

    try:
        data = np.loadtxt(path, skiprows=1)
        file_two_theta, intensity = data[:, 0], data[:, 1]
    except (OSError, ValueError, IndexError) as e:
        # Report the unreadable file and carry on, as the other loading cells do
        print(f"Could not process {path}: {e}")
        continue

    if reference_two_theta is None:
        reference_two_theta = file_two_theta
    elif (file_two_theta.shape != reference_two_theta.shape
          or not np.allclose(file_two_theta, reference_two_theta)):
        print(f"Skipping Data {data_num}: 2-theta grid differs from the first selected file ({path})")
        continue

    selected_intensities.append(intensity)
    selected_files.append(path)

# The merging step plots against `two_theta`; make it the shared grid
if reference_two_theta is not None:
    two_theta = reference_two_theta

print(f"\n Selected {len(selected_intensities)} files for merging using threshold = {best_threshold}")

### Merging into single PXRD pattern

In [None]:
# Merge chosen threshold data into a single averaged pattern

if selected_intensities:
    # Point-by-point averaging needs every pattern to have the same length
    pattern_lengths = {len(pattern) for pattern in selected_intensities}
    if len(pattern_lengths) != 1:
        raise ValueError(
            f"Selected patterns have different lengths {sorted(pattern_lengths)}; cannot average them."
        )

    # Rows = selected files, columns = 2-theta points
    merged_array = np.array(selected_intensities)
    avg_intensity = np.mean(merged_array, axis=0)
    std_intensity = np.std(merged_array, axis=0)

    # Built only when there is data to merge, so that an empty selection does
    # not fail here with a NameError or silently reuse values from an earlier run
    merged_data = np.column_stack((two_theta, avg_intensity))

    plt.figure(figsize=(8, 5))
    plt.plot(two_theta, avg_intensity, color='black', label='Merged pattern')
    plt.fill_between(two_theta, avg_intensity - std_intensity, avg_intensity + std_intensity,
                     color='gray', alpha=0.3, label='±1σ')
    plt.title(f"Merged PXRD Pattern (Threshold = {best_threshold})", fontsize=14)
    plt.xlabel(r"$2\theta$ (°)")
    plt.ylabel("Intensity (a.u.)")
    plt.legend()
    plt.show()
else:
    print("❌ No valid data to merge.")


In [None]:
# Zoomed-in view of merged data with optional manual zoom

# Stop early with a clear message if the merging cell has not been run
if 'merged_data' not in locals():
    raise NameError("⚠️ 'merged_data' not found. Please run the merging cell first.")

# Optional manual zoom limits
# Set to None to automatically use full range
x_min, x_max = 0.5, 25  # e.g. 5, 25
y_min, y_max = -0.03, 3.5  # e.g. 0, 100

two_theta = merged_data[:, 0]
intensity = merged_data[:, 1]

# Any limit left as None falls back to the extent of the data
x_min = two_theta.min() if x_min is None else x_min
x_max = two_theta.max() if x_max is None else x_max
y_min = intensity.min() if y_min is None else y_min
y_max = intensity.max() if y_max is None else y_max

# Plot
plt.figure(figsize=(8, 5))
plt.plot(two_theta, intensity, label='Merged thresholded data', color='blue')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xlabel('$2\\theta$ (°)', fontsize=12)
plt.ylabel('Intensity (a.u.)', fontsize=12)
plt.title(f'Zoomed-in Merged Data (Threshold = {best_threshold})', fontsize=14)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


### Save Merged Output 

In [None]:
# Save merged data (.xy and .png) with threshold in name

# Requires `best_threshold` and `merged_data` to be set beforehand
# (e.g. best_threshold = 25).
run_id = "Run_3_X3_0.5VF"
base_directory = "D:/I11 Beamtime July/RAW_2D/Run_3_X3_0.5VF/"

# Output folder, created if it doesn't exist yet
save_dir = os.path.join(base_directory, '03B_merged_thresholded')
os.makedirs(save_dir, exist_ok=True)

# Both outputs share one base name (no timestamp) and differ only by extension
merged_filename = f"{run_id}_merged_thresholded_thresh_{best_threshold}"
xy_path, png_path = (
    os.path.join(save_dir, merged_filename + extension)
    for extension in (".xy", ".png")
)

# Write the merged .xy file; comments='' leaves the header line without a '#' prefix
np.savetxt(xy_path, merged_data, fmt='%.6f', header='2theta  Intensity', comments='')
print(f"Saved merged data as: {xy_path}")

# Render the merged pattern and write it out as .png without displaying it
merged_x = merged_data[:, 0]
merged_y = merged_data[:, 1]
plt.figure(figsize=(8, 5))
plt.plot(merged_x, merged_y, color='blue')
plt.xlabel('$2\\theta$ (°)')
plt.ylabel('Intensity (a.u.)')
plt.title(f"Merged Thresholded Data: {run_id} (Threshold = {best_threshold})")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(png_path, dpi=300)
plt.close()
print(f"Saved plot as: {png_path}")
