In [1]:
import numpy as np
import h5py
import random
import os
import sys

In [2]:
# Load the Garstec grid from HDF5 and assemble the model feature/target matrices.
#
# Results left in the notebook namespace:
#   inputs  - columns: log10(age), log10(massini), log10(alphaMLT), log10(eta),
#             log10(yini), raw FeHini, raw alphaFe
#   outputs - columns: log10(Teff), log10(LPhot), log10(dnufit), log10(numax),
#             raw FeH
# Each row corresponds to one element of the concatenated per-track arrays.

# Windows directory
garstec_data = r'C:\Users\kiena\Documents\YEAR 4\PROJECT\Data\Garstec_AS09_chiara.hdf5'
save_dir = r'C:\Users\kiena\Documents\YEAR 4\PROJECT\Data'
os.makedirs(save_dir, exist_ok=True)

# 7 Inputs (each list collects one array per track)
ages = []
massini = []
fehini = []
alphamlt = []
yini = []
eta = []
alphafe = []

# 5 Outputs (removed massfin, G_GAIA, and MeH)
teff = []
luminosity = []
dnufit = []
FeH = []
numax = []

# HDF5 dataset name inside each track -> list that collects its per-track arrays
track_datasets = {
    # Inputs
    'age': ages,
    'massini': massini,
    'FeHini': fehini,
    'alphaMLT': alphamlt,
    'yini': yini,
    'eta': eta,
    'alphaFe': alphafe,
    # Outputs
    'Teff': teff,
    'LPhot': luminosity,
    'dnufit': dnufit,
    'FeH': FeH,
    'numax': numax,
}

# Open the hdf5 file (read-only mode)
with h5py.File(garstec_data, 'r') as hdf:
    grid = hdf['grid']
    tracks = grid['tracks']

    # Get a list of track names and shuffle for random sampling
    # (fixed seed keeps the order reproducible between runs)
    track_names = list(tracks.keys())
    random.seed(1)
    random.shuffle(track_names)

    # Choose a subset of tracks to process (currently all of them)
    selected_tracks = track_names[:]

    for track_name in selected_tracks:
        track = tracks[track_name]
        for dataset_name, collected in track_datasets.items():
            # `[:]` copies the dataset into memory so it outlives the open file
            collected.append(track[dataset_name][:])

# Define a small constant to avoid log10(0)
epsilon = 1e-10


def _stack_column(per_track_values):
    """Concatenate a list of per-track arrays into one (n_rows, 1) column."""
    return np.concatenate(per_track_values).reshape(-1, 1)


def _log10_column(per_track_values, label, floor):
    """Return log10 of the stacked column, clipping values below `floor`.

    Parameters
    ----------
    per_track_values : list of arrays
        One array per track, concatenated in list order.
    label : str
        Variable name used in the clipping report.
    floor : float
        Values smaller than this are replaced by it before taking log10.

    Returns
    -------
    numpy.ndarray
        Column of shape (n_rows, 1) holding log10(max(value, floor)).
    """
    column = _stack_column(per_track_values)
    # Clipping is not invertible, so report it instead of hiding it: every
    # clipped entry ends up as log10(floor) in the training data.
    n_clipped = int(np.count_nonzero(column < floor))
    if n_clipped:
        print(f"Warning: {n_clipped} value(s) of '{label}' are below {floor:g} "
              f"and were clipped before log10.")
    return np.log10(np.maximum(column, floor))


# Input features requiring log10 transformation (column order matters downstream)
log10_vars_inputs = [ages, massini, alphamlt, eta, yini]
log10_input_labels = ['ages', 'massini', 'alphamlt', 'eta', 'yini']

# Transform log10 variables
log10_transformed_inputs = [
    _log10_column(values, label, epsilon)
    for values, label in zip(log10_vars_inputs, log10_input_labels)
]

# Concatenate all inputs: the log10 columns followed by raw `fehini` and raw `alphafe`
inputs = np.hstack(log10_transformed_inputs + [_stack_column(fehini),
                                               _stack_column(alphafe)])

# Output features requiring log10 transformation (strictly positive outputs)
log10_vars_outputs = [teff, luminosity, dnufit, numax]  # Removed massfin
log10_output_labels = ['teff', 'luminosity', 'dnufit', 'numax']

# Transform log10 variables
log10_transformed_outputs = [
    _log10_column(values, label, epsilon)
    for values, label in zip(log10_vars_outputs, log10_output_labels)
]

# Combine transformed log10 outputs with raw FeH (removed MeH and G_GAIA)
outputs = np.hstack(log10_transformed_outputs + [_stack_column(FeH)])

# Features and targets are paired row by row, so their row counts must agree.
assert inputs.shape[0] == outputs.shape[0], (
    f"inputs has {inputs.shape[0]} rows but outputs has {outputs.shape[0]}"
)


In [3]:
# Round-trip check of the feature transformations: every log10-transformed
# column of `inputs` / `outputs` is inverted with 10**x and compared against
# the values originally read from the grid.

# Largest relative round-trip error that still counts as a pass (1e-3 == 0.1%).
# A single shared constant keeps the threshold and the printed label in
# agreement for both the input and the output check.
relative_tolerance = 1e-3

# Names of the log10-transformed variables, in the column order in which
# `inputs` and `outputs` were stacked.
log_input_names = ['ages', 'massini', 'alphamlt', 'eta', 'yini']
log_output_names = ['teff', 'luminosity', 'dnufit', 'numax']


def report_log10_roundtrip(var_names, originals, log_columns, tolerance, eps):
    """Print round-trip error statistics for log10-transformed variables.

    Parameters
    ----------
    var_names : list of str
        Variables to check, in reporting order.
    originals : dict
        Maps each variable name to its array of untransformed values.
    log_columns : dict
        Maps each variable name to its array of log10-transformed values.
    tolerance : float
        Relative error below which a value counts as matching.
    eps : float
        Small constant added to the denominator of the relative error.

    Returns
    -------
    dict
        Maps each variable name to its inverse-transformed array (10**x).

    Notes
    -----
    Values that were clipped to the floor before log10 do not round-trip and
    therefore show up here as large relative errors.
    """
    inverted_by_name = {}
    for var_name in var_names:
        original = originals[var_name]
        # Inverse transform (10^x)
        inverted = 10**log_columns[var_name]
        inverted_by_name[var_name] = inverted

        # Calculate relative error
        relative_error = np.abs((original - inverted) / (original + eps))
        max_error = np.max(relative_error)
        mean_error = np.mean(relative_error)

        print(f"{var_name}:")
        print(f"  Max relative error: {max_error:.8f}")
        print(f"  Mean relative error: {mean_error:.8f}")
        print(f"  Original range: [{np.min(original):.8f}, {np.max(original):.8f}]")
        print(f"  Inverted range: [{np.min(inverted):.8f}, {np.max(inverted):.8f}]")

        # The label is formatted from the tolerance itself so the two cannot drift apart
        all_within_tolerance = np.all(relative_error < tolerance)
        print(f"  All values within {tolerance:.1%} error: {all_within_tolerance}")
        print()
    return inverted_by_name


# ===== Verification of input transformations =====
print("Verifying input transformations...")

# Original (untransformed) values for each input variable
original_inputs = {
    'ages': np.concatenate(ages),
    'massini': np.concatenate(massini),
    'alphamlt': np.concatenate(alphamlt),
    'eta': np.concatenate(eta),
    'yini': np.concatenate(yini),
    'fehini': np.concatenate(fehini),
    'alphafe': np.concatenate(alphafe)
}

# Extract the log-transformed columns from inputs
log_transformed_cols = {
    var_name: inputs[:, i] for i, var_name in enumerate(log_input_names)
}

# Invert, compare with the original values and report
inverse_transformed = report_log10_roundtrip(
    log_input_names, original_inputs, log_transformed_cols,
    relative_tolerance, epsilon,
)

# ===== Verification of output transformations =====
print("Verifying output transformations...")

# Original (untransformed) values for each output variable
original_outputs = {
    'teff': np.concatenate(teff),
    'luminosity': np.concatenate(luminosity),
    'dnufit': np.concatenate(dnufit),
    'numax': np.concatenate(numax),
    'FeH': np.concatenate(FeH)
}

# Extract the log-transformed columns from outputs
log_transformed_output_cols = {
    var_name: outputs[:, i] for i, var_name in enumerate(log_output_names)
}

# Invert, compare with the original values and report
inverse_transformed_outputs = report_log10_roundtrip(
    log_output_names, original_outputs, log_transformed_output_cols,
    relative_tolerance, epsilon,
)

# Also check the raw FeH values (not log-transformed)
feh_original = original_outputs['FeH']
# FeH is stacked directly after the log10 columns, so its index equals their count
feh_in_outputs = outputs[:, len(log_output_names)]
feh_error = np.abs(feh_original - feh_in_outputs)
print("FeH (raw, not log-transformed):")
print(f"  Max absolute error: {np.max(feh_error):.8f}")
print(f"  Mean absolute error: {np.mean(feh_error):.8f}")
print(f"  All values equal: {np.allclose(feh_original, feh_in_outputs)}")

Verifying input transformations...
ages:
  Max relative error: 0.00000000
  Mean relative error: 0.00000000
  Original range: [827.48482040, 20000.00000000]
  Inverted range: [827.48482040, 20000.00000000]
  All values within 0.1% error: True

massini:
  Max relative error: 0.00000000
  Mean relative error: 0.00000000
  Original range: [0.70000000, 1.50000000]
  Inverted range: [0.70000000, 1.50000000]
  All values within 0.1% error: True

alphamlt:
  Max relative error: 0.00000000
  Mean relative error: 0.00000000
  Original range: [1.50004883, 2.29990234]
  Inverted range: [1.50004883, 2.29990234]
  All values within 0.1% error: True

eta:
  Max relative error: 0.00000000
  Mean relative error: 0.00000000
  Original range: [0.00003662, 0.29996338]
  Inverted range: [0.00003662, 0.29996338]
  All values within 0.1% error: True

yini:
  Max relative error: 0.00000000
  Mean relative error: 0.00000000
  Original range: [0.22001587, 0.34999207]
  Inverted range: [0.22001587, 0.34999207]
