In [3]:
import numpy as np
import h5py
import random
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pytorch_lightning import LightningModule

garstec_data = r'C:\Users\kiena\Documents\YEAR 4\PROJECT\Data\Garstec_AS09_chiara.hdf5'


In [4]:
# 7 Inputs
ages = []
massini = []
fehini = []
alphamlt = []
yini = []
eta = []
alphafe = []

# 5 Outputs
teff = []
luminosity = []
dnufit = []
FeH = []
numax = []

# Open the hdf5 file (read-only mode)
with h5py.File(garstec_data, 'r') as hdf:
    grid = hdf['grid']
    tracks = grid['tracks']

    # Get a list of track names and shuffle for random sampling
    track_names = list(tracks.keys())
    random.seed(1)
    random.shuffle(track_names)

    # Choose a subset of tracks to process (or not)
    selected_tracks = track_names[:]

    for track_name in selected_tracks:  # Iterate over the selected track names
        track = tracks[track_name]
        # Inputs
        ages.append(track['age'][:])
        massini.append(track['massini'][:])
        fehini.append(track['FeHini'][:])
        alphamlt.append(track['alphaMLT'][:])
        yini.append(track['yini'][:])
        eta.append(track['eta'][:])
        alphafe.append(track['alphaFe'][:])

        # Outputs
        teff.append(track['Teff'][:])
        luminosity.append(track['LPhot'][:])
        dnufit.append(track['dnufit'][:])
        FeH.append(track['FeH'][:])
        numax.append(track['numax'][:])

# Convert lists to numpy arrays and concatenate directly (no log transformation)
input_arrays = [
    np.concatenate(ages).reshape(-1, 1),
    np.concatenate(massini).reshape(-1, 1),
    np.concatenate(fehini).reshape(-1, 1),
    np.concatenate(alphamlt).reshape(-1, 1),
    np.concatenate(yini).reshape(-1, 1),
    np.concatenate(eta).reshape(-1, 1),
    np.concatenate(alphafe).reshape(-1, 1)
]

# Concatenate all inputs
inputs = np.hstack(input_arrays)

# Concatenate all outputs
output_arrays = [
    np.concatenate(teff).reshape(-1, 1),
    np.concatenate(luminosity).reshape(-1, 1),
    np.concatenate(dnufit).reshape(-1, 1),
    np.concatenate(numax).reshape(-1, 1),
    np.concatenate(FeH).reshape(-1, 1)
]

# Combine outputs
outputs = np.hstack(output_arrays)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, random_state=1)

# Scale data
input_scaler = StandardScaler()
output_scaler = StandardScaler()

X_train_scaled = input_scaler.fit_transform(X_train)
X_test_scaled = input_scaler.transform(X_test)

y_train_scaled = output_scaler.fit_transform(y_train)
y_test_scaled = output_scaler.transform(y_test)

# Store scaler parameters
output_scaler_mean = output_scaler.mean_
output_scaler_scale = output_scaler.scale_

In [10]:
# Load trained model for inverse transform check
checkpoint_path = r"best_model_noLog_v11---epoch=6008-val_loss=41.62628174.ckpt"

class GarstecNet(LightningModule):
    def __init__(self, input_dim, output_dim, output_scaler_mean, output_scaler_scale):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 512),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(512, 256),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(256, 128),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(128, 64),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(32, output_dim)
        )
        self.register_buffer('output_mean', torch.tensor(output_scaler_mean, dtype=torch.float32))
        self.register_buffer('output_scale', torch.tensor(output_scaler_scale, dtype=torch.float32))

    def inverse_transform(self, scaled_tensor):
        # Ensure the tensor is on the same device as the stored buffers
        return scaled_tensor.to(self.output_scale.device) * self.output_scale + self.output_mean



# Load the model
model = GarstecNet.load_from_checkpoint(
    checkpoint_path, 
    input_dim=X_train.shape[1], 
    output_dim=y_train.shape[1], 
    output_scaler_mean=output_scaler_mean,
    output_scaler_scale=output_scaler_scale
)


In [15]:
# Check stored scaler parameters
print("Output Scaler Mean:\n", output_scaler.mean_)
print("Output Scaler Scale:\n", output_scaler.scale_)

Output Scaler Mean:
 [ 4.95140313e+03  6.36342333e+01  6.63054326e+00  2.33001172e-02
 -7.78173600e-01]
Output Scaler Scale:
 [4.10161448e+02 4.32854629e+01 6.79452131e+00 3.29864457e-02
 6.13345625e-01]


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to correct device
model = model.to(device)

# Ensure input tensor is on the same device
y_test_scaled_tensor = torch.tensor(y_test_scaled, dtype=torch.float32).to(device)

# Now it should work without a device mismatch error
y_test_unscaled_model = model.inverse_transform(y_test_scaled_tensor).detach().cpu().numpy()

y_test_unscaled_manual = y_test_scaled * output_scaler.scale_ + output_scaler.mean_

# Compute absolute and relative differences
abs_diff = np.abs(y_test_unscaled_model - y_test_unscaled_manual)
rel_diff = abs_diff / (np.abs(y_test_unscaled_manual) + 1e-10)  # Small epsilon to avoid division by zero

# Print summary statistics
print("Max Absolute Difference: ", np.max(abs_diff))
print("Mean Absolute Difference: ", np.mean(abs_diff))
print("Max Relative Difference: ", np.max(rel_diff))
print("Mean Relative Difference: ", np.mean(rel_diff))

# Check log-transformed values (first 4 columns)
for i in range(4):  
    print(f"Feature {i}: Checking log transform reversibility")
    model_recovered = 10 ** y_test_unscaled_model[:, i]
    manual_recovered = 10 ** y_test_unscaled_manual[:, i]
    
    log_abs_diff = np.abs(model_recovered - manual_recovered)
    log_rel_diff = log_abs_diff / (np.abs(manual_recovered) + 1e-10)

    print(f"  Max Absolute Log-Difference: {np.max(log_abs_diff)}")
    print(f"  Mean Absolute Log-Difference: {np.mean(log_abs_diff)}")
    print(f"  Max Relative Log-Difference: {np.max(log_rel_diff)}")
    print(f"  Mean Relative Log-Difference: {np.mean(log_rel_diff)}")

Max Absolute Difference:  0.0008871249992807861
Mean Absolute Difference:  4.002819973301082e-05
Max Relative Difference:  0.006157867241702994
Mean Relative Difference:  1.2025641625345485e-07
Feature 0: Checking log transform reversibility


  model_recovered = 10 ** y_test_unscaled_model[:, i]
  manual_recovered = 10 ** y_test_unscaled_manual[:, i]
  log_abs_diff = np.abs(model_recovered - manual_recovered)


  Max Absolute Log-Difference: nan
  Mean Absolute Log-Difference: nan
  Max Relative Log-Difference: nan
  Mean Relative Log-Difference: nan
Feature 1: Checking log transform reversibility
  Max Absolute Log-Difference: inf
  Mean Absolute Log-Difference: inf
  Max Relative Log-Difference: inf
  Mean Relative Log-Difference: inf
Feature 2: Checking log transform reversibility
  Max Absolute Log-Difference: inf
  Mean Absolute Log-Difference: inf
  Max Relative Log-Difference: inf
  Mean Relative Log-Difference: inf
Feature 3: Checking log transform reversibility
  Max Absolute Log-Difference: 1.2243287828006544e-07
  Mean Absolute Log-Difference: 3.00580756901602e-08
  Max Relative Log-Difference: 7.576130627521522e-08
  Mean Relative Log-Difference: 2.852428749383892e-08


In [4]:
# Create directories for output
os.makedirs("outlier_analysis", exist_ok=True)
os.makedirs("input_outlier_analysis", exist_ok=True)
os.makedirs("output_outlier_analysis", exist_ok=True)
os.makedirs("luminosity_analysis", exist_ok=True)
os.makedirs("correlation_analysis", exist_ok=True)

# Define column names based on your data
input_names = ['age', 'massini', 'fehini', 'alphamlt', 'yini', 'eta', 'alphafe']
output_names = ['teff', 'luminosity', 'dnufit', 'numax', 'FeH']

In [16]:
def get_basic_stats(data):
    """Calculate basic statistics for a dataset"""
    return {
        'min': np.min(data),
        'max': np.max(data),
        'mean': np.mean(data),
        'median': np.median(data),
        'std': np.std(data)
    }

def find_zscore_outliers(data, threshold=3):
    """Find outliers using Z-score method"""
    z_scores = stats.zscore(data)
    return np.where(np.abs(z_scores) > threshold)[0]

def find_iqr_outliers(data):
    """Find outliers using IQR method"""
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    iqr_lower = q1 - 1.5 * iqr
    iqr_upper = q3 + 1.5 * iqr
    
    return np.where((data < iqr_lower) | (data > iqr_upper))[0], (iqr_lower, iqr_upper)

def find_modified_zscore_outliers(data, threshold=3.5):
    """Find outliers using modified Z-score method (more robust)"""
    median_val = np.median(data)
    mad = stats.median_abs_deviation(data)
    
    if mad == 0:  # Avoid division by zero
        return np.array([])
        
    modified_z = 0.6745 * (data - median_val) / mad
    return np.where(np.abs(modified_z) > threshold)[0]

def print_outlier_summary(data, column_name):
    """Print a summary of outliers detected by different methods"""
    stats_dict = get_basic_stats(data)
    
    print(f"\n{'='*50}")
    print(f"Analyzing column: {column_name}")
    print(f"{'='*50}")
    
    print(f"Min: {stats_dict['min']:.6f}")
    print(f"Max: {stats_dict['max']:.6f}")
    print(f"Mean: {stats_dict['mean']:.6f}")
    print(f"Median: {stats_dict['median']:.6f}")
    print(f"Std Dev: {stats_dict['std']:.6f}")
    
    # Check for infinities or NaN
    inf_vals = np.isinf(data)
    nan_vals = np.isnan(data)
    print(f"Infinity values: {np.sum(inf_vals)}")
    print(f"NaN values: {np.sum(nan_vals)}")
    
    # Z-score method
    z_outliers = find_zscore_outliers(data)
    print(f"\nZ-score outliers (|z| > 3): {len(z_outliers)} points")
    if len(z_outliers) > 0:
        print(f"Sample outlier indices: {z_outliers[:5]} ...")
        print(f"Sample outlier values: {data[z_outliers[:5]]} ...")
    
    # IQR method
    iqr_outliers, iqr_bounds = find_iqr_outliers(data)
    print(f"\nIQR outliers: {len(iqr_outliers)} points")
    print(f"IQR range: [{iqr_bounds[0]:.6f}, {iqr_bounds[1]:.6f}]")
    if len(iqr_outliers) > 0:
        print(f"Sample outlier indices: {iqr_outliers[:5]} ...")
        print(f"Sample outlier values: {data[iqr_outliers[:5]]} ...")
    
    # Modified Z-score
    mod_z_outliers = find_modified_zscore_outliers(data)
    print(f"\nModified Z-score outliers: {len(mod_z_outliers)} points")
    if len(mod_z_outliers) > 0:
        print(f"Sample outlier indices: {mod_z_outliers[:5]} ...")
        print(f"Sample outlier values: {data[mod_z_outliers[:5]]} ...")
        
    return {
        'stats': stats_dict,
        'z_score_outliers': z_outliers,
        'iqr_outliers': iqr_outliers,
        'modified_z_outliers': mod_z_outliers,
        'inf_count': np.sum(inf_vals),
        'nan_count': np.sum(nan_vals)
    }

if __name__ == "__main__":
    # Example: Check the first input parameter
    column_index = 0  # Change this to analyze different columns
    column_name = input_names[column_index]
    column_data = inputs[:, column_index]
    
    results = print_outlier_summary(column_data, column_name)


Analyzing column: age
Min: 827.484820
Max: 20000.000000
Mean: 6377.355143
Median: 4822.932926
Std Dev: 4615.980428
Infinity values: 0
NaN values: 0

Z-score outliers (|z| > 3): 0 points

IQR outliers: 177951 points
IQR range: [-6441.026614, 18075.186904]
Sample outlier indices: [16169 16170 16171 16172 16173] ...
Sample outlier values: [19697.22429 19697.35745 19697.65001 19698.29277 19699.7049 ] ...

Modified Z-score outliers: 206452 points
Sample outlier indices: [16169 16170 16171 16172 16173] ...
Sample outlier values: [19697.22429 19697.35745 19697.65001 19698.29277 19699.7049 ] ...


In [None]:
def plot_outlier_analysis(data, column_name, output_dir="outlier_analysis"):
    """Create visualizations for outlier analysis"""
    stats_dict = get_basic_stats(data)
    z_outliers = find_zscore_outliers(data)
    
    plt.figure(figsize=(12, 10))
    
    # Histogram with KDE
    plt.subplot(2, 2, 1)
    sns.histplot(data, kde=True)
    plt.title(f"{column_name} Distribution")
    plt.axvline(stats_dict['mean'], color='r', linestyle='--', 
               label=f"Mean: {stats_dict['mean']:.4f}")
    plt.axvline(stats_dict['median'], color='g', linestyle='--', 
               label=f"Median: {stats_dict['median']:.4f}")
    plt.axvline(stats_dict['mean'] + 3*stats_dict['std'], color='orange', 
               linestyle=':', label='+3σ')
    plt.axvline(stats_dict['mean'] - 3*stats_dict['std'], color='orange', 
               linestyle=':', label='-3σ')
    plt.legend()
    
    # Box plot
    plt.subplot(2, 2, 2)
    sns.boxplot(y=data)
    plt.title(f"{column_name} Box Plot")
    
    # Q-Q plot
    plt.subplot(2, 2, 3)
    stats.probplot(data, plot=plt)
    plt.title(f"{column_name} Q-Q Plot")
    
    # Highlighted outliers
    plt.subplot(2, 2, 4)
    plt.scatter(range(len(data)), data, s=2, alpha=0.5, label='Data')
    
    if len(z_outliers) > 0:
        plt.scatter(z_outliers, data[z_outliers], 
                  color='red', s=10, alpha=0.7, label='Z-score Outliers')
    
    plt.title(f"{column_name} Data Points with Outliers")
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(f"{output_dir}/{column_name}_outlier_analysis.png")
    plt.close()

# Example of how to use this section:
if __name__ == "__main__":
    # Example: Visualize the first input parameter
    column_index = 0  # Change this to visualize different columns
    column_name = input_names[column_index]
    column_data = inputs[:, column_index]
    
    plot_outlier_analysis(column_data, column_name, "input_outlier_analysis")

In [17]:
def analyze_luminosity(luminosity_data):
    """Special tests for luminosity outliers"""
    print(f"\n{'='*50}")
    print("In-depth Luminosity Analysis")
    print(f"{'='*50}")
    
    # Check for physically impossible values (negative luminosity)
    neg_lum = np.where(luminosity_data < 0)[0]
    print(f"Negative luminosity values: {len(neg_lum)}")
    if len(neg_lum) > 0:
        print(f"Negative luminosity indices: {neg_lum[:5]} ...")
        print(f"Negative luminosity values: {luminosity_data[neg_lum[:5]]} ...")
    
    # Check for extreme luminosity values (e.g., > 10^6 L☉ for normal stars)
    extreme_high = np.where(luminosity_data > 1e6)[0]
    print(f"\nExtremely high luminosity values (>10^6): {len(extreme_high)}")
    if len(extreme_high) > 0:
        print(f"High luminosity indices: {extreme_high[:5]} ...")
        print(f"High luminosity values: {luminosity_data[extreme_high[:5]]} ...")
    
    # Get positive values for log transform
    positive_lum = luminosity_data[luminosity_data > 0]
    
    # Log transform for better visualization
    if len(positive_lum) > 0:
        log_lum = np.log10(positive_lum)
        
        # Calculate statistics on log scale
        log_mean = np.mean(log_lum)
        log_std = np.std(log_lum)
        log_outliers = np.where(np.abs(log_lum - log_mean) > 3 * log_std)[0]
        
        print(f"\nLog-scale outliers: {len(log_outliers)} points")
        print(f"Log-scale mean: {log_mean:.4f}")
        print(f"Log-scale std dev: {log_std:.4f}")
        
        # Plot log-transformed luminosity
        plt.figure(figsize=(10, 6))
        sns.histplot(log_lum, kde=True)
        plt.title("Log10(Luminosity) Distribution")
        plt.xlabel("Log10(L/L☉)")
        plt.savefig("luminosity_analysis/log_luminosity_distribution.png")
        plt.close()
    else:
        print("\nWarning: No positive luminosity values for log transform")
        log_outliers = []
    
    return {
        'negative_count': len(neg_lum),
        'extreme_high_count': len(extreme_high),
        'log_outliers_count': len(log_outliers) if 'log_outliers' in locals() else 0
    }

# Example of how to use this section:
if __name__ == "__main__":
    # Find luminosity index in your output columns
    luminosity_index = output_names.index('luminosity')
    luminosity_data = outputs[:, luminosity_index]
    
    lum_results = analyze_luminosity(luminosity_data)


In-depth Luminosity Analysis
Negative luminosity values: 0

Extremely high luminosity values (>10^6): 0

Log-scale outliers: 28631 points
Log-scale mean: 1.6738
Log-scale std dev: 0.3763


In [None]:
def plot_correlations_with_luminosity(inputs, outputs, input_names, output_names):
    """Create correlation plots between luminosity and other parameters"""
    # Find luminosity index
    luminosity_index = output_names.index('luminosity')
    luminosity_data = outputs[:, luminosity_index]
    
    # Calculate z-scores for luminosity to highlight outliers
    z_scores = stats.zscore(luminosity_data)
    outliers = np.abs(z_scores) > 3
    
    # Correlations with inputs
    for i, name in enumerate(input_names):
        plt.figure(figsize=(10, 6))
        plt.scatter(inputs[:, i], luminosity_data, s=3, alpha=0.5, label='Normal')
        if np.any(outliers):
            plt.scatter(inputs[outliers, i], luminosity_data[outliers], 
                      color='red', s=10, alpha=0.7, label='Luminosity Outliers')
        plt.xlabel(name)
        plt.ylabel('Luminosity')
        plt.title(f'{name} vs Luminosity')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"correlation_analysis/{name}_vs_luminosity.png")
        plt.close()
    
    # Correlations with other outputs
    for i, name in enumerate(output_names):
        if name == 'luminosity':
            continue
        plt.figure(figsize=(10, 6))
        plt.scatter(outputs[:, i], luminosity_data, s=3, alpha=0.5, label='Normal')
        if np.any(outliers):
            plt.scatter(outputs[outliers, i], luminosity_data[outliers], 
                      color='red', s=10, alpha=0.7, label='Luminosity Outliers')
        plt.xlabel(name)
        plt.ylabel('Luminosity')
        plt.title(f'{name} vs Luminosity')
        plt.legend()
        plt.tight_layout() 
        plt.savefig(f"correlation_analysis/{name}_vs_luminosity.png")
        plt.close()

# Example of how to use this section:
if __name__ == "__main__":
    # Uncomment to run this section
    plot_correlations_with_luminosity(inputs, outputs, input_names, output_names)

  plt.tight_layout()
  plt.savefig(f"correlation_analysis/{name}_vs_luminosity.png")


KeyboardInterrupt: 

In [5]:
# Find the luminosity index in your output columns
luminosity_index = output_names.index('luminosity')

# Extract the luminosity data
luminosity_data = outputs[:, luminosity_index]

# Find the maximum value
max_luminosity = np.max(luminosity_data)
print(f"Maximum luminosity value: {max_luminosity}")

# Find the index of the maximum value
max_index = np.argmax(luminosity_data)
print(f"Index of maximum luminosity: {max_index}")

# If you want to find the corresponding track and point
# You'll need to determine which track this belongs to
# This requires mapping back to the original data structure

Maximum luminosity value: 250.81987776911078
Index of maximum luminosity: 5071885
