In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [2]:
# Paths (relative ones resolve against the current working directory)

# Input: CSV file read by the load step.
CSV_PATH = "../data/encoded_data.csv"

# Outputs: the scaled train/val arrays and the fitted scalers + column metadata.
OUT_SPLITS = "data_splits.npz"
OUT_SCALERS = "scalers.joblib"

In [3]:
# Read the whole CSV at CSV_PATH into a single DataFrame; every later cell
# derives its columns from `df`.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

In [4]:
# Column roles
# - prop_col:  the single supervised target column.
# - cond_cols: conditioning variable(s); kept as a list so that selecting with
#              it yields a 2-D frame and more columns can be appended later.
prop_col = "Yield_Strength"
cond_cols = ["Testing_Temp"]

In [5]:
# Feature columns: keep a column unless it is a conditioning variable, the
# property target, or its name ends in "_pct" (percentage columns).
# Original column order is preserved.
feature_cols = [
    name
    for name in df.columns
    if not (name in (*cond_cols, prop_col) or name.endswith("_pct"))
]

In [6]:
# Slice the frame into three independent 2-D pieces (copies, so later
# imputation never writes back into `df`):
#   X      -> model input features
#   y_cond -> conditioning variable(s)
#   y_prop -> supervised property target (one column, still a DataFrame)
X = df.loc[:, feature_cols].copy()
y_cond = df.loc[:, cond_cols].copy()
y_prop = df.loc[:, [prop_col]].copy()

In [7]:
# Handle missing values
# Impute with per-column medians estimated on the TRAINING rows only, so that
# no validation-set statistics leak into preprocessing.
#
# The row partition is recovered by splitting row positions with the same
# test_size / random_state used by the train/val split cell below:
# train_test_split's shuffle depends only on the number of rows and the seed,
# so both calls select identical rows. Keep the two sets of arguments in sync.
_train_rows, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

X = X.fillna(X.iloc[_train_rows].median(numeric_only=True))
y_cond = y_cond.fillna(y_cond.iloc[_train_rows].median(numeric_only=True))
# NOTE(review): median-filling the supervised target also rewrites validation
# labels; dropping rows with a missing target may be preferable — confirm intent.
y_prop = y_prop.fillna(y_prop.iloc[_train_rows].median(numeric_only=True))

In [8]:
# Train/val split: one shuffled 80/20 partition applied consistently to the
# features, the conditioning variable(s) and the target. The fixed seed makes
# the partition reproducible across runs.
(
    X_train, X_val,
    y_cond_train, y_cond_val,
    y_prop_train, y_prop_val,
) = train_test_split(
    *(frame.values for frame in (X, y_cond, y_prop)),
    test_size=0.2,
    random_state=42,
)

In [9]:
# Scaling: every scaler is fitted on the training split only and then applied
# unchanged to both splits.
x_scaler = StandardScaler().fit(X_train)            # features: standardized
y_cond_scaler = StandardScaler().fit(y_cond_train)  # conditioning kept in scaled units
y_prop_scaler = MinMaxScaler().fit(y_prop_train)    # target: [0, 1] over the training range

# Features
X_train_s = x_scaler.transform(X_train)
X_val_s = x_scaler.transform(X_val)

# Conditioning variable(s)
y_cond_train_s = y_cond_scaler.transform(y_cond_train)
y_cond_val_s = y_cond_scaler.transform(y_cond_val)

# Property target
y_prop_train_s = y_prop_scaler.transform(y_prop_train)
y_prop_val_s = y_prop_scaler.transform(y_prop_val)

In [10]:
# Persist preprocessing artifacts.

# 1) Fitted scalers plus the column metadata, bundled into one joblib file.
scaler_bundle = {
    "x_scaler": x_scaler,
    "y_cond_scaler": y_cond_scaler,
    "y_prop_scaler": y_prop_scaler,
    "feature_cols": feature_cols,
    "cond_cols": cond_cols,
    "prop_col": prop_col,
}
joblib.dump(scaler_bundle, OUT_SCALERS)

# 2) Scaled train/val arrays, stored under their names in a single .npz archive.
split_arrays = {
    "X_train": X_train_s,
    "X_val": X_val_s,
    "y_cond_train": y_cond_train_s,
    "y_cond_val": y_cond_val_s,
    "y_prop_train": y_prop_train_s,
    "y_prop_val": y_prop_val_s,
}
np.savez(OUT_SPLITS, **split_arrays)

# Short summary: output file and the column count of each array group.
print(f"Saved splits → {OUT_SPLITS}")
print(f"Feature dims: X={X_train_s.shape[1]}, y_cond={y_cond_train_s.shape[1]}, y_prop={y_prop_train_s.shape[1]}")


Saved splits → data_splits.npz
Feature dims: X=20, y_cond=1, y_prop=1
