In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [2]:
# Paths
CSV_PATH = "../data/encoded_data.csv"  # model-ready numeric CSV (fractions, numeric props, encoded cats)

# Load
df = pd.read_csv(CSV_PATH)

In [3]:
# Target column
target_col = "Yield_Strength"

In [4]:
# Drop ONLY the target column from features
feature_cols = [
    c for c in df.columns 
    if c != target_col and not c.endswith('_pct')
]

In [5]:
# Build X and y
X = df[feature_cols].copy()
y = df[target_col].copy()

In [6]:
# Handle missing values
X = X.fillna(X.median(numeric_only=True))
y = y.fillna(y.median())

In [7]:
# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X.values, y.values.reshape(-1,1), test_size=0.2, random_state=42
)

In [8]:
# Scale features and target
x_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X_train_s = x_scaler.fit_transform(X_train)
X_val_s   = x_scaler.transform(X_val)
y_train_s = y_scaler.fit_transform(y_train)
y_val_s   = y_scaler.transform(y_val)

In [9]:
# Save scalers & metadata
joblib.dump({
    "x_scaler": x_scaler,
    "y_scaler": y_scaler,
    "feature_cols": feature_cols
}, "scalers.joblib")

# Save scaled splits
np.savez("data_splits.npz",
         X_train=X_train_s, X_val=X_val_s,
         y_train=y_train_s, y_val=y_val_s)

print(f"X shape: {X_train_s.shape}, y shape: {y_train_s.shape}")
print(f"Features used: {feature_cols}")

X shape: (97, 21), y shape: (97, 1)
Features used: ['Al_frac', 'Co_frac', 'Cr_frac', 'Hf_frac', 'Mo_frac', 'Nb_frac', 'Si_frac', 'Ta_frac', 'Ti_frac', 'V_frac', 'W_frac', 'Zr_frac', 'Density', 'Young_Modulus_ROM', 'Young_Modulus_Exp', 'Testing_Temp', 'Specific_Strength', 'Equilibrium_Conditions_encoded', 'Single_Multiphase_encoded', 'Type_Present_Phases_encoded', 'Tension_Compression_encoded']
