In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [4]:
CSV_PATH = "../data/encoded_data.csv"
df = pd.read_csv(CSV_PATH)

In [5]:
# Drop non-numeric and target columns
drop_cols = [
    "Ref", 
    "Composition", 
    "Equilibrium_Conditions", 
    "Single_Multiphase", 
    "Type_Present_Phases", 
    "Tension_Compression", 
    "Yield_Strength"  # target column
]

In [6]:
feature_cols = [c for c in df.columns if c not in drop_cols]
target_col = "Yield_Strength"

In [7]:
# Build X and y
X = df[feature_cols].copy()
y = df[target_col].copy()

In [8]:
# Handle missing values
X = X.fillna(X.median(numeric_only=True))
y = y.fillna(y.median())

In [9]:
# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X.values, y.values.reshape(-1,1), test_size=0.2, random_state=42
)

In [10]:
# Scale features and target
x_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X_train_s = x_scaler.fit_transform(X_train)
X_val_s   = x_scaler.transform(X_val)
y_train_s = y_scaler.fit_transform(y_train)
y_val_s   = y_scaler.transform(y_val)

In [11]:
# Save for later
joblib.dump({"x_scaler": x_scaler, "y_scaler": y_scaler, "feature_cols": feature_cols}, "scalers.joblib")
np.savez("data_splits.npz", X_train=X_train_s, X_val=X_val_s, y_train=y_train_s, y_val=y_val_s)

print(f"X shape: {X_train_s.shape}, y shape: {y_train_s.shape}")
print(f"Features used: {feature_cols}")

X shape: (97, 9), y shape: (97, 1)
Features used: ['Density', 'Young_Modulus_ROM', 'Young_Modulus_Exp', 'Testing_Temp', 'Specific_Strength', 'Equilibrium_Conditions_encoded', 'Single_Multiphase_encoded', 'Type_Present_Phases_encoded', 'Tension_Compression_encoded']
