# Particle Packing Surrogate Model

Here, we train a surrogate model for the particle packing simulations. We capture the
presence of failed simulations, the packing fractions for two different algorithms, and
the corresponding runtimes.

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold
import pandas as pd
import numpy as np
import joblib
from os import path
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType


In [11]:
from pathlib import Path

# Set to True for a quick dummy run; model outputs then go to a "dummy" subfolder.
dummy = False

task_name = "particle_packing"

# All locations are relative paths, two levels above the working directory.
repo_root = path.join("..", "..")
data_dir = path.join(repo_root, "data", "processed", task_name)
model_dir = path.join(repo_root, "models", task_name)
if dummy:
    model_dir = path.join(model_dir, "dummy")
cv_model_dir = path.join(model_dir, "cv")

# Creating cv_model_dir with parents=True already creates model_dir; both are listed
# so the intent stays explicit.
for output_dir in (model_dir, cv_model_dir):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

## Import Data

In [12]:
# The CSVs always live in the original processed-data folder. A previous dummy-mode run
# of this cell rebinds data_dir to ".../dummy", so strip that suffix before reading;
# this keeps the cell safe to re-run.
source_data_dir = (
    path.dirname(data_dir) if path.basename(data_dir) == "dummy" else data_dir
)

sobol_filter = pd.read_csv(path.join(source_data_dir, "sobol_probability_filter.csv"))
sobol_reg = pd.read_csv(path.join(source_data_dir, "sobol_regression.csv"))

if dummy:
    # Dummy mode: keep only the first 100 rows of each table and redirect everything
    # later written under data_dir to a separate "dummy" subfolder.
    data_dir = path.join(source_data_dir, "dummy")
    sobol_filter = sobol_filter.head(100)
    sobol_reg = sobol_reg.head(100)
else:
    data_dir = source_data_dir

Path(data_dir).mkdir(exist_ok=True, parents=True)

## Define the grouped cross-validation helper (MAE scoring)

In [13]:
# group kfold split for cv; addressing data leakage by using groups
def rfr_group_mae(
    X_array, y_array, group_array, model_name_stem, random_state=13, n_splits=5
):
    """Cross-validate a random forest regressor with grouped folds.

    For every fold a fresh ``RandomForestRegressor`` is fit on the training rows,
    scored on the held-out rows with mean absolute error, and saved to disk.

    Parameters
    ----------
    X_array : array-like
        Feature matrix, one row per sample. Converted with ``np.asarray`` so that
        positional row indexing works for DataFrames and lists as well.
    y_array : array-like
        Target values, one per row of ``X_array``.
    group_array : array-like
        Group label for each row; passed to ``GroupKFold.split`` as the groups.
    model_name_stem : str
        Path prefix for the saved models; fold ``i`` is written to
        ``f"{model_name_stem}_{i}.pkl"``.
    random_state : int, default 13
        Seed passed to every ``RandomForestRegressor``.
    n_splits : int, default 5
        Number of folds. Other cells in this notebook load folds 0-4, so update
        them as well if this is changed.

    Returns
    -------
    dict
        ``{"mae": [...], "y_pred": [...], "y_true": [...]}`` with one entry per fold;
        predictions and true values are plain Python lists.

    Side effects: writes one compressed ``.pkl`` file per fold and prints the mean and
    standard deviation of the fold MAEs.
    """
    # Fancy indexing with the fold indices below needs ndarray semantics.
    X_array = np.asarray(X_array)
    y_array = np.asarray(y_array)

    kf = GroupKFold(n_splits=n_splits)
    mae_scores = []
    y_preds = []
    y_trues = []
    for i, (train_index, test_index) in enumerate(
        kf.split(X_array, y_array, group_array)
    ):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        # plain lists so the results can be dumped to JSON later
        y_test = y_test.tolist()

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        # save model as .pkl with compression
        # https://stackoverflow.com/a/47062881/13697228
        joblib.dump(model, f"{model_name_stem}_{i}.pkl", compress=7)
    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)
    print(f"MAE for {path.basename(model_name_stem)}: {avg_mae:.4f} +/- {std_mae:.4f}")
    results = {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}
    return results


## Features

In [14]:
# Input columns shared by every surrogate model.
# fmt: off
common_features = [
    "mu1", "mu2", "mu3",
    "std1", "std2", "std3",
    "comp1", "comp2", "comp3",
    "num_particles", "safety_factor",
]
# fmt: on

# The probability-filter models use the shared inputs as-is (both names refer to the
# same list object as common_features).
fba_isna_prob_features = ls_isna_prob_features = common_features

# The regression models each add one rank column on top of the shared inputs.
fba_features = [*common_features, "fba_rank"]
ls_features = [*common_features, "ls_rank"]
fba_time_s_features = [*common_features, "fba_time_s_rank"]
ls_time_s_features = [*common_features, "ls_time_s_rank"]


## Probability Filter
### fba

In [15]:
# Create groups for the probability filter: each row's common features, rounded to
# 6 decimals and joined with "_", form its group label, so rows with identical
# (rounded) features share a group.
sobol_filter_group = sobol_filter.loc[:, common_features].round(6).apply(
    lambda feature_row: "_".join(feature_row.to_numpy().astype(str)),
    axis="columns",
)

In [16]:
## Grouped cross-validation of the fba probability-filter model

## Inputs: the columns listed in fba_isna_prob_features (the shared common features)
## Label: fba_isna_prob
X_array_fba_isna_prob = sobol_filter.loc[:, fba_isna_prob_features].to_numpy()
y_array_fba_isna_prob = sobol_filter["fba_isna_prob"].to_numpy()

## One model per fold is written to f"{fba_isna_model_stem}_{i}.pkl"
fba_isna_model_stem = path.join(cv_model_dir, "spf_fba_isna_prob")
fba_isna_results = rfr_group_mae(
    X_array=X_array_fba_isna_prob,
    y_array=y_array_fba_isna_prob,
    group_array=sobol_filter_group,
    model_name_stem=fba_isna_model_stem,
)


MAE for spf_fba_isna_prob: 0.0430 +/- 0.0003


Test loading the pickled cross-validation models and predicting on the first five rows.

In [17]:
# Smoke test: every saved fold model must load and predict on a handful of rows.
test_data = X_array_fba_isna_prob[:5]
fold_model_files = [f"{fba_isna_model_stem}_{fold}.pkl" for fold in range(5)]
for i, fold_model_file in enumerate(fold_model_files):
    model = joblib.load(fold_model_file)
    print(f"{i}: {model.predict(test_data)}")


0: [0.00382576 0.3872015  0.02611088 0.07780826 0.33870147]
1: [0.00738889 0.38749496 0.02029697 0.04099549 0.28201631]
2: [0.00282479 0.38871916 0.02881385 0.18538185 0.36331197]
3: [0.03226923 0.40016508 0.07084854 0.08603914 0.38663271]
4: [0.0085     0.30199426 0.0488271  0.06037933 0.39302256]


### ls

In [18]:
# Inputs and label for the ls probability-filter model.
sobolPF_ls_isna_prob = sobol_filter.loc[:, ls_isna_prob_features]
ls_isna_prob = sobol_filter.loc[:, ["ls_isna_prob"]]

X_array_ls_isna_prob = sobolPF_ls_isna_prob.to_numpy()
# flatten the single-column label frame to a 1-D array
y_array_ls_isna_prob = ls_isna_prob.to_numpy().ravel()

# Grouped cross-validation, reusing the probability-filter groups.
ls_isna_model_stem = path.join(cv_model_dir, "spf_ls_isna_prob")
ls_isna_results = rfr_group_mae(
    X_array=X_array_ls_isna_prob,
    y_array=y_array_ls_isna_prob,
    group_array=sobol_filter_group,
    model_name_stem=ls_isna_model_stem,
)


MAE for spf_ls_isna_prob: 0.0833 +/- 0.0008


## Packing Fraction Models
### fba

In [19]:
## Keep only the rows that have an fba value (NaN rows are dropped)
sobol_reg_fba = sobol_reg.dropna(subset=["fba"])
X_array_fba = sobol_reg_fba.loc[:, fba_features].to_numpy()
y_array_fba = sobol_reg_fba["fba"].to_numpy()

## Groups are built from the common features only (not fba_rank), so rows that share
## the same rounded common features get the same group label
sobol_reg_fba_group = sobol_reg_fba.loc[:, common_features].round(6).apply(
    lambda feature_row: "_".join(feature_row.to_numpy().astype(str)),
    axis="columns",
)

## Grouped cross-validation; one model per fold is saved under fba_model_stem
fba_model_stem = path.join(cv_model_dir, "sobol_reg_fba")
fba_results = rfr_group_mae(
    X_array=X_array_fba,
    y_array=y_array_fba,
    group_array=sobol_reg_fba_group,
    model_name_stem=fba_model_stem,
)


MAE for sobol_reg_fba: 0.0042 +/- 0.0000


### ls

In [20]:
## Keep only the rows that have an ls value (NaN rows are dropped)
sobol_reg_ls = sobol_reg.dropna(subset=["ls"])
X_array_ls = sobol_reg_ls.loc[:, ls_features].to_numpy()
y_array_ls = sobol_reg_ls["ls"].to_numpy()

## Groups are built from the common features only (not ls_rank), so rows that share
## the same rounded common features get the same group label
sobol_reg_ls_group = sobol_reg_ls.loc[:, common_features].round(6).apply(
    lambda feature_row: "_".join(feature_row.to_numpy().astype(str)),
    axis="columns",
)

## Grouped cross-validation; ls_model_path is the filename stem of the per-fold models
ls_model_path = path.join(cv_model_dir, "sobol_reg_ls")
ls_results = rfr_group_mae(
    X_array=X_array_ls,
    y_array=y_array_ls,
    group_array=sobol_reg_ls_group,
    model_name_stem=ls_model_path,
)


MAE for sobol_reg_ls: 0.0081 +/- 0.0001


## Runtime Models
No NaNs in the time values.
### fba_time_s

In [21]:
## Feature frame for the fba runtime model; also the source of the group labels
fba_time_s_df = sobol_reg.loc[:, fba_time_s_features]
X_array_fba_time_s = fba_time_s_df.to_numpy()

## Label: fba_time_s, flattened from a single-column frame to a 1-D array
fba_time_s = sobol_reg.loc[:, ["fba_time_s"]]
y_array_fba_time_s = fba_time_s.to_numpy().ravel()

## Groups for the fba_time_s GroupKFold split, built from the common features only
sobol_reg_fba_time_s_group = fba_time_s_df.loc[:, common_features].round(6).apply(
    lambda feature_row: "_".join(feature_row.to_numpy().astype(str)),
    axis="columns",
)

fba_time_s_model_stem = path.join(cv_model_dir, "sobol_reg_fba_time_s")
fba_time_s_results = rfr_group_mae(
    X_array=X_array_fba_time_s,
    y_array=y_array_fba_time_s,
    group_array=sobol_reg_fba_time_s_group,
    model_name_stem=fba_time_s_model_stem,
)


MAE for sobol_reg_fba_time_s: 0.0366 +/- 0.0006


### ls_time_s

In [22]:
## Feature frame for the ls runtime model; also the source of the group labels
ls_time_s_df = sobol_reg.loc[:, ls_time_s_features]
X_array_ls_time_s = ls_time_s_df.to_numpy()

## Label: ls_time_s, flattened from a single-column frame to a 1-D array
ls_time_s = sobol_reg.loc[:, ["ls_time_s"]]
y_array_ls_time_s = ls_time_s.to_numpy().ravel()

## Groups for the ls_time_s GroupKFold split, built from the common features only
sobol_reg_ls_time_s_group = ls_time_s_df.loc[:, common_features].round(6).apply(
    lambda feature_row: "_".join(feature_row.to_numpy().astype(str)),
    axis="columns",
)

ls_time_s_model_stem = path.join(cv_model_dir, "sobol_reg_ls_time_s")
ls_time_s_results = rfr_group_mae(
    X_array=X_array_ls_time_s,
    y_array=y_array_ls_time_s,
    group_array=sobol_reg_ls_time_s_group,
    model_name_stem=ls_time_s_model_stem,
)


MAE for sobol_reg_ls_time_s: 44.0490 +/- 2.2337


In [23]:
# Gather the cross-validation results of all six models (per-fold MAE, predictions and
# true values) and write them as JSON to data_dir/model_metadata.json.
main_results = dict(
    fba_isna_prob=fba_isna_results,
    ls_isna_prob=ls_isna_results,
    fba=fba_results,
    ls=ls_results,
    fba_time_s=fba_time_s_results,
    ls_time_s=ls_time_s_results,
)
metadata_path = path.join(data_dir, "model_metadata.json")
with open(metadata_path, "w") as f:
    json.dump(main_results, f)


In [24]:
# Filename stem of the per-fold models for each target.
model_paths = dict(
    fba_isna_prob=fba_isna_model_stem,
    ls_isna_prob=ls_isna_model_stem,
    fba=fba_model_stem,
    ls=ls_model_path,
    fba_time_s=fba_time_s_model_stem,
    ls_time_s=ls_time_s_model_stem,
)

# For each of the 5 folds, reload that fold's six models and bundle them into a single
# compressed pickle holding a {target name: model} dictionary.
for i in range(5):
    models = {
        key: joblib.load(f"{model_path}_{i}.pkl")
        for key, model_path in model_paths.items()
    }

    bundle_file = path.join(cv_model_dir, f"cross_validation_models_{i}.pkl")
    with open(bundle_file, "wb") as f:
        joblib.dump(models, f, compress=7)


## Production models (full training data)
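The resulting dictionary has six keys, one per target label (`fba_isna_prob`, `ls_isna_prob`, `fba`, `ls`, `fba_time_s`, `ls_time_s`); the value for each key is the model trained on the full data for that label.
The dictionary is stored in the models folder as the pickle file `surrogate_models.pkl`.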

In [25]:
def train_and_save(
    spf_feat_array,
    sr_feat_array,
    spf_labels_array,
    sr_labels_array,
    spf_label_names,
    sr_label_names,
    random_state=13,
):
    """Fit one random forest regressor per target on all of the supplied data.

    Despite the name, this function does not write anything to disk; it returns the
    fitted models and leaves saving to the caller.

    Parameters
    ----------
    spf_feat_array, spf_labels_array, spf_label_names : iterable
        Parallel sequences for the first set of models: feature arrays, label arrays
        and the dictionary key under which each fitted model is stored.
    sr_feat_array, sr_labels_array, sr_label_names : iterable
        The same three parallel sequences for the second set of models.
    random_state : int, default 13
        Seed passed to every ``RandomForestRegressor``.

    Returns
    -------
    dict
        Maps each label name to its fitted ``RandomForestRegressor``.

    Raises
    ------
    ValueError
        If the three sequences of a set differ in length (``zip`` would otherwise
        silently drop the extras), or if a label name occurs more than once (the
        later model would otherwise silently replace the earlier one).
    """
    # (number used in the log line, tag used in the log line, features, labels, names)
    model_sets = [
        (1, "spf", list(spf_feat_array), list(spf_labels_array), list(spf_label_names)),
        (2, "sr", list(sr_feat_array), list(sr_labels_array), list(sr_label_names)),
    ]

    # Validate everything up front so no training time is spent on bad input.
    seen_names = set()
    for _, tag, feats, labels, names in model_sets:
        if not (len(feats) == len(labels) == len(names)):
            raise ValueError(
                f"{tag}: got {len(feats)} feature arrays, {len(labels)} label arrays "
                f"and {len(names)} label names; these must match"
            )
        for name in names:
            if name in seen_names:
                raise ValueError(f"duplicate label name: {name!r}")
            seen_names.add(name)

    models = {}
    for number, tag, feats, labels, names in model_sets:
        for X, y, name in zip(feats, labels, names):
            print(f"X{number} {tag} shape: {X.shape}, Y{number} {tag} shape: {y.shape}")
            model = RandomForestRegressor(random_state=random_state)
            model.fit(X, y)
            models[name] = model

    return models


In [26]:
# Probability-filter models: target names with their feature and label arrays
sobol_filter_target_names = ["fba_isna_prob", "ls_isna_prob"]
sobol_prob_filter_arrays = [X_array_fba_isna_prob, X_array_ls_isna_prob]
sobol_prob_filter_labels = [y_array_fba_isna_prob, y_array_ls_isna_prob]

# Regression models (packing fractions and runtimes): same three parallel lists
sobol_reg_target_names = ["fba", "ls", "fba_time_s", "ls_time_s"]
sobol_reg_x_arrays = [X_array_fba, X_array_ls, X_array_fba_time_s, X_array_ls_time_s]
sobol_reg_labels = [y_array_fba, y_array_ls, y_array_fba_time_s, y_array_ls_time_s]

# Train every model on all the data ...
models = train_and_save(
    spf_feat_array=sobol_prob_filter_arrays,
    sr_feat_array=sobol_reg_x_arrays,
    spf_labels_array=sobol_prob_filter_labels,
    sr_labels_array=sobol_reg_labels,
    spf_label_names=sobol_filter_target_names,
    sr_label_names=sobol_reg_target_names,
)

# ... and save the {target name: model} dictionary as one compressed pickle
joblib.dump(models, path.join(model_dir, "surrogate_models.pkl"), compress=7)


X1 spf shape: (54784, 11), Y1 spf shape: (54784,)
X1 spf shape: (54784, 11), Y1 spf shape: (54784,)
X2 sr shape: (472857, 12), Y2 sr shape: (472857,)
X2 sr shape: (425513, 12), Y2 sr shape: (425513,)
X2 sr shape: (494498, 12), Y2 sr shape: (494498,)
X2 sr shape: (494498, 12), Y2 sr shape: (494498,)


['..\\..\\models\\particle_packing\\surrogate_models.pkl']