# Particle Packing Surrogate Model

Here, we train a surrogate model for the particle packing simulations. We capture the
presence of failed simulations, the packing fractions for two different algorithms, and
the corresponding runtimes.

In [2]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import joblib
from os import path, getcwd
import json

# attempted use of skl2onnx to convert to onnx failing due to protobuf error
# https://github.com/onnx/onnx/issues/4469

# from skl2onnx import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

In [3]:
dummy = True

data_dir = path.join("..", "..", "data", "processed", "particle_packing")
model_dir = path.join("..", "..", "models", "particle_packing")

if dummy:
    model_dir = path.join(model_dir, "dummy")

## Import Data

In [4]:
sobol_filter = pd.read_csv(path.join(data_dir, "sobol_probability_filter.csv"))
sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    sobol_filter = sobol_filter.head(100)
    sobol_reg = sobol_reg.head(100)

## define f(x) to calc mae scores 

In [5]:
# argument for rfr_mae, X_array, y_array, model_name to save model as .pkl
def rfr_mae(X_array, y_array, model_name_stem, random_state=13):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mae_scores = []
    y_preds = []
    y_trues = []
    for i, (train_index, test_index) in enumerate(kf.split(X_array)):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        y_test = y_test.tolist()

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test).tolist()

        y_preds.append(y_pred)
        y_trues.append(y_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        # save model as .pkl
        joblib.dump(model, f"{model_name_stem}_{i}.pkl")
    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)
    print(f"MAE for fba_isna_prob: {avg_mae:.4f} +/- {std_mae:.4f}")
    results = {"mae": mae_scores, "y_pred": y_preds, "y_true": y_trues}
    return results

## Features

In [6]:
common_features = [
    "mu1_div_mu3",
    "mu2_div_mu3",
    "std1",
    "std2",
    "std3",
    "comp1",
    "comp2",
    "num_particles",
    "safety_factor",
]

fba_isna_prob_features = common_features
ls_isna_prob_features = common_features
fba_features = common_features + ["fba_rank"]
ls_features = common_features + ["ls_rank"]
fba_time_s_features = common_features + ["fba_time_s_rank"]
ls_time_s_features = common_features + ["ls_time_s_rank"]

## Probability Filter
### Train/Val on Sobol_Probability_Filter for "fba_isna_prob"

In [7]:
# parse data for target "fba_isna_prob"
sobolPF_fba_isna_prob = sobol_filter[fba_isna_prob_features]
fba_isna_prob = sobol_filter[["fba_isna_prob"]]

In [8]:
# Create a KFold cross-validation iterator
X_array_fba_isna_prob = sobolPF_fba_isna_prob.to_numpy()
y_array_fba_isna_prob = fba_isna_prob.to_numpy().ravel()

# This is the trained model on As a function of mu1_div_mu3, mu2_div_mu3, std1, std2,
# std3, comp1, comp2, num_particles, safety_factor

fba_isna_model_stem = path.join(model_dir, "spf_fba_isna_prob")
fba_isna_results = rfr_mae(
    X_array_fba_isna_prob, y_array_fba_isna_prob, fba_isna_model_stem
)

MAE for fba_isna_prob: 0.0726 +/- 0.0192


test loading the pickled model

In [9]:
test_data = X_array_fba_isna_prob[:5]
for i in range(5):
    model = joblib.load(f"{fba_isna_model_stem}_{i}.pkl")
    print(f"{i}: {model.predict(test_data)}")

0: [0.13195144 0.01562704 0.01174009 0.05571678 0.07210101]
1: [0.1350979  0.         0.01711422 0.00272727 0.05094561]
2: [0.11676185 0.00820513 0.02761111 0.055      0.07425952]
3: [0.15478488 0.02769231 0.01432984 0.05675408 0.08220396]
4: [0.12868454 0.01942308 0.05485354 0.05918803 0.07221212]


### Train/Val on Sobol_Probability_Filter for "ls_isna_prob"

In [10]:
sobolPF_ls_isna_prob = sobol_filter[fba_isna_prob_features]
ls_isna_prob = sobol_filter[["ls_isna_prob"]]

X_array_ls_isna = sobolPF_ls_isna_prob.to_numpy()
y_array_ls_isna = ls_isna_prob.to_numpy().ravel()

ls_isna_model_stem = path.join(model_dir, "spf_ls_isna_prob")
ls_isna_results = rfr_mae(
    X_array_ls_isna, y_array_ls_isna, ls_isna_model_stem
)

MAE for fba_isna_prob: 0.1782 +/- 0.0383


## Packing Fraction Models
### fba

In [11]:
sobol_reg_fba = sobol_reg.dropna(subset=["fba"])
X_array_fba = sobol_reg_fba[fba_features].to_numpy()
y_array_fba = sobol_reg_fba["fba"].to_numpy().ravel()

fba_model_stem = path.join(model_dir, "sobol_reg_fba")
fba_results = rfr_mae(X_array_fba, y_array_fba, fba_model_stem)

MAE for fba_isna_prob: 0.0082 +/- 0.0015


### ls

In [12]:
sobol_reg_ls = sobol_reg.dropna(subset=["ls"])
X_array_ls = sobol_reg_ls[ls_features].to_numpy()
y_array_ls = sobol_reg_ls["ls"].to_numpy().ravel()

ls_model_path = path.join(model_dir, "sobol_reg_ls")
ls_results = rfr_mae(X_array_ls, y_array_ls, ls_model_path)

MAE for fba_isna_prob: 0.0152 +/- 0.0029


## Runtime Models
No NaNs in the time values.
### fba_time_s

In [13]:
X_array_ls_time_s = sobol_reg[ls_time_s_features].to_numpy()
fba_time_s = sobol_reg[["fba_time_s"]]
y_array_fba_time_s = fba_time_s.to_numpy().ravel()

fba_time_s_model_stem = path.join(model_dir, "sobol_reg_fba_time_s")
fba_time_s_results = rfr_mae(
    X_array_ls_time_s, y_array_fba_time_s, fba_time_s_model_stem
)

MAE for fba_isna_prob: 0.0435 +/- 0.0152


### ls_time_s

In [14]:
X_array_ls_time_s = sobol_reg[ls_time_s_features].to_numpy()
ls_time_s = sobol_reg[["ls_time_s"]]
y_array_ls_time_s = ls_time_s.to_numpy().ravel()

ls_time_s_model_stem = path.join(model_dir, "sobol_reg_ls_time_s")
ls_time_s_results = rfr_mae(X_array_ls_time_s, y_array_ls_time_s, ls_time_s_model_stem)

MAE for fba_isna_prob: 2.2572 +/- 0.5052


In [15]:
main_results = {
    "fba_isna_prob": fba_isna_results,
    "ls_isna_prob": ls_isna_results,
    "fba": fba_results,
    "ls": ls_results,
    "fba_time_s": fba_time_s_results,
    "ls_time_s": ls_time_s_results,
}
with open(path.join(data_dir, "model_metadata.json"), "w") as f:
    json.dump(main_results, f)

In [16]:
model_paths = {
    "fba_isna_prob": fba_isna_model_stem,
    "ls_isna_prob": ls_isna_model_stem,
    "fba": fba_model_stem,
    "ls": ls_model_path,
    "fba_time_s": fba_time_s_model_stem,
    "ls_time_s": ls_time_s_model_stem,
}
models = {}
for key, model_path in model_paths.items():
    models[key] = [joblib.load(f"{model_path}_{i}.pkl") for i in range(5)]

with open(path.join(data_dir, "cross_validation_models.pkl"), "wb") as f:
    pickle.dump(models, f)

models

{'fba_isna_prob': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'ls_isna_prob': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'fba': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'ls': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13)],
 'fba_time_s': [RandomForestRegressor(random_state=13),
  RandomForestRegressor(random_state=13),
  RandomFore

## Models Using All Data

In [None]:
# run and save 6 models trained on all the data (not cross-validation) as a single .pkl
# file

### code graveyard 

In [None]:
# print("Average MAE for fba_isna_prob",rfr_mae(X_array_fba_isna_prob, y_array_fba_isna_prob,'fba_isna_prob.pkl'))

# load trained model
# loaded_model = joblib.load('fba_isna_prob_model.pkl')

# Save the model
# with open('../models/fba_isna_prob.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # Load the model
# with open('path/to/save/model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

In [None]:
# url_sobol_filter = "https://zenodo.org/record/7513019/files/sobol_probability_filter.csv"
# sobol_filter = pd.read_csv(url_sobol_filter)

# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"
# sobol_reg = pd.read_csv(url_sobol_reg)

In [None]:
# os.getcwd()
# os.chdir("../data/raw")

# sobol_filter.to_csv('sobol_filter.csv', index=False)

# sobol_reg.to_csv('sobol_reg.csv', index=False)

In [None]:
# read in sobol_regression.csv
# url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"

In [None]:
# sobol_reg_x = sobol_reg[
#     [
#         "mu1_div_mu3",
#         "mu2_div_mu3",
#         "std1",
#         "std2",
#         "std3",
#         "comp1",
#         "comp2",
#         "num_particles",
#         "safety_factor",
#         "fba_rank",
#         "ls_rank",
#         "fba_time_s_rank",
#         "ls_time_s_rank",
#     ]
# ]

In [None]:
# print(len(sobol_reg_x))
# print(len(fba))

In [None]:
# print(
#     "Average MAE for ls_time_s",
#     rfr_mae(X_array_fba_time_s, y_array_ls_time_s, "sobol_reg_ls_time_s.pkl"),
# )

In [None]:
# # parse data for target "fba_isna_prob"
# fba_isna_prob = sobol_filter["fba_isna_prob"]
# sobolPF_fba_isna_prob = sobol_filter.drop(["ls_isna_prob", "fba_isna_prob"], axis=1)
# fba_isna_prob = fba_isna_prob.to_frame()