In [15]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import joblib

In [11]:
# Show the kernel's working directory: the relative '../models/...' path used
# later in this notebook is resolved against it.
import os
os.getcwd()

'/Users/ramseyissa/Documents/GitHub/matsci-opt-benchmarks/notebooks'

## Import Data

In [None]:
# Source CSVs for the two Sobol datasets used in this notebook.
url_sobol_filter = "https://zenodo.org/record/7513019/files/sobol_probability_filter.csv"
url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"

# Download both tables, in the same order as the URLs are listed.
sobol_filter, sobol_reg = (
    pd.read_csv(csv_url) for csv_url in (url_sobol_filter, url_sobol_reg)
)



In [6]:
# Split sobol_filter into the "fba_isna_prob" target and its feature frame.
# Both probability columns are removed from the features.
sobolPF_fba_isna_prob = sobol_filter.drop(columns=['ls_isna_prob', 'fba_isna_prob'])

# Keep the target as a one-column DataFrame (not a Series).
fba_isna_prob = sobol_filter[['fba_isna_prob']]

Unnamed: 0,fba_isna_prob
0,0.153846
1,0.000000
2,0.000000
3,0.083333
4,0.076923
...,...
41223,0.100000
41224,0.000000
41225,0.000000
41226,0.000000


## Define a function to compute cross-validated MAE scores

In [14]:
def rfr_mae(X_array, y_array, model_name=None):
    """Return the average MAE of a random forest over 5-fold cross-validation.

    A fresh ``RandomForestRegressor(random_state=13)`` is fitted on each
    training split produced by ``KFold(n_splits=5, shuffle=True,
    random_state=13)`` and scored with ``mean_absolute_error`` on the
    matching held-out split.

    Parameters
    ----------
    X_array : array supporting integer-array indexing (e.g. numpy.ndarray)
        Feature matrix, one row per sample.
    y_array : array supporting integer-array indexing (e.g. numpy.ndarray)
        Target values aligned row-for-row with ``X_array``.
    model_name : str or path-like, optional
        Destination passed to ``joblib.dump``. When given, the model fitted
        on the LAST fold's training split is written there. When omitted
        (``None``) nothing is saved.

    Returns
    -------
    float
        Mean of the five per-fold MAE scores.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=13)
    mae_scores = []
    model = None
    for train_index, test_index in kf.split(X_array):
        X_train, X_test = X_array[train_index], X_array[test_index]
        y_train, y_test = y_array[train_index], y_array[test_index]
        # A new estimator per fold so no state carries over between folds.
        model = RandomForestRegressor(random_state=13)
        model.fit(X_train, y_train)
        mae_scores.append(mean_absolute_error(y_test, model.predict(X_test)))

    # Average once, after every fold has been scored.
    mae = sum(mae_scores) / len(mae_scores)

    # NOTE(review): the persisted model has only seen the last fold's training
    # split, not the full dataset - confirm that is the intended artifact.
    if model_name is not None:
        joblib.dump(model, model_name)
    return mae

## Train/Val on Sobol_Probability_Filter for "fba_isna_prob"

In [12]:
# Build the feature matrix and the 1-D target vector for "fba_isna_prob".
X_array_fba_isna_prob = sobolPF_fba_isna_prob.to_numpy()
y_array_fba_isna_prob = fba_isna_prob.to_numpy().ravel()

# rfr_mae runs the cross-validation and writes the fitted model to this path
# with joblib.dump, so no separate pickle.dump step is needed here (the model
# object is local to rfr_mae and is not visible in this cell).
fba_isna_prob_model_path = '../models/fba_isna_prob.pkl'
fba_isna_prob_mae = rfr_mae(
    X_array_fba_isna_prob, y_array_fba_isna_prob, fba_isna_prob_model_path
)
print("Average MAE for fba_isna_prob", fba_isna_prob_mae)

# To reload the saved model later:
# loaded_model = joblib.load(fba_isna_prob_model_path)



Average MAE for fba_isna_prob 0.04876547325285035


## Train/Val on Sobol_Probability_Filter for "ls_isna_prob"

In [10]:
# Target and features for "ls_isna_prob"; both probability columns are
# removed from the features.
ls_isna_prob = sobol_filter['ls_isna_prob']
sobolPF_ls_isna_prob = sobol_filter.drop(['ls_isna_prob','fba_isna_prob'], axis=1)

X_array_ls = sobolPF_ls_isna_prob.to_numpy()
y_array_ls = ls_isna_prob.to_numpy().ravel()

# rfr_mae takes the output path for the fitted model as its third argument.
ls_isna_prob_model_path = '../models/ls_isna_prob.pkl'
print("Average MAE for ls_isna_prob", rfr_mae(X_array_ls, y_array_ls, ls_isna_prob_model_path))


Average MAE for ls_isna_prob 0.09105841918934479


## Import sobol_regression Data

In [None]:
# read in sobol_regression.csv
url_sobol_reg = "https://zenodo.org/record/7513019/files/sobol_regression.csv"
sobol_reg_raw = pd.read_csv(url_sobol_reg)

# drop rows containing nan values
sobol_reg_clean = sobol_reg_raw.dropna()

# select targets 'fba', 'ls', 'fba_time_s', 'ls_time_s'
# These must be taken from the cleaned frame BEFORE it is narrowed to the
# feature columns below; the feature list does not contain them.
fba = sobol_reg_clean[['fba']]
ls = sobol_reg_clean[['ls']]
fba_time_s = sobol_reg_clean[['fba_time_s']]
ls_time_s = sobol_reg_clean[['ls_time_s']]

# select features
# NOTE(review): the *_rank columns are kept as model inputs exactly as in the
# original selection - confirm they are intended features for every target.
feature_columns = ['mu1_div_mu3', 'mu2_div_mu3', 'std1', 'std2', 'std3', 'comp1', 'comp2', 'num_particles', 'safety_factor', 'fba_rank', 'ls_rank', 'fba_time_s_rank', 'ls_time_s_rank']

# sobol_reg is the feature-only frame consumed by the cells that follow; it
# shares its row index with the target frames above.
sobol_reg = sobol_reg_clean[feature_columns]

## target = fba 

In [None]:
# Feature matrix and 1-D target vector for the "fba" regression target.
X_array_fba = sobol_reg.to_numpy()
y_array_fba = fba.to_numpy().ravel()

# rfr_mae takes the output path for the fitted model as its third argument.
fba_model_path = '../models/fba.pkl'
print("Average MAE for fba", rfr_mae(X_array_fba, y_array_fba, fba_model_path))

## target = ls 

In [None]:
# Feature matrix and 1-D target vector for the "ls" regression target.
X_array_ls_reg = sobol_reg.to_numpy()
y_array_ls_reg = ls.to_numpy().ravel()

# rfr_mae takes the output path for the fitted model as its third argument.
ls_model_path = '../models/ls.pkl'
print("Average MAE for ls", rfr_mae(X_array_ls_reg, y_array_ls_reg, ls_model_path))