# Data Preparation

In [1]:
import pandas as pd
# Location of the raw Sobol export, built from the database/collection names.
# NOTE(review): the cells below index `lsgd`, `fba`, `ls`, ... and write to
# `particle_packing` folders, which does not look like a "crabnet-hyperparameter"
# export -- confirm this is the intended source file.
database_name = "crabnet-hyperparameter"
collection_name = "sobol"
csv_path = f"../../data/external/{database_name}-{collection_name}.csv"
df = pd.read_csv(csv_path)

In [3]:
# Summarize the raw frame: column names, non-null counts and dtypes.
# NOTE(review): this cell's execution count (3) is out of sequence with its
# neighbours, so the recorded output may be stale -- re-run from a fresh kernel.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _id                305 non-null    object 
 1   N                  305 non-null    int64  
 2   alpha              305 non-null    float64
 3   d_model            305 non-null    int64  
 4   dim_feedforward    305 non-null    int64  
 5   dropout            305 non-null    float64
 6   emb_scaler         305 non-null    float64
 7   eps                305 non-null    float64
 8   epochs_step        305 non-null    int64  
 9   fudge              305 non-null    float64
 10  heads              305 non-null    int64  
 11  k                  305 non-null    int64  
 12  lr                 305 non-null    float64
 13  pe_resolution      305 non-null    int64  
 14  ple_resolution     305 non-null    int64  
 15  pos_scaler         305 non-null    float64
 16  weight_decay       305 non

In [2]:
# A few rows carry an `lsgd` result; keep only the rows where it is missing and
# then discard the two lsgd-specific columns, which are no longer needed.
non_lsgd_df = df.loc[df["lsgd"].isna()].drop(columns=["lsgd", "lsgd_time_s"])
non_lsgd_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 688667 entries, 1 to 688692
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   _id            688667 non-null  object 
 1   mu1_div_mu3    688667 non-null  float64
 2   mu2_div_mu3    688667 non-null  float64
 3   std1           688667 non-null  float64
 4   std2           688667 non-null  float64
 5   std3           688667 non-null  float64
 6   comp1          688667 non-null  float64
 7   comp2          688667 non-null  float64
 8   num_particles  688667 non-null  int64  
 9   fba            641657 non-null  float64
 10  ls             531177 non-null  float64
 11  session_id     688667 non-null  object 
 12  timestamp      688667 non-null  float64
 13  date           688667 non-null  object 
 14  fba_time_s     688464 non-null  float64
 15  ls_time_s      688464 non-null  float64
 16  safety_factor  688277 non-null  float64
 17  runtime        438381 non-nul

In [3]:
# Keep only the rows that have a recorded `seed` value.
seeded_df = non_lsgd_df.loc[non_lsgd_df["seed"].notna()]
seeded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438381 entries, 131337 to 688692
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   _id            438381 non-null  object 
 1   mu1_div_mu3    438381 non-null  float64
 2   mu2_div_mu3    438381 non-null  float64
 3   std1           438381 non-null  float64
 4   std2           438381 non-null  float64
 5   std3           438381 non-null  float64
 6   comp1          438381 non-null  float64
 7   comp2          438381 non-null  float64
 8   num_particles  438381 non-null  int64  
 9   fba            408512 non-null  float64
 10  ls             338473 non-null  float64
 11  session_id     438381 non-null  object 
 12  timestamp      438381 non-null  float64
 13  date           438381 non-null  object 
 14  fba_time_s     438381 non-null  float64
 15  ls_time_s      438381 non-null  float64
 16  safety_factor  438381 non-null  float64
 17  runtime        438381 no

In [4]:
# Restrict the data to the rows recorded under one specific session id.
session_filter = "session_id == '20a0e4c9-3752-4489-aa5f-8471294eaa32'"
session_df = seeded_df.query(session_filter)

In [5]:
# Columns written to the interim Sobol dataset, in export order.
sobol_columns = [
    # bookkeeping
    "_id",
    "timestamp",
    "session_id",
    "seed",
    # simulation parameters
    "mu1_div_mu3",
    "mu2_div_mu3",
    "std1",
    "std2",
    "std3",
    "comp1",
    "comp2",
    "num_particles",
    "safety_factor",
    # results and their timings
    "fba",
    "ls",
    "fba_time_s",
    "ls_time_s",
    # the "runtime" column is not exported (left commented out)
]
data_df = session_df[sobol_columns]
data_df.to_csv("../../data/interim/particle_packing/sobol_data.csv", index=False)
data_df.describe()

Unnamed: 0,timestamp,seed,mu1_div_mu3,mu2_div_mu3,std1,std2,std3,comp1,comp2,num_particles,safety_factor,fba,ls,fba_time_s,ls_time_s
count,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,408504.0,338467.0,438371.0,438371.0
mean,1671326000.0,10.0,1.002737,1.004056,0.397969,0.689786,0.568981,0.334133,0.333074,554.123455,1.75148,0.591582,0.712522,0.164702,54.800375
std,54145.41,0.0,0.381763,0.382134,0.212243,0.215117,0.259101,0.232016,0.232571,260.40096,0.434925,0.071359,0.027891,0.261284,1430.529681
min,1671245000.0,10.0,0.333334,0.333349,0.100012,0.104506,0.100002,7e-06,9e-06,100.0,1.000047,0.388068,0.565447,0.009212,0.004444
25%,1671276000.0,10.0,0.678216,0.67797,0.218908,0.535812,0.347242,0.137148,0.138687,330.0,1.372535,0.525877,0.692809,0.042939,4.101989
50%,1671325000.0,10.0,1.001174,1.001963,0.358677,0.723206,0.581159,0.30012,0.293431,555.0,1.750803,0.606182,0.712668,0.089258,9.58036
75%,1671368000.0,10.0,1.332146,1.337341,0.546662,0.87239,0.793985,0.502477,0.494366,780.0,2.133496,0.657331,0.731178,0.168403,17.776289
max,1671486000.0,10.0,1.666616,1.666635,0.998315,0.999999,0.999995,0.99614,0.992854,1000.0,2.5,0.721711,0.856982,9.806562,70077.269537


In [6]:
# Columns that identify one parameter set; rows that agree on all of them after
# rounding to 6 decimals are treated as repeats of the same simulation.
by_lbls = [
    "mu1_div_mu3",
    "mu2_div_mu3",
    "std1",
    "std2",
    "std3",
    "comp1",
    "comp2",
    "num_particles",
    "safety_factor",
]

# Build a narrow frame holding only the (rounded) grouping keys plus one boolean
# flag per method that is True when that method's result is NaN (a failure).
# Aggregating the whole frame instead would also "sum" the string id columns and
# would make the flattened column names depend on how pandas combines
# `as_index=False` with list aggregations.
filter_df = data_df[by_lbls].round(decimals=6).assign(
    fba_isna=data_df["fba"].isna(),
    ls_isna=data_df["ls"].isna(),
)

# The mean of a boolean flag within a group is (#failures / #rows in the group),
# i.e. the empirical failure probability of that parameter set. This matches
# dividing the flag sum by the per-group `_id` count as long as `_id` has no
# missing values.
filter_df = (
    filter_df.groupby(by=by_lbls)[["fba_isna", "ls_isna"]]
    .mean()
    .rename(columns={"fba_isna": "fba_isna_prob", "ls_isna": "ls_isna_prob"})
    .reset_index()  # bring the grouping keys back as ordinary columns
)

filter_df.to_csv(
    "../../data/processed/particle_packing/sobol_probability_filter.csv", index=False
)
filter_df.describe()

Unnamed: 0,mu1_div_mu3,mu2_div_mu3,std1,std2,std3,comp1,comp2,num_particles,safety_factor,fba_isna_prob,ls_isna_prob
count,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0,41228.0
mean,1.003,1.003244,0.398117,0.689824,0.568839,0.334435,0.332847,554.380955,1.751886,0.068009,0.227894
std,0.381764,0.382115,0.212424,0.215219,0.259107,0.232119,0.232645,260.380738,0.435069,0.12875,0.253611
min,0.333334,0.333349,0.100012,0.104506,0.100002,7e-06,9e-06,100.0,1.000047,0.0,0.0
25%,0.67842,0.677384,0.218849,0.5358,0.34695,0.137397,0.138418,330.0,1.372897,0.0,0.0
50%,1.001143,1.000164,0.358699,0.72333,0.581099,0.300304,0.293119,556.0,1.751121,0.0,0.142857
75%,1.332553,1.336608,0.54695,0.872513,0.793886,0.502822,0.493794,780.0,2.134084,0.090909,0.375
max,1.666616,1.666635,0.998315,0.999999,0.999995,0.99614,0.992854,1000.0,2.5,1.0,1.0


In [64]:
import plotly.express as px

# Overlaid, probability-normalized histograms of the two per-group failure
# probabilities.
fig = px.histogram(
    filter_df[["fba_isna_prob", "ls_isna_prob"]],
    nbins=50,
    histnorm="probability",
    barmode="overlay",
    opacity=0.5,
    height=400,
    width=400,
)

# Legend pinned to the top-right corner, tight margins, larger font.
fig.update_layout(
    legend=dict(title_text="method", xanchor="right", x=0.99, yanchor="top", y=0.99),
    margin=dict(t=20, b=20, l=20, r=20),
    font_size=16,
)

# Shorten the legend entries from the column names to the method names.
for column_name, method_name in (("fba_isna_prob", "fba"), ("ls_isna_prob", "ls")):
    fig.update_traces(name=method_name, selector=dict(name=column_name))

# Axis titles (the y-axis is normalized because of histnorm="probability").
fig.update_xaxes(title_text="probability of failure")
fig.update_yaxes(title_text="normalized simulation counts")

# Export a static image and an interactive HTML copy, then display the figure.
fig.write_image("../../reports/particle_packing/figures/sobol_probability_filter.png")
fig.write_html("../../reports/particle_packing/figures/sobol_probability_filter.html")
fig


In [65]:
# Group the rounded data by parameter set. `dropna=False` keeps groups whose key
# values are missing, and `as_index=False` leaves the keys as regular columns in
# aggregated results.
grp = data_df.round(6).groupby(by_lbls, as_index=False, dropna=False)

In [74]:
# Number of non-null `session_id` entries per parameter group.
# NOTE(review): `session_id` is non-null on every row, so this counts all stored
# repeats of a parameter set, including those whose fba/ls result is NaN --
# confirm that this is what the "successful repeats" axis title is meant to say.
group_counts = grp.count()
counts = group_counts["session_id"]

fig = px.histogram(counts, x="session_id", nbins=50, width=400, height=400)
# Tight margins and larger font in one layout update.
fig.update_layout(margin=dict(t=20, b=20, l=20, r=20), font_size=16)
fig.update_xaxes(title_text="number of successful repeats")
fig.update_yaxes(title_text="number of groups")

# Export a static image and an interactive HTML copy, then display the figure.
fig.write_image("../../reports/particle_packing/figures/sobol_repeats_histogram.png")
fig.write_html("../../reports/particle_packing/figures/sobol_repeats_histogram.html")
fig

In [8]:
# Rank each result among the repeats of its own parameter set. Dense ranking with
# pct=True expresses the rank as a fraction of the group's distinct values; rows
# whose value is NaN keep a NaN rank.
# Only the four result/timing columns are ranked: selecting them before calling
# `rank` avoids ranking every other column (including the string id columns)
# just to throw the result away.
rank_cols = ["fba", "ls", "fba_time_s", "ls_time_s"]
rank_df = grp[rank_cols].rank(method="dense", pct=True).add_suffix("_rank")


In [None]:
# Preview the first ten rows of the within-group rank columns.
rank_df.head(10)

In [10]:
# (rows, columns) of the selected Sobol data.
data_df.shape

(438371, 17)

In [12]:
# Final column order of the regression table: bookkeeping, simulation
# parameters, within-group ranks, then the raw results and timings.
regression_columns = [
    "_id",
    "timestamp",
    "session_id",
    "seed",
    "mu1_div_mu3",
    "mu2_div_mu3",
    "std1",
    "std2",
    "std3",
    "comp1",
    "comp2",
    "num_particles",
    "safety_factor",
    "fba_rank",
    "ls_rank",
    "fba_time_s_rank",
    "ls_time_s_rank",
    "fba",
    "ls",
    "fba_time_s",
    "ls_time_s",
    # the "runtime" column is not exported (left commented out)
]

# Attach the rank columns side by side (aligned on the row index) and reorder.
regression_df = pd.concat([data_df, rank_df], axis=1)[regression_columns]

regression_df.to_csv(
    "../../data/processed/particle_packing/sobol_regression.csv", index=False
)


In [13]:
# Summary statistics of the numeric columns, including the new rank columns.
regression_df.describe()

Unnamed: 0,timestamp,seed,mu1_div_mu3,mu2_div_mu3,std1,std2,std3,comp1,comp2,num_particles,safety_factor,fba_rank,ls_rank,fba_time_s_rank,ls_time_s_rank,fba,ls,fba_time_s,ls_time_s
count,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,438371.0,408504.0,338467.0,438371.0,438371.0,408504.0,338467.0,438371.0,438371.0
mean,1671326000.0,10.0,1.002737,1.004056,0.397969,0.689786,0.568981,0.334133,0.333074,554.123455,1.75148,0.550473,0.560333,0.547007,0.547013,0.591582,0.712522,0.164702,54.800375
std,54145.41,0.0,0.381763,0.382134,0.212243,0.215117,0.259101,0.232016,0.232571,260.40096,0.434925,0.287405,0.287806,0.287473,0.28748,0.071359,0.027891,0.261284,1430.529681
min,1671245000.0,10.0,0.333334,0.333349,0.100012,0.104506,0.100002,7e-06,9e-06,100.0,1.000047,0.066667,0.066667,0.066667,0.066667,0.388068,0.565447,0.009212,0.004444
25%,1671276000.0,10.0,0.678216,0.67797,0.218908,0.535812,0.347242,0.137148,0.138687,330.0,1.372535,0.3,0.307692,0.3,0.3,0.525877,0.692809,0.042939,4.101989
50%,1671325000.0,10.0,1.001174,1.001963,0.358677,0.723206,0.581159,0.30012,0.293431,555.0,1.750803,0.545455,0.555556,0.545455,0.545455,0.606182,0.712668,0.089258,9.58036
75%,1671368000.0,10.0,1.332146,1.337341,0.546662,0.87239,0.793985,0.502477,0.494366,780.0,2.133496,0.8,0.8,0.8,0.8,0.657331,0.731178,0.168403,17.776289
max,1671486000.0,10.0,1.666616,1.666635,0.998315,0.999999,0.999995,0.99614,0.992854,1000.0,2.5,1.0,1.0,1.0,1.0,0.721711,0.856982,9.806562,70077.269537


In [14]:
# Column dtypes and non-null counts of the exported regression table.
regression_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438371 entries, 133446 to 688692
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   _id              438371 non-null  object 
 1   timestamp        438371 non-null  float64
 2   session_id       438371 non-null  object 
 3   seed             438371 non-null  float64
 4   mu1_div_mu3      438371 non-null  float64
 5   mu2_div_mu3      438371 non-null  float64
 6   std1             438371 non-null  float64
 7   std2             438371 non-null  float64
 8   std3             438371 non-null  float64
 9   comp1            438371 non-null  float64
 10  comp2            438371 non-null  float64
 11  num_particles    438371 non-null  int64  
 12  safety_factor    438371 non-null  float64
 13  fba_rank         408504 non-null  float64
 14  ls_rank          338467 non-null  float64
 15  fba_time_s_rank  438371 non-null  float64
 16  ls_time_s_rank   438371 non-null 

In [None]:
# Overlaid histograms of the packing fractions reported by the two methods.
fig = px.histogram(
    regression_df[["fba", "ls"]],
    barmode="overlay",
    opacity=0.5,
    height=400,
    width=400,
)

# Legend pinned to the top-left corner, tight margins, larger font.
fig.update_layout(
    legend=dict(title_text="method", xanchor="left", x=0.01, yanchor="top", y=0.99),
    margin=dict(t=20, b=20, l=20, r=20),
    font_size=16,
)
fig.update_xaxes(title_text="packing fraction")

# Export a static image and an interactive HTML copy, then display the figure.
fig.write_image(
    "../../reports/particle_packing/figures/sobol_packing_fraction_histogram.png"
)
fig.write_html(
    "../../reports/particle_packing/figures/sobol_packing_fraction_histogram.html"
)
fig


In [81]:
# Add up both timing columns over all rows (seconds) and report the total in days.
time_columns = ["fba_time_s", "ls_time_s"]
per_method_seconds = regression_df[time_columns].sum()
total_time_s = per_method_seconds.sum()
print(f"total time: {total_time_s / 3600 / 24:.2f} days")

total time: 278.88 days
