In [1]:
import sqlite3
import pandas as pd

import os, sys
# Append the current working directory to the import path; presumably this is
# what lets the project-local `src` package be imported when the notebook is
# started from the repository root (verify against the project layout).
rootpath = os.path.join(os.getcwd(), '.')
sys.path.append(rootpath)
from src.utils import *
# reimport modules if they change
%load_ext autoreload
%autoreload 2

In [2]:
# Path of the SQLite database file passed to the sqlite3.connect(database)
# calls in the cells below.
database = './simulations_nu0.2.db'

## Simulations

In [3]:
# Create the `simulations` table if it does not exist yet. Each row stores one
# parameter combination (N, K, lambda, mu, h, window, seed) together with the
# raw file and the dataset name it refers to.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    # create table that does not allow duplicates
    cur.execute("""
CREATE TABLE IF NOT EXISTS simulations (
            simulation_id integer PRIMARY KEY,
            N int NOT NULL,
            K int NOT NULL,
            lambda float NOT NULL,
            mu float NOT NULL,
            h float NOT NULL,
            window float NOT NULL,
            seed int NOT NULL,
            raw_file TEXT NOT NULL,
            dataset TEXT NOT NULL
            )
""")
    # The unique index is what actually rejects duplicate parameter combinations.
    cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS parameters ON simulations (N,K,lambda,mu,h,window,seed);")
    # can be dropped with cur.execute("DROP INDEX parameters;") if needs to be redesigned
    con.commit()
finally:
    # Close even when a statement above raises, so the database file handle
    # is not leaked into the long-lived kernel.
    con.close()

In [4]:
# Load the complete `simulations` table into a DataFrame for inspection.
con = sqlite3.connect(database)
try:
    simulations = pd.read_sql_query("SELECT * FROM simulations", con)
finally:
    # Always release the connection, even if the query fails.
    con.close()
simulations

Unnamed: 0,simulation_id,N,K,lambda,mu,h,window,seed,raw_file,dataset
0,3163,10000,100,0.943766,0.2,1.000000e-06,1.0,1000,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
1,3164,10000,100,0.943766,0.2,1.000000e-06,10.0,1000,/scratch02.local/johannes/projects/sahel_finit...,samples/10.0
2,3165,10000,100,0.943766,0.2,1.000000e-06,100.0,1000,/scratch02.local/johannes/projects/sahel_finit...,samples/100.0
3,3166,10000,100,0.943766,0.2,1.000000e-06,1000.0,1000,/scratch02.local/johannes/projects/sahel_finit...,samples/1000.0
4,3167,10000,100,0.943766,0.2,1.000000e-06,10000.0,1000,/scratch02.local/johannes/projects/sahel_finit...,samples/10000.0
...,...,...,...,...,...,...,...,...,...,...
29745,39042,10000,100,0.999000,0.2,5.623413e+01,1.0,1009,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
29746,39043,10000,100,0.999000,0.2,1.000000e+02,1.0,1009,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
29747,39044,10000,100,0.999684,0.2,1.778279e-03,1.0,1009,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
29748,39045,10000,100,0.999822,0.2,1.778279e-01,1.0,1009,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0


In [5]:
import numpy as np
import h5py

# Number of distinct window values every (seed, lambda, h) combination is
# expected to have in the `simulations` table.
EXPECTED_WINDOW_COUNT = 5

unique_seeds = np.sort(simulations.seed.unique())
print("simulation seeds: ", unique_seeds)

# check that all simulations have entered the database:
# every combination of seed, lambda and h found anywhere in the table must come
# with exactly EXPECTED_WINDOW_COUNT distinct windows; anything else is reported.
unique_lambda = np.sort(simulations["lambda"].unique())
unique_h = np.sort(simulations["h"].unique())
windows = np.sort(simulations["window"].unique())
for seed in unique_seeds:
    # The seed mask does not depend on the inner loops, so build it once.
    seed_mask = simulations["seed"] == seed
    for lam in unique_lambda:
        # Likewise the combined seed/lambda mask is invariant over h.
        seed_lambda_mask = seed_mask & (simulations["lambda"] == lam)
        for (i, h) in enumerate(unique_h):
            sel = simulations[seed_lambda_mask & (simulations["h"] == h)]
            windows_sel = sel["window"].unique()
            if len(windows_sel) != EXPECTED_WINDOW_COUNT:
                print(f"error for seed = {seed}, lambda = {lam}, h = {h} (index={i}) : windows = {windows_sel}")



simulation seeds:  [1000 1001 1002 1003 1004 1005 1006 1007 1008 1009]


## Beta Approximation

In [6]:
# create table that does not allow duplicates
# Create the `beta_approximations` table if it does not exist yet. It holds
# the columns (a, b, loc, scale) for at most one row per simulation_id.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    # Duplicates are prevented by the UNIQUE constraint on simulation_id.
    # NOTE: SQLite only enforces the FOREIGN KEY clause on connections that
    # have run "PRAGMA foreign_keys = ON"; without it the clause is declarative.
    cur.execute("""
CREATE TABLE IF NOT EXISTS beta_approximations (
            simulation_id INTEGER NOT NULL UNIQUE,
            a float NOT NULL,
            b float NOT NULL,
            loc float NOT NULL,
            scale float NOT NULL,
            FOREIGN KEY (simulation_id) REFERENCES simulations (simulation_id)
            )
""")
    con.commit()
finally:
    # Close even when table creation raises, so the handle is not leaked.
    con.close()

In [7]:
# Show the current contents of the `beta_approximations` table.
con = sqlite3.connect(database)
try:
    beta_approximations = pd.read_sql_query("SELECT * FROM beta_approximations", con)
finally:
    # Always release the connection, even if the query fails.
    con.close()
beta_approximations

Unnamed: 0,simulation_id,a,b,loc,scale
0,3894,4.626178e+01,462379.500741,-0.0001,1.0002
1,3895,4.725747e+01,472331.292993,-0.0001,1.0002
2,3896,4.754473e+01,475202.431607,-0.0001,1.0002
3,3897,4.757825e+01,475537.356641,-0.0001,1.0002
4,3898,4.758081e+01,475562.473601,-0.0001,1.0002
...,...,...,...,...,...
29745,37818,3.271282e+03,1.645532,-0.0001,1.0002
29746,37819,1.270118e+04,6.388917,-0.0001,1.0002
29747,37820,8.766071e+04,44.094831,-0.0001,1.0002
29748,37821,6.377461e+05,320.798436,-0.0001,1.0002


In [23]:
# check if approximations are available for all simulation_ids
# for simulation_id in simulations.simulation_id:
#     if np.sum(beta_approximations.simulation_id==simulation_id) != 1:
#         print(f"error: not a unique approximation for simulation_id = {simulation_id} with parameters:\n{simulations[simulations.simulation_id == simulation_id].iloc[0]}")

## Beta Interpolation

In [8]:
# Create the `beta_interpolations` table if it does not exist yet. Each row
# maps one (N, K, mu, seed) combination to a filename.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    # create table that does not allow duplicates
    cur.execute("""
CREATE TABLE IF NOT EXISTS beta_interpolations (
            N INTEGER NOT NULL,
            K INTEGER NOT NULL,
            mu FLOAT NOT NULL,
            seed INTEGER NOT NULL,
            filename TEXT NOT NULL
            )
""")
    # The unique index rejects a second row for the same (N, K, mu, seed).
    cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS interpolation ON beta_interpolations (N,K,mu,seed);")
    con.commit()
finally:
    # Close even when a statement above raises, so the handle is not leaked.
    con.close()

In [9]:
# Show the current contents of the `beta_interpolations` table.
con = sqlite3.connect(database)
try:
    beta_interpolations = pd.read_sql_query("SELECT * FROM beta_interpolations", con)
finally:
    # The connection was previously left open by this cell; close it like the
    # other read cells do so the database file is not kept open by the kernel.
    con.close()
beta_interpolations

Unnamed: 0,N,K,mu,seed,filename
0,10000,100,0.2,1000,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
1,10000,100,0.2,1001,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
2,10000,100,0.2,1002,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
3,10000,100,0.2,1003,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
4,10000,100,0.2,1004,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
5,10000,100,0.2,1005,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
6,10000,100,0.2,1006,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
7,10000,100,0.2,1007,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
8,10000,100,0.2,1008,./dat/nu=0.2//beta_interpolation_N=10000_K=100...
9,10000,100,0.2,1009,./dat/nu=0.2//beta_interpolation_N=10000_K=100...


## Results

In [26]:
# create table that does not allow duplicates
# Create the `results` table if it does not exist yet. Each row maps one
# (N, K, mu, seed, window, sigma, epsilon) combination to a filename.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    cur.execute("""
CREATE TABLE IF NOT EXISTS results (
            N INTEGER NOT NULL,
            K INTEGER NOT NULL,
            mu FLOAT NOT NULL,
            seed INTEGER NOT NULL,
            window float NOT NULL,
            sigma float NOT NULL,
            epsilon float NOT NULL,
            filename TEXT NOT NULL
            )
""")
    # do not allow duplicates
    cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS result ON results (N,K,mu,seed,window,sigma,epsilon);")
    con.commit()
finally:
    # Close even when a statement above raises, so the handle is not leaked.
    con.close()

In [27]:
# Show the current contents of the `results` table.
con = sqlite3.connect(database)
try:
    results = pd.read_sql_query("SELECT * FROM results", con)
finally:
    # Always release the connection, even if the query fails.
    con.close()
results

Unnamed: 0,N,K,mu,seed,window,sigma,epsilon,filename


In [15]:
# import numpy as np
# windows = 10**np.arange(0,4.2,0.2)
# print("windows: ", windows, f" in total {len(windows)}")

# con = sqlite3.connect(database)
# cur = con.cursor()
# results = pd.read_sql_query("SELECT * FROM results", con)
# wins = results.window.unique()
# print("windows in database: ", wins)
# missing_windows = np.setdiff1d(windows, wins)
# print("missing windows: ", missing_windows)

# # check for results file
# for window in missing_windows:

#     filename = f"./results/nu=0.2//sigma=0.01_epsilon=0.1/N=10000_K=100_mu=0.2/results_simulation_seed=1000_window={window}.txt"
#     if os.path.isfile(filename):
#         pass
#     else:
#         print(f"file {filename} does not exist")

#     # cur.execute(f"""
#     # INSERT INTO results (N,K,mu,seed,window,sigma,epsilon,filename)
#     # VALUES (10000, 100, 0.2, {seed}, {window}, 0.1, 0.1, "{filename}")
#     # """)
#     # con.commit()
# con.close()



windows:  [1.00000000e+00 1.58489319e+00 2.51188643e+00 3.98107171e+00
 6.30957344e+00 1.00000000e+01 1.58489319e+01 2.51188643e+01
 3.98107171e+01 6.30957344e+01 1.00000000e+02 1.58489319e+02
 2.51188643e+02 3.98107171e+02 6.30957344e+02 1.00000000e+03
 1.58489319e+03 2.51188643e+03 3.98107171e+03 6.30957344e+03
 1.00000000e+04]  in total 21
windows in database:  []
missing windows:  [1.00000000e+00 1.58489319e+00 2.51188643e+00 3.98107171e+00
 6.30957344e+00 1.00000000e+01 1.58489319e+01 2.51188643e+01
 3.98107171e+01 6.30957344e+01 1.00000000e+02 1.58489319e+02
 2.51188643e+02 3.98107171e+02 6.30957344e+02 1.00000000e+03
 1.58489319e+03 2.51188643e+03 3.98107171e+03 6.30957344e+03
 1.00000000e+04]
file ./results/nu=0.2//sigma=0.01_epsilon=0.1/N=10000_K=100_mu=0.2/results_simulation_seed=1000_window=1.5848931924611134.txt does not exist
file ./results/nu=0.2//sigma=0.01_epsilon=0.1/N=10000_K=100_mu=0.2/results_simulation_seed=1000_window=3.9810717055349736.txt does not exist
file ./r

## Clean up 

In [16]:
from tqdm import tqdm
import h5py
import numpy as np

# Maintenance cell: for every (seed, lambda, h) combination in `simulations`,
# make sure a row exists for each expected window. If a row is missing but the
# dataset `samples/<window>` is present in the raw HDF5 file, the row is
# inserted. This WRITES to the database, hence the explicit safety switch.
know_what_you_do = False
if know_what_you_do:
    missing = 0
    database = "./simulations_nu0.2.db"
    con = sqlite3.connect(database)
    try:
        cur = con.cursor()
        simulations = pd.read_sql_query("SELECT * FROM simulations", con)
        # check that all simulations have correct windows and datasets
        windows = [1e0, 1e1, 1e2, 1e3, 1e4]

        unique_seeds = [1000]
        print("simulation seeds: ", unique_seeds)
        for seed in tqdm(unique_seeds):
            dbseed = simulations[(simulations.seed == seed)]
            # unique lambda values
            unique_lambdas = np.sort(dbseed["lambda"].unique())
            for lam in unique_lambdas:
                dblam = dbseed[(dbseed["lambda"] == lam)]
                # unique h values
                unique_hs = np.sort(dblam["h"].unique())
                for h in unique_hs:
                    # `sim` is never empty here: h was taken from dblam itself.
                    sim = dblam[(dblam["h"] == h)]
                    first = sim.iloc[0]
                    file_sim = first["raw_file"]
                    # sqlite3 cannot bind numpy integer scalars, so convert the
                    # shared parameters to plain Python types once per file.
                    n_val = int(first["N"])
                    k_val = int(first["K"])
                    lambda_val = float(first["lambda"])
                    mu_val = float(first["mu"])
                    h_val = float(first["h"])
                    seed_val = int(first["seed"])
                    with h5py.File(file_sim, 'r') as f:
                        for window in windows:
                            # Nothing to do when the database already knows this window.
                            if (sim["window"] == window).any():
                                continue
                            missing += 1
                            print(f"window {window} not found for seed {seed}, lambda {lam}, h {h}")
                            # check if dataset exists in raw file
                            dset_std = f'samples/{window:.1f}'
                            try:
                                data = f[dset_std]
                            except KeyError:
                                print(f"window {window} not found in {file_sim}")
                                continue
                            print(f"{dset_std} found: {data}")
                            print(f"add to database: {n_val},{k_val},{lambda_val},{mu_val},{h_val},{window},{seed_val},'{file_sim}','{dset_std}'")
                            # Bound parameters instead of string formatting: a quote
                            # in the file path can no longer break the statement.
                            cur.execute(
                                "INSERT INTO simulations (N,K,lambda,mu,h,window,seed,raw_file,dataset) VALUES (?,?,?,?,?,?,?,?,?)",
                                (n_val, k_val, lambda_val, mu_val, h_val, float(window), seed_val, str(file_sim), dset_std),
                            )
        if not missing:
            print("all datasets are set correctly")
        else:
            print(f"{missing} datasets were missing windows")
        # Commit only after the whole scan succeeded; on an exception the
        # uncommitted inserts are discarded when the connection is closed.
        con.commit()
    finally:
        con.close()

In [17]:
# Disabled manual check of the save path: calls simulation(params) with a small
# parameter set and hands the result to save_simulation, pointing it at the
# scratch database ./test.db. Change the condition to True to run it by hand.
if False:
    database = "./test.db"
    from src.simulation import *

    params = {'N': 100, 'K': 10, 'lambda': 0.0, 'mu': 0.2, 'h': 1, 'seed': 1234}
    result = simulation(params)
    save_simulation(
        result,
        path='./logs/',
        database=database,
        verbose=True,
    )