In [1]:
import sqlite3
import pandas as pd

import os, sys
# Append the current working directory to the module search path so that the
# `src` package imported below can be resolved.
rootpath = os.path.join(os.getcwd(), '.')
sys.path.append(rootpath)
# NOTE(review): wildcard import -- no name that could only come from src.utils
# is referenced in the cells visible here; consider importing explicitly.
from src.utils import *
# reimport modules if they change
%load_ext autoreload
%autoreload 2

In [44]:
# Location of the SQLite file holding every bookkeeping table of this notebook.
DB_PATH = './simulations.db'
# connect to the database (create file if not exists)
con = sqlite3.connect(DB_PATH)
# SQLite ignores FOREIGN KEY clauses unless enforcement is switched on for the
# connection; without it the reference from beta_approximations to simulations
# is never checked. Issued through the connection so the cell shows no output.
con.execute("PRAGMA foreign_keys = ON")
# create a cursor object
cur = con.cursor()

## Simulations

In [3]:
# drop table if exists
#cur.execute('DROP TABLE IF EXISTS simulations')

In [4]:
# Table definition: one row per stored dataset, identified by an integer
# primary key and described by its full set of simulation parameters.
simulations_table_sql = """
CREATE TABLE IF NOT EXISTS simulations (
            simulation_id integer PRIMARY KEY,
            N int NOT NULL,
            K int NOT NULL,
            lambda float NOT NULL,
            mu float NOT NULL,
            h float NOT NULL,
            window float NOT NULL,
            seed int NOT NULL,
            raw_file TEXT NOT NULL,
            dataset TEXT NOT NULL
            )
"""
# The unique index over the parameter tuple is what rejects duplicate entries.
# If the key ever has to be redesigned, remove it first with
# cur.execute("DROP INDEX parameters;").
parameters_index_sql = "CREATE UNIQUE INDEX IF NOT EXISTS parameters ON simulations (N,K,lambda,mu,h,window,seed);"
for ddl_statement in (simulations_table_sql, parameters_index_sql):
    cur.execute(ddl_statement)
con.commit()

In [57]:
# Load the complete `simulations` table into a DataFrame and display it.
# The frame is reused by the consistency check and the clean-up cell below.
# Raw-cursor alternative without pandas:
#   cur.execute("SELECT * FROM simulations"); print(cur.fetchall())
simulations = pd.read_sql_query("SELECT * FROM simulations", con)
simulations

Unnamed: 0,simulation_id,N,K,lambda,mu,h,window,seed,raw_file,dataset
0,1,10000,100,0.000000,0.2,1.000000,0.000001,1001,/scratch02.local/johannes/projects/sahel_finit...,samples/1e-06
1,2,10000,100,0.000000,0.2,1.000000,1.000000,1001,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
2,3,10000,100,0.000000,0.2,1.000000,10.000000,1001,/scratch02.local/johannes/projects/sahel_finit...,samples/10.0
3,4,10000,100,0.000000,0.2,1.000000,100.000000,1001,/scratch02.local/johannes/projects/sahel_finit...,samples/100.0
4,5,10000,100,0.000000,0.2,1.000000,1000.000000,1001,/scratch02.local/johannes/projects/sahel_finit...,samples/1000.0
...,...,...,...,...,...,...,...,...,...,...
12422,12423,10000,100,0.999000,0.2,100.000000,1.000000,1003,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
12423,12424,10000,100,0.999000,0.2,100.000000,10.000000,1003,/scratch02.local/johannes/projects/sahel_finit...,samples/10.0
12424,12425,10000,100,0.999438,0.2,0.000032,1.000000,1003,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0
12425,12426,10000,100,0.999438,0.2,0.003162,1.000000,1003,/scratch02.local/johannes/projects/sahel_finit...,samples/1.0


In [58]:
import numpy as np

unique_seeds = np.sort(simulations.seed.unique())
print("simulation seeds: ", unique_seeds)

# Check that all simulations have entered the database: every combination of
# seed, lambda and h must provide exactly 5 distinct windows; seed 1001 is
# additionally allowed to provide 6.
unique_lambda = np.sort(simulations["lambda"].unique())
unique_h = np.sort(simulations["h"].unique())
for seed in unique_seeds:
    # Narrow the frame one level at a time so that each inner lookup only scans
    # rows that can still match, instead of re-filtering the whole table for
    # every (seed, lambda, h) combination.
    rows_for_seed = simulations[simulations["seed"] == seed]
    allowed_counts = (5, 6) if seed == 1001 else (5,)
    for lam in unique_lambda:
        rows_for_lambda = rows_for_seed[rows_for_seed["lambda"] == lam]
        for h in unique_h:
            windows = rows_for_lambda[rows_for_lambda["h"] == h]["window"].unique()
            # A combination that is absent for this seed yields zero windows
            # and is therefore reported as well.
            if len(windows) not in allowed_counts:
                print(f"error for seed = {seed}, lambda = {lam}, h = {h} : windows = {windows}")

simulation seeds:  [1000 1001 1002 1003]


## Beta Approximation

In [7]:
# Fitted parameters (a, b, loc, scale) per simulation. The UNIQUE constraint on
# simulation_id allows at most one row for each referenced simulation.
beta_approximations_table_sql = """
CREATE TABLE IF NOT EXISTS beta_approximations (
            simulation_id INTEGER NOT NULL UNIQUE,
            a float NOT NULL,
            b float NOT NULL,
            loc float NOT NULL,
            scale float NOT NULL,
            FOREIGN KEY (simulation_id) REFERENCES simulations (simulation_id)
            )
"""
cur.execute(beta_approximations_table_sql)
con.commit()

In [8]:
# Load every stored beta approximation (at most one row per simulation_id)
# into a DataFrame and display it.
beta_approximations = pd.read_sql_query("SELECT * FROM beta_approximations", con)
beta_approximations

Unnamed: 0,simulation_id,a,b,loc,scale
0,1,3.000801e+03,2.072159e+04,-0.0001,1.0002
1,2,6.499563e+03,4.488178e+04,-0.0001,1.0002
2,3,5.995206e+04,4.139902e+05,-0.0001,1.0002
3,4,5.947034e+05,4.106638e+06,-0.0001,1.0002
4,5,5.693645e+06,3.931665e+07,-0.0001,1.0002
...,...,...,...,...,...
12242,11914,5.812214e+03,2.913608e+00,-0.0001,1.0002
12243,11915,1.587837e+04,7.959629e+00,-0.0001,1.0002
12244,11916,1.022290e+05,5.124591e+01,-0.0001,1.0002
12245,11917,7.153305e+05,3.585834e+02,-0.0001,1.0002


## Beta Interpolation

In [15]:
# Registry of interpolation files: one file name per (N, K, mu, seed).
beta_interpolations_table_sql = """
CREATE TABLE IF NOT EXISTS beta_interpolations (
            N INTEGER NOT NULL,
            K INTEGER NOT NULL,
            mu FLOAT NOT NULL,
            seed INTEGER NOT NULL,
            filename TEXT NOT NULL
            )
"""
# Duplicates are rejected by a unique index over the identifying columns.
interpolation_index_sql = "CREATE UNIQUE INDEX IF NOT EXISTS interpolation ON beta_interpolations (N,K,mu,seed);"
for ddl_statement in (beta_interpolations_table_sql, interpolation_index_sql):
    cur.execute(ddl_statement)
con.commit()

In [59]:
# Load the registered beta interpolation files (one row per N, K, mu, seed)
# into a DataFrame and display them.
beta_interpolations = pd.read_sql_query("SELECT * FROM beta_interpolations", con)
beta_interpolations

Unnamed: 0,N,K,mu,seed,filename
0,10000,100,0.2,1001,./dat/beta_interpolation_N=10000_K=100_mu=0.2_...
1,10000,100,0.2,1000,./dat/beta_interpolation_N=10000_K=100_mu=0.2_...
2,10000,100,0.2,1002,./dat/beta_interpolation_N=10000_K=100_mu=0.2_...


## Results

In [None]:
# Registry of result files, keyed by the parameters they were produced for.
results_table_sql = """
CREATE TABLE IF NOT EXISTS results (
            N INTEGER NOT NULL,
            K INTEGER NOT NULL,
            mu FLOAT NOT NULL,
            seed INTEGER NOT NULL,
            window float NOT NULL,
            sigma float NOT NULL,
            epsilon float NOT NULL,
            filename TEXT NOT NULL
            )
"""
# A unique index over all parameter columns keeps duplicate results out.
result_index_sql = "CREATE UNIQUE INDEX IF NOT EXISTS result ON results (N,K,mu,seed,window,sigma,epsilon);"
for ddl_statement in (results_table_sql, result_index_sql):
    cur.execute(ddl_statement)
con.commit()

In [60]:
# Load the registered result files into a DataFrame and display them.
results = pd.read_sql_query("SELECT * FROM results", con)
results

Unnamed: 0,N,K,mu,seed,window,sigma,epsilon,filename
0,10000,100,0.2,1001,1e-06,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...
1,10000,100,0.2,1001,1.0,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...
2,10000,100,0.2,1001,10.0,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...
3,10000,100,0.2,1001,100.0,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...
4,10000,100,0.2,1001,1000.0,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...
5,10000,100,0.2,1001,10000.0,0.01,0.1,/data.nst/johannes/projects/sahel_finite-obser...


## Clean up 

In [62]:
from tqdm import tqdm
import h5py
import numpy as np

# Safety switch: the block below writes to the database, so it only runs when
# this flag is deliberately set to True.
know_what_you_do = False
if know_what_you_do:
    missing = 0
    # Windows that every (seed, lambda, h) combination is expected to provide.
    windows = [1e0, 1e1, 1e2, 1e3, 1e4]
    # Values are bound as parameters rather than formatted into the SQL text,
    # so file names containing quotes cannot break or alter the statement.
    insert_sql = (
        "INSERT INTO simulations (N,K,lambda,mu,h,window,seed,raw_file,dataset) "
        "VALUES (?,?,?,?,?,?,?,?,?)"
    )

    unique_seeds = np.sort(simulations.seed.unique())
    print("simulation seeds: ", unique_seeds)
    for seed in tqdm(unique_seeds):
        dbseed = simulations[(simulations.seed == seed)]
        # unique lambda values
        unique_lambdas = np.sort(dbseed["lambda"].unique())
        for lam in unique_lambdas:
            dblam = dbseed[(dbseed["lambda"] == lam)]
            # unique h values
            unique_hs = np.sort(dblam["h"].unique())
            for h in unique_hs:
                sim = dblam[(dblam["h"] == h)]
                # The raw file is taken from the first row of the combination.
                file_sim = sim["raw_file"].values[0]
                with h5py.File(file_sim, 'r') as f:
                    for window in windows:
                        # Nothing to do when the window is already registered.
                        if (sim["window"] == window).any():
                            continue
                        missing += 1
                        print(f"window {window} not found for seed {seed}, lambda {lam}, h {h}")
                        # check if dataset exists in raw file
                        dset_std = f'samples/{window:.1f}'
                        try:
                            data = f[dset_std]
                        except KeyError:
                            print(f"window {window} not found in {file_sim}")
                            continue
                        print(f"{dset_std} found: {data}")
                        # sqlite3 cannot bind numpy integer scalars, hence the
                        # explicit conversion to native Python types.
                        row = (
                            int(sim["N"].values[0]),
                            int(sim["K"].values[0]),
                            float(sim["lambda"].values[0]),
                            float(sim["mu"].values[0]),
                            float(sim["h"].values[0]),
                            float(window),
                            int(sim["seed"].values[0]),
                            str(file_sim),
                            dset_std,
                        )
                        print("add to database: {},{},{},{},{},{},{},'{}','{}'".format(*row))
                        cur.execute(insert_sql, row)
    if not missing:
        print("all datasets are set correctly")
    else:
        print(f"{missing} datasets are missing windows")
    # Persist the inserted rows. The in-memory `simulations` frame is not
    # updated by this cell; re-read it from the database to see the new rows.
    con.commit()

simulation seeds:  [1000 1001 1002 1003]


  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:05<00:00,  1.34s/it]

all datasets are set correctly





In [63]:
# Write any pending changes to disk, then release the database connection.
# `con` and `cur` are unusable afterwards; re-run the connection cell before
# executing any other database cell again.
con.commit()
con.close()