Simple Random Sampling

In [2]:
import pickle
import pandas as pd
from sfmmol import NBGraphMaker, SamplerRunner, RandomSampleGrph

# Read the pickled bundle: an 8-item tuple of which only the second item
# (`ns`, used below to label each pick) is needed in this cell.
with open("data/processed/d_mols.pkl", "rb") as f:
    _dmy, ns, _dmy, _dmy, _dmy, _dmy, _dmy, _dmy = pickle.load(f)

# `thresh` comes back together with the graph but is not used in this cell.
thresh, nb_grph = NBGraphMaker.load("data/processed/d_nb_grph.pkl")

# Random sampler with a fixed seed (presumably so the picks are reproducible).
algo = RandomSampleGrph(random_seed=0)
runner = SamplerRunner(
    algo,
    nb_grph,
    ns,
    level_cvrg=2,
    first_n=10,
)
ret = runner.run(nrun=10)

# Reference picks for this seed (row: index into `ns`) -> 0: 147, 1: 5573, 9: 2809
# One table row per record in `ret`: the picked index, its name, and the two
# coverage values stored in the same record.
df = pd.DataFrame(
    [(rec["idx"], ns[rec["idx"]], rec["cvrg1"], rec["cvrg2"]) for rec in ret],
    columns=["index", "name", "cvrg1", "cvrg2"],
)
display(df)

nb_file: data/processed/d_nb_grph.pkl


Unnamed: 0,index,name,cvrg1,cvrg2
0,147,02_6409,0.005574,0.061824
1,5573,04_32484,0.005912,0.064358
2,3411,04_76888,0.010135,0.072973
3,1828,03_673,0.01098,0.078716
4,1765,01_7863,0.011655,0.080068
5,5303,04_1201104,0.012669,0.084966
6,4876,04_3759117,0.013345,0.087331
7,2702,01_79305,0.014527,0.090203
8,4604,01_76447,0.015034,0.091892
9,2809,01_6441,0.015709,0.09223


Stratified Random Sampling

In [3]:
import pickle

# pandas is needed for the result table below; importing it here keeps the
# cell runnable without relying on an earlier cell having imported it.
import pandas as pd

from sfmmol import NBGraphMaker, SamplerRunner, StratifiedRandomSampleGrph

# Read the pickled bundle: an 8-item tuple of which only the second item
# (`ns`, the names used for stratification and labelling) is needed here.
with open("data/processed/d_mols.pkl", "rb") as f:
    (_dmy, ns, _dmy, _dmy, _dmy, _dmy, _dmy, _dmy) = pickle.load(f)

# `thresh` comes back together with the graph but is not used in this cell.
(thresh, nb_grph) = NBGraphMaker.load("data/processed/d_nb_grph.pkl")

# Strata are given as name-prefix regexes mapped to hard-coded counts.
# NOTE(review): these counts sum to 5918 -- presumably they are meant to equal
# the number of names in `ns` carrying each prefix; confirm against len(ns).
strada = {"^01_": 1699, "^02_": 1700, "^03_": 717, "^04_": 1699, "^05_": 103}
algo = StratifiedRandomSampleGrph(strada=strada, ns=ns, random_seed=0)
runner = SamplerRunner(algo, nb_grph, ns, level_cvrg=2, first_n=10)
ret = runner.run(nrun=10)

# Reference picks for this seed (row: index into `ns`) -> 0: 1765, 1: 2702, 9: 4876
# One table row per record in `ret`: the picked index, its name, and the two
# coverage values stored in the same record.
df = pd.DataFrame(
    [(rec["idx"], ns[rec["idx"]], rec["cvrg1"], rec["cvrg2"]) for rec in ret],
    columns=["index", "name", "cvrg1", "cvrg2"],
)
display(df)

nb_file: data/processed/d_nb_grph.pkl
[0.2870902331868875, 0.5743494423791822, 0.6955052382561676, 0.982595471443055, 1.0]


Unnamed: 0,index,name,cvrg1,cvrg2
0,1765,01_7863,0.003378,0.031757
1,2702,01_79305,0.004561,0.034628
2,4604,01_76447,0.005068,0.036318
3,147,02_6409,0.007939,0.067736
4,2809,01_6441,0.008615,0.068074
5,3287,01_1268265,0.008953,0.068412
6,5573,04_32484,0.009291,0.070946
7,3411,04_76888,0.013514,0.079561
8,5303,04_1201104,0.014527,0.084459
9,4876,04_3759117,0.015203,0.086824


Stratified Random Sampling (KNN)

In [4]:
import collections
import pickle

# pandas is needed for the result table below; importing it here keeps the
# cell runnable without relying on an earlier cell having imported it.
import pandas as pd

from sfmmol import NBGraphMaker, SamplerRunner, StratifiedRandomSampleGrph

# Read the pickled bundle: an 8-item tuple of which only the second item
# (`ns`, the names) is needed here.
with open("data/processed/d_mols.pkl", "rb") as f:
    (_dmy, ns, _dmy, _dmy, _dmy, _dmy, _dmy, _dmy) = pickle.load(f)

# `thresh` comes back together with the graph but is not used in this cell.
(thresh, nb_grph) = NBGraphMaker.load("data/processed/d_nb_grph.pkl")

# NOTE: unpickling can execute arbitrary code, so only load label files that
# were produced by this project's own pipeline.
# Only the load itself needs the file handle; it is closed before the derived
# values are computed.
with open("data/processed/d_labels.pkl", "rb") as f:
    mol_strada = pickle.load(f)

# Prefix every name with its label so that a "^<label>_" regex selects exactly
# the names of that stratum (assumes `labels_` is aligned with `ns` -- zip()
# would silently truncate to the shorter of the two).
ns_strada = [f"{_stratum}_{_name}" for _stratum, _name in zip(mol_strada.labels_, ns)]

# Stratum regex -> number of members, inserted in ascending label order.
strada = {
    f"^{label}_": count
    for label, count in sorted(collections.Counter(mol_strada.labels_).items())
}

# The sampler stratifies on the label-prefixed names, while the runner keeps
# the original names for reporting.
algo = StratifiedRandomSampleGrph(strada=strada, ns=ns_strada, random_seed=0)
runner = SamplerRunner(algo, nb_grph, ns, level_cvrg=2, first_n=10)
ret = runner.run(nrun=10)

# Reference picks for this seed (row: index into `ns`) -> 0: 147, 1: 796, 9: 1227
# One table row per record in `ret`: the picked index, its name, and the two
# coverage values stored in the same record.
df = pd.DataFrame(
    [(rec["idx"], ns[rec["idx"]], rec["cvrg1"], rec["cvrg2"]) for rec in ret],
    columns=["index", "name", "cvrg1", "cvrg2"],
)
display(df)

nb_file: data/processed/d_nb_grph.pkl
[0.14476351351351352, 0.2623310810810811, 0.5608108108108109, 0.8091216216216216, 1.0]


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,index,name,cvrg1,cvrg2
0,147,02_6409,0.005574,0.061824
1,796,02_20669,0.009797,0.076014
2,1828,03_673,0.010642,0.081757
3,5573,04_32484,0.01098,0.084291
4,1765,01_7863,0.011655,0.085642
5,1608,02_345110,0.014865,0.099155
6,2702,01_79305,0.016047,0.102027
7,1708,02_19692,0.01723,0.111486
8,1407,02_2734080,0.018074,0.113007
9,1227,02_319954,0.019257,0.115541


SFMMOL

In [5]:
import pickle
import pandas as pd
from sfmmol import NBGraphMaker, SamplerRunner, GreedyAlgoGrph, SOFGrph

# Read the pickled bundle: an 8-item tuple of which only the second item
# (`ns`, used below to label each pick) is needed in this cell.
with open("data/processed/d_mols.pkl", "rb") as f:
    (_dmy, ns, _dmy, _dmy, _dmy, _dmy, _dmy, _dmy) = pickle.load(f)

# `thresh` comes back together with the graph but is not used in this cell.
(thresh, nb_grph) = NBGraphMaker.load("data/processed/d_nb_grph.pkl")

# Greedy selection wrapped around SOFGrph with lmd=1.0; no random seed is
# passed to this algorithm.
algo = GreedyAlgoGrph(SOFGrph(lmd=1.0))
runner = SamplerRunner(algo, nb_grph, ns, level_cvrg=2, first_n=10)
ret = runner.run(nrun=10)

# Reference picks (row: index into `ns`) -> 0: 49, 1: 2125, 9: 3053
# One table row per record in `ret`: the picked index, its name, and the two
# coverage values stored in the same record.
df = pd.DataFrame(
    [(rec["idx"], ns[rec["idx"]], rec["cvrg1"], rec["cvrg2"]) for rec in ret],
    columns=["index", "name", "cvrg1", "cvrg2"],
)
display(df)

nb_file: data/processed/d_nb_grph.pkl
[GreedyAlgoGrph] use speed-up trick for LMD(1.0) is large > 0.1


Unnamed: 0,index,name,cvrg1,cvrg2
0,49,02_263,0.026689,0.104054
1,2125,04_263,0.047635,0.146791
2,115,02_6115,0.061993,0.193581
3,3024,04_7813,0.07348,0.219932
4,44,02_8007,0.084628,0.246791
5,619,02_7469,0.094595,0.265878
6,1941,04_7852,0.104054,0.279561
7,2921,01_7468,0.111824,0.28277
8,209,02_244,0.119257,0.29527
9,3053,04_2879,0.126689,0.304223
