## Get Best-Performing Hyper Parameters

In [2]:
# Show which `python` executable the shell resolves first on PATH (environment provenance).
!which python

/Users/simon.hughes/anaconda3/envs/phd_py36/bin/python


In [3]:
# List the installed packages with their exact versions.
# NOTE(review): `!pip` runs whichever pip the shell finds first; confirm it belongs
# to the same environment as the kernel's interpreter.
!pip freeze

absl-py==0.3.0
anaconda-client==1.6.11
appnope==0.1.0
argcomplete==1.9.4
asn1crypto==0.24.0
astor==0.7.1
beautifulsoup4==4.6.0
bleach==2.1.2
boto==2.47.0
boto3==1.5.36
botocore==1.8.50
bz2file==0.98
certifi==2018.1.18
cffi==1.11.4
chardet==3.0.4
clyent==1.2.2
costcla==0.5
cryptography==2.1.4
cycler==0.10.0
cymem==1.31.2
cytoolz==0.8.2
decorator==4.2.1
dicecore==1.13
dill==0.2.8.2
docutils==0.14
entrypoints==0.2.3
ftfy==4.4.3
gast==0.2.0
gensim==0.13.4
grpcio==1.14.0
h5py==2.7.0
hdbscan==0.8.12
html5lib==1.0.1
idna==2.6
ipykernel==4.8.2
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.1.2
jedi==0.11.1
Jinja2==2.10
jmespath==0.9.3
joblib==0.9.4
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.2.2
jupyter-console==5.2.0
jupyter-core==4.4.0
jupyterlab==0.31.8
jupyterlab-launcher==0.10.5
Keras==1.2.2
Keras-Applications==1.0.4
Keras-Preprocessing==1.0.2
Markdown==2.6.11
MarkupSafe==1.0
matplotlib==2.0.0
mistune==0.8.3
murmurhash==0.26.4
nb-anacondacloud==1.4.0
nb-conda==2.2.1
nb-conda-

In [5]:
import pandas as pd
import pymongo

In [7]:
# Client for the MongoDB server at 127.0.0.1. serverSelectionTimeoutMS=100 caps how
# long the driver waits to find a usable server at 100 ms, so an unreachable server
# fails fast on the first query rather than hanging the cell.
client = pymongo.MongoClient(serverSelectionTimeoutMS=100, host="127.0.0.1")
# Database handle that the cells below pass to get_df as `db`.
db = client.metrics_causal_model_reranker

In [8]:
def get_df(collection, db):
    """Load one results collection as a DataFrame ranked by F1 score.

    Every document in ``db[collection]`` is projected to its ``parameters``,
    ``MICRO_F1`` and ``asof`` fields. The parameter and micro-F1 mappings are
    merged into one flat row (micro-F1 keys win on a name clash), ``asof`` is
    converted to a string, rows are ordered by ``f1_score`` from highest to
    lowest, and only the columns selected for the collection are returned.

    Args:
        collection: Collection name; must end in ``"_VD"``. A name containing
            ``"INSENS"`` omits the ``loss_type`` column; otherwise a name
            containing ``"PCPTRN"`` reports ``learning_rate`` in place of
            ``C`` / ``loss_type`` / ``pa_type``.
        db: Database handle such that ``db[collection].aggregate(pipeline)``
            yields the projected documents (e.g. a ``pymongo`` database).

    Returns:
        pandas.DataFrame holding the selected columns, best ``f1_score``
        first. When the collection contains no documents, an empty frame
        with those same columns is returned.

    Raises:
        AssertionError: if ``collection`` does not end in ``"_VD"``.
        KeyError: if the documents lack one of the selected columns.
    """
    assert collection.endswith("_VD"), (
        "collection name must end with '_VD', got %r" % (collection,))

    # Column order is the display order. Candidates that were commented out of
    # the report and remain excluded: best_max_upd, best_min_prob, extractors,
    # min_feat_freq.
    head_cols = ["f1_score", "precision", "recall", "asof", "best_top_n"]
    tail_cols = ["best_max_parses", "max_update_items", "initial_weight"]
    if "INSENS" in collection:
        # "INSENS" is checked first, so it wins if both markers are present.
        cols = head_cols + ["C"] + tail_cols + ["pa_type"]
    elif "PCPTRN" in collection:
        cols = head_cols + ["learning_rate"] + tail_cols
    else:
        cols = head_cols + ["C"] + tail_cols + ["loss_type", "pa_type"]

    project = {
        "params": "$parameters",
        "micro_f1": "$MICRO_F1",
        "asof": "$asof",
        "_id": 1,
    }
    pipeline = [{"$project": project}]

    results = []
    for doc in db[collection].aggregate(pipeline):
        row = dict(doc["params"])       # copy so the source document is not mutated
        row.update(doc["micro_f1"])     # adds f1_score / precision / recall
        row["asof"] = str(doc["asof"])
        results.append(row)

    if not results:
        # An empty (or misspelled) collection would otherwise fail inside
        # sort_values with an opaque KeyError on "f1_score".
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(results).sort_values(by="f1_score", ascending=False)
    return df[cols]

In [9]:
# MONGO_COLLECTION = "SC_RE-RANKER_HYPER_PARAM_VD"
# MONGO_COLLECTION = "SC_COST_INSENS_RE-RANKER_HYPER_PARAM_VD"

## CB

### VD

In [15]:
# get_df sorts by f1_score descending, so head(1) is the top-scoring row of this
# collection. The name contains "PCPTRN", so the learning_rate column set is shown.
df_pa = get_df("CB_STR_PCPTRN_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight
29,0.741353,0.782873,0.704015,2019-06-24 20:43:02.777000,1,0.1,300,1,0.01


In [16]:
# Top-scoring row (get_df sorts by f1_score descending). The name contains "INSENS",
# so get_df omits the loss_type column.
df_pa = get_df("CB_COST_INSENS_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,pa_type
30,0.737302,0.807826,0.678102,2019-06-23 15:33:17.018000,1,0.1,300,1,0.01,1


In [10]:
# Top-scoring row (get_df sorts by f1_score descending), shown with get_df's default
# column set, which includes C, loss_type and pa_type.
df_pa = get_df("CB_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type
37,0.746014,0.777294,0.717153,2019-06-18 01:55:37.752000,2,0.0025,300,1,0.01,ml,1


### Test

In [25]:
# Highest-f1_score row of the TEST_ collection; "PCPTRN" in the name selects the
# learning_rate column set inside get_df.
df_pa = get_df("TEST_CB_STR_PCPTRN_RE-RANKER_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight
0,0.748162,0.752311,0.744059,2019-06-24 21:06:55.476000,1,0.1,300,1,0.01


In [13]:
# Highest-f1_score row of the TEST_ collection, with get_df's default column set
# (C, loss_type, pa_type included).
df_pa = get_df("TEST_CB_RE-RANKER_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type
0,0.742961,0.738267,0.747715,2019-06-18 07:34:50.950000,2,0.0025,300,1,0.01,ml,1


## SC

### VD

In [17]:
# get_df sorts by f1_score descending, so head(1) is the top-scoring row of this
# collection. The name contains "PCPTRN", so the learning_rate column set is shown.
df_pa = get_df("SC_STR_PCPTRN_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight
22,0.810473,0.851438,0.773269,2019-06-28 18:49:08.036000,5,0.5,300,1,0.01


In [18]:
# Top-scoring row (get_df sorts by f1_score descending). The name contains "INSENS",
# so get_df omits the loss_type column.
df_pa = get_df("SC_COST_INSENS_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,pa_type
8,0.806197,0.868521,0.752219,2019-06-23 16:19:24.964000,2,0.0005,300,1,0.01,1


In [20]:
# Top-scoring row (get_df sorts by f1_score descending), shown with get_df's default
# column set, which includes C, loss_type and pa_type.
df_pa = get_df("SC_RE-RANKER_HYPER_PARAM_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type
41,0.808089,0.856129,0.765153,2019-06-18 10:15:07.357000,3,0.01,300,1,0.01,ml,1


### Test

In [24]:
# Highest-f1_score row of the TEST_ collection; "PCPTRN" in the name selects the
# learning_rate column set inside get_df.
df_pa = get_df("TEST_SC_STR_PCPTRN_RE-RANKER_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight
0,0.82448,0.868613,0.784615,2019-06-28 19:11:53.297000,1,0.1,300,1,0.01


In [26]:
# Highest-f1_score row of the TEST_ collection, with get_df's default column set
# (C, loss_type, pa_type included).
df_pa = get_df("TEST_SC_RE-RANKER_VD", db)
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type
0,0.827042,0.848555,0.806593,2019-06-18 15:24:03.945000,3,0.01,300,1,0.01,ml,1
