## Get Best-Performing Hyperparameters

In [1]:
# Record which `python` is first on the shell PATH.
# NOTE(review): this is the shell's view and may differ from the kernel's interpreter — confirm.
!which python

/Users/simon.hughes/anaconda3/envs/phd_py36/bin/python


In [2]:
# Record the installed package versions reported by the `pip` on the shell PATH.
# NOTE(review): `!pip` runs in a subshell and may not target the kernel's environment — confirm.
!pip freeze

appnope==0.1.0
attrs==19.1.0
backcall==0.1.0
bleach==3.1.0
boto==2.47.0
boto3==1.9.18
botocore==1.12.18
bz2file==0.98
certifi==2016.2.28
costcla==0.5
cycler==0.10.0
cymem==1.31.2
cytoolz==0.8.2
decorator==4.4.0
defusedxml==0.5.0
dill==0.2.8.2
docutils==0.14
entrypoints==0.3
ftfy==4.4.3
gensim==0.13.4
h5py==2.7.0
html5lib==0.999
ipykernel==5.1.1
ipython==7.6.1
ipython-genutils==0.2.0
ipywidgets==7.4.2
jedi==0.14.0
Jinja2==2.10.1
jmespath==0.9.3
joblib==0.9.4
json5==0.8.4
jsonschema==3.0.1
jupyter==1.0.0
jupyter-client==5.2.4
jupyter-console==6.0.0
jupyter-core==4.4.0
jupyterlab==1.0.1
jupyterlab-server==1.0.0
MarkupSafe==1.1.1
matplotlib==2.0.0
mistune==0.8.4
murmurhash==0.26.4
nbconvert==5.5.0
nbformat==4.4.0
nltk==3.2.2
nose==1.3.7
notebook==5.7.8
numpy==1.15.2
pandas==0.19.2
pandocfilters==1.4.2
parso==0.5.0
pathlib==1.0.1
pexpect==4.7.0
pickleshare==0.7.5
plac==0.9.6
preshed==1.0.1
prometheus-client==0.7.1
prompt-toolkit==2.0.9
ptyprocess==0.6.0
pyea==0.2
Pygments==2.4.2
pymongo==3.

In [3]:
import pandas as pd
import pymongo

In [4]:
# Client for the MongoDB instance on localhost that holds the experiment metrics.
# serverSelectionTimeoutMS=100 limits server selection to 100 ms, so queries fail
# quickly when no server is reachable.
client = pymongo.MongoClient(serverSelectionTimeoutMS=100, host="127.0.0.1")
# Handle on the re-ranker metrics database. query_collection() obtains its own
# database handle from `client` rather than using this one.
db = client.metrics_causal_model_reranker

In [5]:
import datetime

def fmt_dt(date_time_str):
    """Reformat a ``str(datetime)`` timestamp as ``MM-DD-YYYY HH:MM``.

    Args:
        date_time_str: Timestamp text in the form ``YYYY-MM-DD HH:MM:SS`` with
            an optional ``.ffffff`` fractional-seconds suffix.

    Returns:
        The timestamp formatted as ``"%m-%d-%Y %H:%M"`` (seconds dropped).

    Raises:
        ValueError: If the text matches neither accepted layout.
    """
    # str(datetime) omits the fractional part when microsecond == 0, so a
    # mandatory ".%f" would reject those otherwise valid timestamps.
    if "." in date_time_str:
        layout = '%Y-%m-%d %H:%M:%S.%f'
    else:
        layout = '%Y-%m-%d %H:%M:%S'
    dt = datetime.datetime.strptime(date_time_str, layout)
    return dt.strftime("%m-%d-%Y %H:%M")

def query_collection(collection):
    """Load every run stored in a metrics collection into a DataFrame.

    Collections whose name contains ``"SENT_"`` are read from the
    ``metrics_causal_model_parser`` database; all others are read from
    ``metrics_causal_model_reranker``. Both are reached through the
    module-level ``client``.

    Args:
        collection: Name of the MongoDB collection to read.

    Returns:
        A DataFrame with one row per document, holding the document's
        ``parameters`` entries, its ``MICRO_F1`` entries and ``asof`` (as a
        string), sorted by ``f1_score`` in descending order. ``extractors`` is
        collapsed to a comma-separated string. An empty DataFrame is returned
        when the collection yields no documents.
    """
    if "SENT_" in collection:
        database = client.metrics_causal_model_parser
    else:
        database = client.metrics_causal_model_reranker

    # Rename the stored fields to the short keys used below.
    pipeline = [{
        "$project": {
            "params": "$parameters",
            "micro_f1": "$MICRO_F1",
            "asof": "$asof",
            "_id": 1,
        }
    }]
    docs = list(database[collection].aggregate(pipeline))
    if not docs:
        return pd.DataFrame([])

    def flatten(doc):
        # Merge hyper-parameters and metric values into one flat record;
        # metric keys overwrite parameter keys on a name clash.
        record = dict(doc["params"])
        record.update(doc["micro_f1"])
        record["asof"] = str(doc["asof"])
        return record

    frame = pd.DataFrame([flatten(doc) for doc in docs])
    frame["extractors"] = frame["extractors"].apply(lambda names: ",".join(names))
    # asof is left as the full str(datetime) text; fmt_dt can shorten it for display.
    return frame.sort_values(by="f1_score", ascending=False)

In [6]:
def get_df(collection):
    """Return the summary columns for every run in ``collection``, best F1 first.

    The model-specific columns depend on the collection name:

    * contains ``"SENT"``: no model-specific columns;
    * otherwise contains ``"PCPTRN"``: ``learning_rate``-based column set;
    * anything else: ``C`` / ``loss_type`` / ``pa_type`` column set.

    The metric columns (``f1_score``, ``precision``, ``recall``, ``asof``) are
    always placed first and ``extractors`` / ``num_feats_MEAN`` last.

    Args:
        collection: MongoDB collection name; must contain ``"_VD"``.

    Returns:
        A DataFrame restricted to the selected columns, in the row order
        produced by ``query_collection``. If the collection yields no rows, an
        empty DataFrame with those columns is returned.

    Raises:
        AssertionError: If ``"_VD"`` is not part of ``collection``.
        KeyError: If a non-empty result lacks one of the selected columns.
    """
    assert "_VD" in collection

    # Other recorded parameters (best_max_upd, best_min_prob, min_feat_freq)
    # are deliberately not part of the summary.
    if "SENT" in collection:
        model_cols = []
    elif "PCPTRN" in collection:
        model_cols = [
            "best_top_n", "learning_rate", "best_max_parses",
            "max_update_items",
            "initial_weight", "early_stopping_iters",
        ]
    else:
        model_cols = [
            "best_top_n", "C", "best_max_parses",
            "max_update_items",
            "initial_weight", "loss_type",
            "pa_type", "early_stopping_iters",
        ]

    # add common cols
    cols = ["f1_score", "precision", "recall", "asof"] + model_cols + ["extractors", "num_feats_MEAN"]
    df = query_collection(collection)
    if df.empty:
        # query_collection returns a column-less frame for an empty collection;
        # indexing it with `cols` would raise an uninformative KeyError.
        return pd.DataFrame(columns=cols)
    return df[cols]

In [7]:
# Highest-f1 run for this collection (query_collection sorts by f1_score, descending).
col = "SC_STR_PCPTRN_RE-RANKER_HYPER_PARAM_VD"
get_df(col).head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight,early_stopping_iters,extractors,num_feats_MEAN
22,0.810473,0.851438,0.773269,2019-06-28 18:49:08.036000,5,0.5,300,1,0.01,2,"num_crels,Inv-,Prob-",50.8


In [8]:
# MONGO_COLLECTION = "SC_RE-RANKER_HYPER_PARAM_VD"
# MONGO_COLLECTION = "SC_COST_INSENS_RE-RANKER_HYPER_PARAM_VD"

## CB

### VD

In [9]:
# "SENT_" collections are read from the parser metrics database and get only the
# common columns; head(1) is the highest-f1 run.
df_pa = get_df("SENT_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,extractors,num_feats_MEAN
0,0.74358,0.758542,0.729197,2019-05-03 14:18:47.905000,"single_words,between_word_features,label_set,t...",27479.6


In [10]:
# df_pa = query_collection("CB_STR_PCPTRN_RE-RANKER_FEATURE_SEL_VD")[["f1_score", "early_stopping_iters", "extractors"]]
# df_pa.head(1)

In [11]:
# "PCPTRN" collections use the learning_rate column set; head(1) is the highest-f1 run.
df_pa = get_df("CB_STR_PCPTRN_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight,early_stopping_iters,extractors,num_feats_MEAN
29,0.741353,0.782873,0.704015,2019-06-24 20:43:02.777000,1,0.1,300,1,0.01,2,"Prob-,Above-",33.0


In [12]:
# Neither "SENT" nor "PCPTRN" in the name, so get_df uses its default column set
# (C, loss_type, pa_type, ...); head(1) is the highest-f1 run.
df_pa = get_df("CB_PA_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
8,0.743145,0.760015,0.727007,2019-06-30 15:00:24.686000,2,0.0005,300,1,0.01,None - cost insens,1,1,CREL_,1371.8


In [13]:
# Default column set (C, loss_type, pa_type, ...); head(1) is the highest-f1 run.
df_pa = get_df("CB_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
37,0.746014,0.777294,0.717153,2019-06-18 01:55:37.752000,2,0.0025,300,1,0.01,ml,1,3,"CREL_,Prob-,CChainStats-",1410.6


### Test

In [14]:
# "SENT_" collection, so read from the parser metrics database; shows up to 10 runs,
# best f1_score first.
df_pa = get_df("SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,extractors,num_feats_MEAN
0,0.737027,0.710169,0.765996,2019-05-03 14:19:17.204000,"single_words,between_word_features,label_set,t...",30367.0


In [15]:
# "PCPTRN" column set; shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_CB_STR_PCPTRN_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight,early_stopping_iters,extractors,num_feats_MEAN
0,0.749771,0.751838,0.747715,2019-07-05 14:21:05.573000,1,0.1,300,1,0.01,2,"Prob-,Above-",33.0


In [16]:
# Default column set (C, loss_type, pa_type, ...); shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_CB_PA_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
0,0.740741,0.715503,0.767824,2019-07-05 14:40:40.998000,2,0.0005,300,1,0.01,None - cost insens,1,1,CREL_,1504.0


In [17]:
# To improve upon this, you can use early stopping
# Default column set (C, loss_type, pa_type, ...); shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_CB_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
0,0.741208,0.731317,0.751371,2019-07-05 14:51:25.146000,2,0.0025,300,1,0.01,ml,1,3,"CREL_,Prob-,CChainStats-",1543.0


## SC

### VD

In [18]:
# "SENT_" collections are read from the parser metrics database and get only the
# common columns; head(1) is the highest-f1 run.
df_pa = get_df("SENT_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,extractors,num_feats_MEAN
0,0.80962,0.860857,0.764139,2019-05-03 14:27:42.611000,"three_words,between_word_features,size_feature...",26260.0


In [19]:
# cols = ["f1_score", "extractors", "early_stopping_iters"]
# query_collection("SC_STR_PCPTRN_RE-RANKER_FEATURE_SEL_VD_2").head(1)[cols]

In [21]:
# "PCPTRN" collections use the learning_rate column set; head(1) is the highest-f1 run.
df_pa = get_df("SC_STR_PCPTRN_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight,early_stopping_iters,extractors,num_feats_MEAN
22,0.810473,0.851438,0.773269,2019-06-28 18:49:08.036000,5,0.5,300,1,0.01,2,"num_crels,Inv-,Prob-",50.8


In [22]:
# Default column set (C, loss_type, pa_type, ...); head(1) is the highest-f1 run.
df_pa = get_df("SC_PA_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
4,0.806789,0.868206,0.753487,2019-06-30 15:13:25.720000,1,0.01,300,1,0.01,None - cost insens,1,6,"num_crels,Inv-",37.0


In [23]:
# Default column set (C, loss_type, pa_type, ...); head(1) is the highest-f1 run.
df_pa = get_df("SC_RE-RANKER_HYPER_PARAM_VD")
df_pa.head(1)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
41,0.808089,0.856129,0.765153,2019-06-18 10:15:07.357000,3,0.01,300,1,0.01,ml,1,1,"CREL_,CChain-,Prob-",1386.4


### Test

In [24]:
# "SENT_" collection, so read from the parser metrics database; shows up to 10 runs,
# best f1_score first.
df_pa = get_df("SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,extractors,num_feats_MEAN
0,0.827273,0.856471,0.8,2019-05-03 14:28:38.825000,"three_words,between_word_features,size_feature...",28839.0


In [25]:
# "PCPTRN" column set; shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_SC_STR_PCPTRN_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,learning_rate,best_max_parses,max_update_items,initial_weight,early_stopping_iters,extractors,num_feats_MEAN
0,0.82716,0.845183,0.80989,2019-07-05 13:35:15.128000,5,0.5,300,1,0.01,2,"num_crels,Inv-,Prob-",51.0


In [26]:
# Default column set (C, loss_type, pa_type, ...); shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_SC_PA_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
0,0.828457,0.866747,0.793407,2019-07-05 14:45:29.204000,1,0.01,300,1,0.01,None - cost insens,1,6,"num_crels,Inv-",37.0


In [27]:
# Default column set (C, loss_type, pa_type, ...); shows up to 10 runs, best f1_score first.
df_pa = get_df("TEST_SC_RE-RANKER_VD")
df_pa.head(10)

Unnamed: 0,f1_score,precision,recall,asof,best_top_n,C,best_max_parses,max_update_items,initial_weight,loss_type,pa_type,early_stopping_iters,extractors,num_feats_MEAN
0,0.826111,0.846597,0.806593,2019-07-05 14:54:53.566000,3,0.01,300,1,0.01,ml,1,1,"CREL_,CChain-,Prob-",1488.0


In [33]:
# Query the collection directly (bypassing get_df) and keep three columns;
# head(1) is the highest-f1 run because query_collection sorts by f1_score, descending.
df_pa = query_collection("CB_STR_PCPTRN_RE-RANKER_FEATURE_SEL_VD")[["f1_score","learning_rate","max_update_items"]]
df_pa.head(1)

Unnamed: 0,f1_score,learning_rate,max_update_items
15,0.740483,0.3,2


In [34]:
# Query the collection directly (bypassing get_df) and keep three columns;
# head(1) is the highest-f1 run because query_collection sorts by f1_score, descending.
df_pa = query_collection("SC_STR_PCPTRN_RE-RANKER_FEATURE_SEL_VD")[["f1_score","learning_rate","max_update_items"]]
df_pa.head(1)

Unnamed: 0,f1_score,learning_rate,max_update_items
19,0.806631,0.3,2
