In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pymongo
from pprint import pprint

In [2]:
# Connect to the MongoDB server on the local machine (no port is passed, so
# pymongo's default applies) and select the `metrics_causal_model` database
# that the query helpers in this notebook read from via the global `db`.
client = pymongo.MongoClient(host="127.0.0.1")
db = client.metrics_causal_model

In [3]:
def group_by(df, bycols, agg_map):
    """
    Group ``df`` by one or more columns and compute one aggregate per
    (column, function) pair, returning a single flat DataFrame.

    @param df:      DataFrame
    @param bycols:  str, list or tuple
                        Column(s) to group by
    @param agg_map: dictionary or iterable of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
                        The function may be a pandas aggregation name or a callable.
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed. Holds the
                        group-by columns plus one column per pair named "<func>(<column>)".
                        When ``agg_map`` is empty only the distinct group keys are returned.
    """
    # Normalise to a fresh list so that list concatenation below works for
    # strings, tuples and lists alike (and the caller's object is never touched).
    if isinstance(bycols, str):
        bycols = [bycols]
    else:
        bycols = list(bycols)

    if isinstance(agg_map, dict):
        agg_map = agg_map.items()
    agg_pairs = list(agg_map)

    if not agg_pairs:
        # Nothing to aggregate: return the group keys with the same ordering
        # and NaN handling that groupby applies in the normal path.
        keys = df[bycols].groupby(bycols).size().reset_index()
        return keys[bycols]

    grps = []
    for col, func in agg_pairs:
        # Callables are labelled by their name rather than their repr.
        func_name = func if isinstance(func, str) else getattr(func, "__name__", str(func))
        grp = df[bycols + [col]].groupby(bycols).agg(func).reset_index()
        grps.append(grp.rename(columns={col: "%s(%s)" % (func_name, col)}))

    # Every partial result shares the same group keys, so an inner merge
    # simply lines the aggregate columns up side by side.
    merged = grps[0]
    for grp in grps[1:]:
        merged = pd.merge(merged, grp, on=bycols, how="inner")
    return merged

In [4]:
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)
import hashlib

def hash_feats(fts):
    """
    Build a SHA-224 hex digest that identifies a collection of values.

    @param fts: object exposing a ``values`` attribute (e.g. a DataFrame row)
    @return:    str
                    Hex digest of the "|"-joined string forms of the values
    """
    digest = hashlib.sha224()
    digest.update("|".join(str(val) for val in fts.values).encode('utf-8'))
    return digest.hexdigest()

In [5]:
from Metrics import rpf1a_from_tp_fp_tn_fn
from collections import defaultdict

def tally_counts(r, filter):
    """
    Sum the "tp", "tn", "fp" and "fn" counts over the entries of ``r`` whose
    key is accepted by ``filter``.

    @param r:      dict
                       Maps a key to a value; accepted values must be indexable by
                       "tp", "tn", "fp" and "fn"
    @param filter: callable
                       Called with each key; the entry is tallied when it returns a truthy value
    @return:       defaultdict(int)
                       Summed counts (empty when no key is accepted)
    """
    props = ("tp", "tn", "fp", "fn")
    tally = defaultdict(int)
    accepted = (counts for key, counts in r.items() if filter(key))
    for counts in accepted:
        for prop in props:
            tally[prop] += counts[prop]
    return tally

def get_causal_relation_metrics(collection, params, include_concept_codes=True):
    """
    Read every document of a results collection and compute micro-averaged
    recall / precision / F1 over its causal-relation keys (those containing
    "->") and, optionally, over its concept-code keys (those starting with a digit).

    @param collection:            str
                                      Name of the collection in the global ``db`` to read
    @param params:                list of str, or comma separated str
                                      Keys of each document's "parameters" entry to copy into the result
    @param include_concept_codes: bool
                                      When True also emit the "concept_micro_*" columns
    @return:                      DataFrame
                                      One row per document, sorted by "cr_micro_f1" descending.
                                      Empty (with the expected columns) for an empty collection.
    """
    # Accept the same parameter spellings as get_df_sorted_by_f1score_generic.
    if not params:
        params = []
    elif isinstance(params, str):
        params = params.split(",")
    else:
        params = list(params)

    # (column prefix, key predicate) for each family of labels to score.
    groups = [("cr", lambda c: "->" in c)]
    if include_concept_codes:
        groups.append(("concept", lambda c: c[0].isdigit()))

    # Build the column list directly instead of joining/splitting a string, so an
    # empty ``params`` adds no "" column and parameter names are never filtered
    # out just because they contain the word "concept".
    fields = []
    for prefix, _ in groups:
        fields += [prefix + "_micro_f1", prefix + "_micro_rec", prefix + "_micro_prec"]
    fields += params

    dicts = []
    for r in db[collection].find({}):
        d = {}
        for prefix, key_filter in groups:
            counts = tally_counts(r, key_filter)
            # Unpacked in the order the helper is consumed throughout this
            # notebook: recall, precision, F1, and a fourth value that is unused.
            (rec, prec, f1, _unused) = rpf1a_from_tp_fp_tn_fn(
                counts["tp"], counts["fp"], counts["tn"], counts["fn"])
            d[prefix + "_micro_f1"] = f1
            d[prefix + "_micro_rec"] = rec
            d[prefix + "_micro_prec"] = prec
        parms = r["parameters"]
        for param in params:
            d[param] = parms[param]
        dicts.append(d)

    if not dicts:
        # Selecting columns from an empty frame would raise KeyError.
        return pd.DataFrame(columns=fields)
    return pd.DataFrame(dicts)[fields].sort_values("cr_micro_f1", ascending=False)

In [6]:
def round_data(df, places=3):
    """
    Return a copy of ``df`` in which every column whose name contains "micro_"
    is rendered as a fixed-point string.

    @param df:     DataFrame
                       Source frame; it is not modified
    @param places: int
                       Number of digits kept after the decimal point
    @return:       DataFrame
                       Copy of ``df``; the formatted columns hold strings
    """
    template = "{0:." + str(places) + "f}"
    rounded = df.copy()
    metric_cols = {name for name in rounded.columns.values if "micro_" in name}
    for name in metric_cols:
        rounded[name] = df[name].apply(template.format)
    return rounded

# Feature Selection on Shift Reduce Parser

In [7]:
# REMOVE the non generic parameters, like window size
def get_df_sorted_by_f1score_generic(collection, params=None, filter_cols=True):
    """
    Fetch the micro-averaged scores of every run stored in a results
    collection, best micro F1 first.

    @param collection:  str
                            Name of the collection in the global ``db`` to aggregate over
    @param params:      None, list of str, or comma separated str
                            Keys under each document's "parameters" to project as extra columns
    @param filter_cols: bool
                            When True keep only the three micro metric columns plus ``params``;
                            when False return every projected column (plus "hs_params", a hash
                            of the parameter values, when ``params`` were requested)
    @return:            DataFrame
                            One row per document that has a micro F1 score, sorted by it
                            descending. Empty (with the expected columns) when nothing matches.
    """
    if not params:
        params = []
    elif isinstance(params, str):
        params = params.split(",")
    else:
        params = list(params)

    metric_cols = ["micro_f1_score", "micro_recall", "micro_precision"]

    project = {
        "weighted_f1_score": "$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
        "micro_f1_score":    "$MICRO_F1.f1_score",
        "micro_recall":      "$MICRO_F1.recall",
        "micro_precision":   "$MICRO_F1.precision",

        # PARAMETERS
        "feats":             "$parameters.extractors",

        "asof":              "$asof",
        "_id": 1,
    }

    # No count for HMM. "count" is not projected above, so remove it only if it
    # is present; an unconditional `del` raised KeyError for every HMM collection.
    if "_hmm" in collection.lower():
        project.pop("count", None)

    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [
        {"$project": project},
        # Skip documents that have no micro F1 recorded.
        {"$match": {"micro_f1_score": {"$exists": True}}},
        {"$sort": {"micro_f1_score": -1}},
    ]

    rows = list(db[collection].aggregate(feats_pipeline))
    if not rows:
        # Sorting/selecting on a column-less frame would raise KeyError.
        if filter_cols:
            empty_cols = metric_cols + params
        else:
            empty_cols = list(project) + (["hs_params"] if params else [])
        return pd.DataFrame(columns=empty_cols)

    df = pd.DataFrame(rows).sort_values("micro_f1_score", ascending=False)
    if params:
        # One hash per row so identical parameter combinations can be matched up.
        df["hs_params"] = df[params].apply(hash_feats, axis=1)

    if filter_cols:
        return df[metric_cols + params]
    return df

### Top Result (Best VD Accuracy)

#### CB

In [8]:
# Collections to summarise in this cell (a single CB hyper-parameter collection).
col = ["CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD"]

rows = []
for coll in col:
    # "" requests no extra parameter columns, so only the micro metrics come back.
    df = get_df_sorted_by_f1score_generic(coll, "")
    # The helper returns rows sorted by micro F1 descending, so row 0 is the best run.
    dct = df.iloc[0].to_dict()
    dct["Algo"] = coll
    rows.append(dct)

# One row per collection; the sorted frame is the value the cell displays.
df=pd.DataFrame(rows)
df.sort_values("micro_f1_score", ascending=False)

Unnamed: 0,Algo,micro_f1_score,micro_precision,micro_recall
0,CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARA...,0.748462,0.791057,0.710219


#### SC

In [9]:
# Collections to summarise in this cell (a single SC hyper-parameter collection).
col = ["CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD"]

rows = []
for coll in col:
    # "" requests no extra parameter columns, so only the micro metrics come back.
    df = get_df_sorted_by_f1score_generic(coll, "")
    # The helper returns rows sorted by micro F1 descending, so row 0 is the best run.
    dct = df.iloc[0].to_dict()
    dct["Algo"] = coll
    rows.append(dct)

# One row per collection; the sorted frame is the value the cell displays.
df=pd.DataFrame(rows)
df.sort_values("micro_f1_score", ascending=False)

Unnamed: 0,Algo,micro_f1_score,micro_precision,micro_recall
0,CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARA...,0.810981,0.841144,0.782906


## Results Sorted by Hyper Params

#### CB

In [10]:
def extract_parameter(s, param_name):
    """
    Pull the value of one ``name=value`` pair out of an estimator repr string
    such as "LogisticRegression(C=0.1, penalty='l2')".

    @param s:          str
                           Text containing space separated ``name=value`` tokens
    @param param_name: str
                           Name of the parameter to look up
    @return:           str
                           Value of the first matching token, with commas and single quotes removed
    @raise IndexError: when no token names ``param_name``
    """
    # Parentheses become spaces so "Estimator(C=0.1," splits into clean tokens.
    tokens = s.replace("(", " ").replace(")", " ").split(" ")
    for token in tokens:
        # partition splits on the first "=" only, so a value that itself
        # contains "=" no longer breaks the key/value unpacking.
        key, sep, val = token.partition("=")
        if sep and key == param_name:
            return val.replace(",", "").replace("'", "")
    # Same exception type the original list lookup produced, with a usable message.
    raise IndexError("parameter %r not found in %r" % (param_name, s))

def extract_c_val(s):
    """Return the value of the ``C`` parameter found in ``s``."""
    return extract_parameter(s, "C")


def extract_penalty_val(s):
    """Return the value of the ``penalty`` parameter found in ``s``."""
    return extract_parameter(s, "penalty")


def extract_dual_val(s):
    """Return the value of the ``dual`` parameter found in ``s``."""
    return extract_parameter(s, "dual")


# Quick check on a sample estimator repr; the cell displays the extracted C value.
s = "LogisticRegression(C=0.1, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
extract_c_val(s)

'0.1'

In [11]:
def get_hyper_param_results(collection):
    """
    Load the hyper-parameter search results of ``collection`` and expand the
    stored algorithm description into separate "C", "penalty" and "dual" columns.

    @param collection: str
                           Name of the results collection to query
    @return:           DataFrame
                           Micro metric columns, "beta", "max_epochs", "num_feats_MEAN" and the
                           three extracted columns; the raw "algorithm" column is removed
    """
    results = get_df_sorted_by_f1score_generic(collection, "algorithm,beta,max_epochs,num_feats_MEAN", filter_cols=True)
    algo_reprs = results["algorithm"]
    extractors = (
        ("C", extract_c_val),
        ("penalty", extract_penalty_val),
        ("dual", extract_dual_val),
    )
    for name, extractor in extractors:
        results[name] = algo_reprs.apply(extractor)
    del results["algorithm"]
    return results

#### Validation

In [12]:
"""
micro_f1_score	micro_recall	micro_precision	beta	max_epochs	num_feats_MEAN	C	penalty	dual
0	0.748462	0.710219	0.791057	0.5	10	94761.2	0.1	l2	True
"""
col = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD"
# Query the collection once and reuse the frame, instead of running the same
# Mongo aggregation a second time just to display the head.
vd_results = get_hyper_param_results(col)
# Results arrive sorted by micro F1 descending, so row 0 is the best run; the
# training cell that follows filters on this row's hyper parameters.
top_row = vd_results.iloc[0]
vd_results.head(5)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,beta,max_epochs,num_feats_MEAN,C,penalty,dual
0,0.748462,0.710219,0.791057,0.5,10,94761.2,0.1,l2,True
1,0.748237,0.716423,0.783008,0.3,3,82068.4,0.1,l2,False
2,0.748174,0.710219,0.790414,0.5,20,96791.0,0.1,l2,True
3,0.747986,0.711679,0.788197,0.1,15,98675.6,0.1,l2,False
4,0.747986,0.711679,0.788197,0.3,20,98614.4,0.1,l2,True


#### Training

In [13]:
# Find the run in the _TD collection (the section heading labels it Training)
# that used the same hyper parameters as the best validation run.
col_td = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_TD"
df = get_hyper_param_results(col_td)
# `top_row` is set by the CB validation cell, which therefore has to run first.
df = df[(df.beta == top_row.beta) & (df.max_epochs == top_row.max_epochs) & (df.C == top_row.C)]
df = df[(df.penalty == top_row.penalty) & (df.dual == top_row.dual)]
df

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,beta,max_epochs,num_feats_MEAN,C,penalty,dual
288,0.837738,0.77938,0.905544,0.5,10,94761.2,0.1,l2,True


#### SC

#### Validation

In [17]:
"""
micro_f1_score	micro_recall	micro_precision	beta	max_epochs	num_feats_MEAN	C	penalty	dual
0	0.813308	0.805985	0.820764	1.00	3	55036.8	0.5	l2	False
"""
# NOTE(review): the expected-result string at the top of this cell (top F1
# 0.813308) differs from the output rendered for this cell (top F1 0.810981);
# confirm which run it records.
col = "CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD"
# Query the collection once and reuse the frame, instead of running the same
# Mongo aggregation a second time just to display the head.
vd_results = get_hyper_param_results(col)
# Results arrive sorted by micro F1 descending, so row 0 is the best run; the
# training cell that follows filters on this row's hyper parameters.
top_row = vd_results.iloc[0]
vd_results.head(15)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,beta,max_epochs,num_feats_MEAN,C,penalty,dual
0,0.810981,0.782906,0.841144,0.3,5,64879.2,0.5,l2,True
1,0.810255,0.7895,0.83213,0.75,5,69780.4,0.1,l2,True
2,0.809567,0.785443,0.835221,0.5,10,73494.0,0.1,l2,True
3,0.80874,0.779102,0.840722,0.5,5,64841.0,0.5,l2,True
4,0.808233,0.786711,0.830967,0.75,5,69999.2,0.1,l2,False
5,0.808142,0.78037,0.837963,1.0,10,65841.8,0.5,l2,True
6,0.807999,0.783921,0.833603,0.3,5,71814.4,0.1,l2,False
7,0.80789,0.779102,0.838886,1.0,10,65947.2,0.5,l2,False
8,0.807753,0.77682,0.841252,1.0,10,64564.4,1.0,l2,False
9,0.807531,0.78316,0.833468,0.3,5,71705.0,0.1,l2,True


#### Training

In [15]:
# Find the run in the _TD collection (the section heading labels it Training)
# that used the same hyper parameters as the best validation run.
col = "CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_TD"
df = get_hyper_param_results(col)
# `top_row` is set by the SC validation cell, which therefore has to run first.
df = df[(df.beta == top_row.beta) & (df.max_epochs == top_row.max_epochs) & (df.C == top_row.C)]
df = df[(df.penalty == top_row.penalty) & (df.dual == top_row.dual)]
df

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,beta,max_epochs,num_feats_MEAN,C,penalty,dual
39,0.897895,0.850558,0.950812,0.3,5,64879.2,0.5,l2,True


In [16]:
# Sentinel cell: reaching this line means every cell before it ran without raising.
print("Success")

Success
