In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pymongo

In [6]:
def group_by(df, bycols, agg_map):
    """
    Group df by one or more columns and compute one aggregate per (column, function) pair.

    @param df:      DataFrame
    @param bycols:  str or list
                        Column(s) to group by (any other sequence of names, e.g. a tuple, is accepted too)
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed.
                        Holds the group-by columns followed by one "<function>(<column>)" column per pair;
                        when agg_map is empty only the group-by columns are returned, one row per group
    """
    if isinstance(bycols, str):
        bycols = [bycols]
    else:
        # Copy so that tuples work with the list concatenation below
        bycols = list(bycols)

    if isinstance(agg_map, dict):
        agg_map = agg_map.items()
    pairs = list(agg_map)

    if not pairs:
        # Nothing to aggregate: return just the distinct group keys, in groupby order
        return df.groupby(bycols).size().reset_index()[bycols]

    merged = None
    for col, func in pairs:
        # Name callables by their function name rather than their repr (which embeds a memory address)
        label = getattr(func, "__name__", func)
        part = (df[bycols + [col]]
                .groupby(bycols)
                .agg(func)
                .reset_index()
                .rename(columns={col: "%s(%s)" % (label, col)}))
        if merged is None:
            merged = part
        else:
            merged = pd.merge(merged, part, on=bycols, how="inner")
    return merged

In [8]:
# Connect to MongoDB with the client defaults and open the "metrics" database.
client = pymongo.MongoClient()
db = client["metrics"]

model = "CRF_LBL_POWERSET_HYPERPARAM_OPT"

# One collection per name prefix (CB/SC x TD/VD), each suffixed with the model name.
cb_td = db["CB_TAGGING_TD_" + model]
cb_vd = db["CB_TAGGING_VD_" + model]
sc_td = db["SC_TAGGING_TD_" + model]
sc_vd = db["SC_TAGGING_VD_" + model]

collections = [cb_td, cb_vd, sc_td, sc_vd]
collections

[Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'CB_TAGGING_TD_CRF_LBL_POWERSET_HYPERPARAM_OPT'),
 Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'CB_TAGGING_VD_CRF_LBL_POWERSET_HYPERPARAM_OPT'),
 Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'SC_TAGGING_TD_CRF_LBL_POWERSET_HYPERPARAM_OPT'),
 Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'metrics'), u'SC_TAGGING_VD_CRF_LBL_POWERSET_HYPERPARAM_OPT')]

In [116]:
from pprint import pprint
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)

import hashlib

def hash_feats(fts):
    """
    Fingerprint one row of values so that rows holding the same values get the same id.

    @param fts: pandas Series (or any object exposing its items through `.values`)
                    Values to fingerprint, in the order they are joined
    @return:    str
                    Hex SHA-224 digest of the values rendered with str() and joined with "|"
    """
    joined = "|".join(map(str, fts.values))
    # hashlib only accepts bytes: a Python 2 str already is bytes, a Python 3 str must be encoded first
    if not isinstance(joined, bytes):
        joined = joined.encode("utf-8")
    return hashlib.sha224(joined).hexdigest()

def get_df_sorted_by_f1score(collection, params=None, database=None):
    """
    Load the scored documents of one collection into a DataFrame, highest micro F1 first.

    @param collection: str
                        Name of the collection to read, looked up as database[collection]
    @param params:     str, list or None
                        Names of fields under "parameters" to add as extra columns,
                        either as a comma separated string or as a sequence of names
    @param database:   mapping from collection name to collection, or None
                        Where to look the collection up; defaults to the notebook-level `db`
    @return:           DataFrame
                        One row per document that has a micro F1 score, sorted by "micro_f1_score" descending.
                        When params are given, "hs_params" holds hash_feats() of those columns.
                        An empty frame carrying the expected column names is returned when nothing matches
    """
    if not params:
        params = []
    elif isinstance(params, str):
        params = params.split(",")
    # Work on a plain list (df[...] would read a tuple as one column key) and
    # drop stray whitespace / empty names left over from splitting
    params = [name.strip() for name in params if name.strip()]

    if database is None:
        database = db

    project = {
            "weighted_f1_score":"$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
            "micro_f1_score":  "$MICRO_F1.f1_score",
            "micro_recall":    "$MICRO_F1.recall",
            "micro_precision": "$MICRO_F1.precision",
    # PARAMETERS
            "window_size":    "$parameters.window_size",
            "feats":          "$parameters.extractors",
            "count": {        "$size" : "$parameters.extractors" },
            "asof" :          "$asof",
            "_id":1
    }
    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [
        {"$project": project},
        # Only keep documents where the projected micro F1 score is present
        {"$match": {"micro_f1_score": {"$exists": True}}},
        {"$sort": {"micro_f1_score": -1}},
    ]

    rows = list(database[collection].aggregate(feats_pipeline))
    if not rows:
        # sort_values would raise KeyError on a frame without columns
        empty_cols = sorted(project)
        if params:
            empty_cols.append("hs_params")
        return pd.DataFrame(columns=empty_cols)

    df = pd.DataFrame(rows).sort_values("micro_f1_score", ascending=False)
    if params:
        # One hash per row over the requested parameter columns, to spot repeated settings
        df["hs_params"] = df[params].apply(hash_feats, axis=1)
    return df

# Columns to display: extractor count and score first, then the three CRF hyper-parameters.
cols = [
    "count",
    "micro_f1_score",
    "feature_possible_states",
    "feature_possible_transitions",
    "c2",
]
# The hyper-parameter names (everything after the first two display columns) are
# the extra "parameters" fields requested from the collection.
df = get_df_sorted_by_f1score(
    "CB_TAGGING_VD_CRF_LBL_POWERSET_HYPERPARAM_OPT",
    cols[2:],
)
df[cols]

Unnamed: 0,count,micro_f1_score,feature_possible_states,feature_possible_transitions,c2
0,6,0.829887,False,True,1.0
1,6,0.829362,False,False,1.0
2,6,0.828921,False,True,0.5
3,6,0.827848,False,True,2.0
4,6,0.826696,False,False,0.1
5,6,0.826487,False,True,0.1
6,6,0.825993,False,True,3.0
7,6,0.823951,False,True,4.0
8,6,0.823901,False,True,5.0
9,6,0.816441,False,False,10.0


In [117]:
# Switch to the averaged perceptron runs and count how often each parameter hash occurs.
model = "AVG_PERCEPTRON_MULTICLASS"
df = get_df_sorted_by_f1score(
    "CB_TAGGING_VD_%s" % model,
    "prev_tag_sharing,num_iterations,tag_history,combo_freq_threshold",
)
df["hs_params"].value_counts()

6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b44940dd6a23    2
7f0a96fda2ae0875792ae14fce12cfe6d3a9c5e8f08616ca1e18ac68    1
6d294ca66dad7c2957ee0a439357e8b784d0a9a23db30f9c5c32e0fc    1
a7e34bc128be49015484a2803157e647da1df2b9a34758b7b714bd69    1
8bb06aad8c2f2c24fc0b755e4889492dd12bbab4cb2c9c4280c8097f    1
d23e22c652eb0bf03349a5ee523dd1dd782332e9c7083b52e34e86ff    1
cb04724fb2be2bbb37e8eb16100d53eb054ff3091ee9c6b29d99107e    1
9522e1be4dc132575d87e0ad8fb6c52e430b1c6fedcd4ba7dcf2040d    1
6df2271cb60bd5708083a373b39910a842573f7349840c477d4034d8    1
1c0078ca5dc9dcc2e1b58c920ecea9fcd609d20eec07c7a4e2de19d1    1
ac1f5f3d41cf9464751808636031a972499746b8cd7664015c8723ee    1
709ad79abc4fa79c8f70314acbd828b2f72661f9bbccf6a688132683    1
549cf73100940b34b184f0c2bc1733662af22d56c9951693b986cb62    1
62671cd202235a7b5c34ba29edd1e6c6162f10040cbaa4691c524ca8    1
3e1de563fbf3f2d25013b01d3a1a5c8ad55aa6c7801308aa62d34313    1
3529c6512558b4ca164b679a246e7fcc8b9768b63381b3cd63bb2c9e    1
d6917f70

In [120]:
# Rows recorded under one specific parameter hash; index the result with the
# parameter column names to narrow the view to just those settings.
df.loc[df["hs_params"] == "6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b44940dd6a23"]

Unnamed: 0,_id,asof,combo_freq_threshold,count,feats,micro_f1_score,micro_precision,micro_recall,num_iterations,prev_tag_sharing,tag_history,weighted_f1_score,window_size,hs_params
3,580da4fec1a7cbf7d6ad18e5,2016-10-24 01:06:54.502,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837016,0.886593,0.792689,10,True,15,0.830776,11,6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b4...
22,580c855ac1a7cbdacdb3de06,2016-10-23 04:39:38.176,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.834754,0.885112,0.789818,10,True,15,0.828199,11,6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b4...


In [130]:
# Display the frame currently bound to `df`; get_df_sorted_by_f1score returns it
# ordered by micro_f1_score, best first.
df

Unnamed: 0,_id,asof,combo_freq_threshold,count,feats,micro_f1_score,micro_precision,micro_recall,num_iterations,prev_tag_sharing,tag_history,weighted_f1_score,window_size,hs_params
0,580c7b4bc1a7cbdacdb3de04,2016-10-23 03:56:43.655,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837699,0.887409,0.793264,10,True,10,0.832122,11,3529c6512558b4ca164b679a246e7fcc8b9768b63381b3...
1,580d7c8fc1a7cbf44e46d0f5,2016-10-23 22:14:23.215,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837588,0.890081,0.790942,5,True,3,0.830714,11,a7e34bc128be49015484a2803157e647da1df2b9a34758...
2,580d73c1c1a7cbf44e46d0f1,2016-10-23 21:36:49.262,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837174,0.889588,0.790592,5,True,1,0.831135,11,7e60e7390ebd723e74785fc96b261adb53e63d7e4bd893...
3,580da4fec1a7cbf7d6ad18e5,2016-10-24 01:06:54.502,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837016,0.886593,0.792689,10,True,15,0.830776,11,6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b4...
4,580d781bc1a7cbf44e46d0f3,2016-10-23 21:55:23.093,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.836976,0.889048,0.790667,5,True,2,0.831062,11,23426360dddef273a4970971963ca808c146d63ee0d9d3...
5,580d8ebbc1a7cbf7d6ad18e1,2016-10-23 23:31:55.576,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.836734,0.885682,0.792914,10,True,8,0.831182,11,709ad79abc4fa79c8f70314acbd828b2f72661f9bbccf6...
6,580c7076c1a7cbdacdb3de02,2016-10-23 03:10:30.662,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.836555,0.886498,0.79194,10,True,5,0.830724,11,26cfd54ee58cb59788b598cefe4c52c2c5723969af1b76...
7,580d6f85c1a7cbf44e46d0ef,2016-10-23 21:18:45.796,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.83646,0.88883,0.789918,5,True,0,0.83015,11,cb04724fb2be2bbb37e8eb16100d53eb054ff3091ee9c6...
8,580c59e3c1a7cbdacdb3ddfe,2016-10-23 01:34:11.096,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.836346,0.886027,0.79194,10,True,1,0.830413,11,d6917f70e519328266aa924fb1a7b436a83bc65a3ac85e...
9,580d9af8c1a7cbf7d6ad18e3,2016-10-24 00:24:08.213,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.836121,0.885554,0.791915,10,True,12,0.83002,11,6d294ca66dad7c2957ee0a439357e8b784d0a9a23db30f...


In [132]:
# Same lookup by parameter hash, written as a membership test so more hashes can be listed.
df.loc[df["hs_params"].isin(["6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b44940dd6a23"])]

Unnamed: 0,_id,asof,combo_freq_threshold,count,feats,micro_f1_score,micro_precision,micro_recall,num_iterations,prev_tag_sharing,tag_history,weighted_f1_score,window_size,hs_params
3,580da4fec1a7cbf7d6ad18e5,2016-10-24 01:06:54.502,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.837016,0.886593,0.792689,10,True,15,0.830776,11,6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b4...
22,580c855ac1a7cbdacdb3de06,2016-10-23 04:39:38.176,5,6,"[fn_pos_wd_feats_stemmed[offset:5], fn_pos_ngr...",0.834754,0.885112,0.789818,10,True,15,0.828199,11,6989d0cf246f3bece8a1e87fcbf614e43e21feea60b5b4...


In [133]:
# Timestamps of the loaded documents in chronological order.
# Note: the column is selected by label because `df.asof` is a DataFrame method.
sorted(df.loc[:, "asof"].values)

[numpy.datetime64('2016-10-23T00:47:49.642000000'),
 numpy.datetime64('2016-10-23T01:34:11.096000000'),
 numpy.datetime64('2016-10-23T02:21:45.581000000'),
 numpy.datetime64('2016-10-23T03:10:30.662000000'),
 numpy.datetime64('2016-10-23T03:56:43.655000000'),
 numpy.datetime64('2016-10-23T04:39:38.176000000'),
 numpy.datetime64('2016-10-23T05:15:45.307000000'),
 numpy.datetime64('2016-10-23T06:17:14.175000000'),
 numpy.datetime64('2016-10-23T07:10:05.447000000'),
 numpy.datetime64('2016-10-23T08:04:34.405000000'),
 numpy.datetime64('2016-10-23T09:01:56.280000000'),
 numpy.datetime64('2016-10-23T10:16:49.350000000'),
 numpy.datetime64('2016-10-23T11:26:53.820000000'),
 numpy.datetime64('2016-10-23T19:01:35.079000000'),
 numpy.datetime64('2016-10-23T19:55:05.809000000'),
 numpy.datetime64('2016-10-23T20:09:05.817000000'),
 numpy.datetime64('2016-10-23T20:21:47.180000000'),
 numpy.datetime64('2016-10-23T20:34:10.061000000'),
 numpy.datetime64('2016-10-23T20:46:53.685000000'),
 numpy.datet