In [10]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
#import seaborn as sns
import pymongo
from pprint import pprint

In [11]:
client = pymongo.MongoClient()
db = client.metrics

In [12]:
def group_by(df, bycols, agg_map):
    """

    @param df:      DataFrame
    @param bycols:  str or list
                        Column(s) to group by
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean"]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed
    """
    grps = []
    if type(bycols) == str:
        bycols = [bycols]

    if type(agg_map) == dict:
        agg_map = agg_map.items()

    for k,v in agg_map:
        grp = df[bycols + [k]].groupby(bycols, ).agg(v)
        grp.reset_index(inplace=True)
        grp["%s(%s)" % (v,k)] = grp[k]
        del grp[k]
        grps.append(grp)

    m = grps[0]
    for grp in grps[1:]:
        m = pd.merge(m, grp, on=bycols, how="inner")
    return m

In [54]:
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)
import hashlib

def hash_feats(fts):
    vals = fts.values
    joined = "|".join(map(lambda s: str(s),vals)).encode('utf-8') 
    return hashlib.sha224(joined).hexdigest()

def get_df_sorted_by_f1score(collection, params=None, filter_cols=True):
    if not params:
        params = []
    if type(params) == str:
        params = params.split(",")
    
    project = {
            "weighted_f1_score":"$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
            "micro_f1_score":  "$MICRO_F1.f1_score",
            "micro_recall":    "$MICRO_F1.recall",
            "micro_precision": "$MICRO_F1.precision",
    
    # PARAMETERS            
            "window_size":    "$parameters.window_size",
            "feats":          "$parameters.extractors",
            "count": {        "$size" : "$parameters.extractors" },
            "asof" :          "$asof",
            "_id":1
    }
    
    # No count for HMM
    if "_hmm" in collection.lower():
        del project["count"]
    
    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [{
        "$project": project
    },
    {
        "$match":{
            "micro_f1_score": { "$exists" : True }        
        }
    },
    {
        "$sort":{
            "micro_f1_score": -1
        }
    },
    ]
    
    rows = [row for row in db[collection].aggregate(feats_pipeline)]
    df = pd.DataFrame(rows).sort_values("micro_f1_score", ascending=False)
    if params:
        df["hs_params"] = df[params].apply(hash_feats, axis=1)
        
    if filter_cols:
        cols = ["micro_f1_score", "micro_recall" ,"micro_precision" ] + params
        return df[cols]
    return df

In [55]:
def get_window_classifier_results(prefix):
    collections = "WINDOW_CLASSIFIER_BR,WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS,WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS".split(",")
    dfs = []
    for c in collections:
        col = prefix + c
        print(col)
        df = dict(get_df_sorted_by_f1score(col).iloc[0,:])
        df["Collection_" + prefix[:-1]] = col.replace(prefix,"")
        dfs.append(df)
    return pd.DataFrame(dfs).sort_values("micro_f1_score", ascending=False)

# Which Problem Transformation Method Was Best?

In [56]:
def round_data(df, places=3):
    df_copy = df.copy()
    fmt_str = "{0:." + str(places) + "f}"
    cols = set([v for v in df_copy.columns.values if "micro_" in v])
    for c in cols:
        df_copy[c] = df[c].apply(lambda d: fmt_str.format(d))  
    return df_copy

## Coral Bleaching

In [57]:
df = get_window_classifier_results("CB_TAGGING_VD_")
df = round_data(df, 4)
df["Collection_CB_TAGGING_VD,micro_f1_score,micro_recall,micro_precision".split(",")]

CB_TAGGING_VD_WINDOW_CLASSIFIER_BR
CB_TAGGING_VD_WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS
CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS


Unnamed: 0,Collection_CB_TAGGING_VD,micro_f1_score,micro_recall,micro_precision
2,WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS,0.8317,0.7863,0.8826
1,WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS,0.8316,0.7861,0.8827
0,WINDOW_CLASSIFIER_BR,0.8247,0.7708,0.8866


## Skin Cancer

In [58]:
df = get_window_classifier_results("SC_TAGGING_VD_")
for c in ["micro_f1_score","micro_recall","micro_precision"]:
    df[c] = df[c].apply(lambda d: "{0:.4f}".format(d))  
df["Collection_SC_TAGGING_VD,micro_f1_score,micro_recall,micro_precision".split(",")]

SC_TAGGING_VD_WINDOW_CLASSIFIER_BR
SC_TAGGING_VD_WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS
SC_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS


Unnamed: 0,Collection_SC_TAGGING_VD,micro_f1_score,micro_recall,micro_precision
1,WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS,0.8087,0.7749,0.8456
2,WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS,0.8087,0.7749,0.8456
0,WINDOW_CLASSIFIER_BR,0.8011,0.7592,0.8478


** Unsurprisingly in this case, as there were only two MLC labels, the score for LBL powerset and Common tag are the same **

** HOWEVER - why is the multiclass version that much better? It does OVR, and with only 2 records difference, this makes no sense to me **

# Hyper Parameter Tuning Results

In [59]:
# Rows to print - df.head
ROWS = 5

## Window Based Classifier - Hyper Parameter Tuning

### Coral Bleaching

In [60]:
params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_HYPER_PARAM_TUNING"

df = get_df_sorted_by_f1score(collection, params)

round_data(df.head(ROWS),4)
#df

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,dual,C,penalty,fit_intercept,multi_class,window_size
0,0.8319,0.7814,0.8892,True,0.5,l2,True,ovr,9
1,0.8318,0.7814,0.8892,False,0.5,l2,True,ovr,9
2,0.8317,0.7863,0.8826,True,1.0,l2,True,ovr,9
3,0.8317,0.7863,0.8826,False,1.0,l2,True,ovr,9
4,0.8286,0.7863,0.8756,False,1.0,l1,True,ovr,9


### Skin Cancer

In [61]:
params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "SC_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_HYPER_PARAM_TUNING"

df = get_df_sorted_by_f1score(collection, params)
#df = df[df.dual==True]
#df = df[df.C==0.5]
#df = df[df.penalty =='l2']
round_data(df.head(ROWS),3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,dual,C,penalty,fit_intercept,multi_class,window_size
0,0.81,0.773,0.85,True,0.5,l2,True,ovr,9
1,0.81,0.772,0.85,False,0.5,l2,True,ovr,9
2,0.809,0.775,0.846,False,1.0,l2,True,ovr,9
3,0.809,0.775,0.846,True,1.0,l2,True,ovr,9
4,0.809,0.777,0.843,False,1.0,l1,True,ovr,9


### <span style="color:red">Optimal Hyper Parameters are the same for both datasets </span>

## CRF Performance - Hyper Parameter Tuning 

### Coral Bleaching

In [62]:
cols = "micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size".split(",")
df = get_df_sorted_by_f1score("CB_TAGGING_VD_CRF_MOST_COMMON_TAG_HYPERPARAM_OPT",
                         "feature_possible_states,feature_possible_transitions,c2,window_size".split(","))

#df = df[df.c2==0.1]
#df = df[df.feature_possible_transitions==True]

round_data(df[cols].head(ROWS), 3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size
0,0.831,0.781,0.888,False,True,0.1,9
1,0.831,0.782,0.888,False,False,0.1,9
2,0.831,0.775,0.896,False,False,1.0,9
3,0.83,0.775,0.895,False,True,1.0,9
4,0.813,0.743,0.897,False,True,10.0,9


### Skin Cancer

In [63]:
cols = "micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size".split(",")
df = get_df_sorted_by_f1score("SC_TAGGING_VD_CRF_MOST_COMMON_TAG_HYPERPARAM_OPT",
                         "feature_possible_states,feature_possible_transitions,c2,window_size".split(","))

#df = df[df.c2==1.0]
#df = df[df.feature_possible_transitions==True]
round_data(df[cols].head(ROWS), 3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size
0,0.799,0.758,0.846,False,True,1.0,9
1,0.798,0.756,0.844,False,False,1.0,9
2,0.789,0.754,0.827,False,False,0.1,9
3,0.789,0.754,0.827,False,True,0.1,9
4,0.785,0.723,0.859,False,True,10.0,9


### <span style="color:red">Optimal C2 differs between datasets</span>

## HMM - Hyper Parameter Tuning (Features in this case)

### Coral Bleaching

In [64]:
params = "extractors".split(",")
collection = "CB_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,extractors
0,0.773,0.797,0.75,stemmed_unigrams
1,0.769,0.785,0.754,unigrams


### Skin Cancer

In [65]:
params = "extractors".split(",")
collection = "SC_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,extractors
0,0.674,0.733,0.625,stemmed_unigrams
1,0.672,0.719,0.631,unigrams


## Average Perceptron - Hyper Parameter Tuning Results

### Coral Bleaching

In [66]:
model = "AVG_PERCEPTRON_MOST_COMMON_TAG_HYPER_PARAM_TUNING_NEW"
df = get_df_sorted_by_f1score("CB_TAGGING_VD_" + model, 
                              "average_weights,tag_history,window_size")
df.head(ROWS)["micro_f1_score,micro_recall,micro_precision,average_weights,tag_history,window_size,window_size".split(",")]

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,average_weights,tag_history,window_size,window_size.1
0,0.829022,0.778135,0.88703,True,1,9,9
1,0.819187,0.760997,0.887012,True,0,9,9
2,0.778358,0.73553,0.826481,False,1,9,9
3,0.75463,0.739349,0.770557,False,0,9,9


### Skin Cancer

In [76]:
model = "AVG_PERCEPTRON_MOST_COMMON_TAG_HYPER_PARAM_TUNING_NEW"
df = get_df_sorted_by_f1score("SC_TAGGING_VD_" + model, 
                              "average_weights,tag_history,window_size")
df.head(ROWS)["micro_f1_score,micro_recall,micro_precision,average_weights,tag_history,window_size,window_size".split(",")]

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,average_weights,tag_history,window_size,window_size.1
0,0.808588,0.767088,0.854835,True,1,9,9
1,0.802696,0.755411,0.856295,True,0,9,9
2,0.749183,0.734237,0.764749,False,1,9,9
3,0.703646,0.701285,0.706023,False,0,9,9


### <span style="color:red">Optimal tag_history and tag_ngram_size differ between datasets</span>

## RNN

### Coral Bleaching

In [78]:
model = "RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"
feats = "use_pretrained_embedding,bi-directional,hidden_size"
df = get_df_sorted_by_f1score("CB_TAGGING_VD_" + model, feats)
round_data(df.head(ROWS)[("micro_f1_score,micro_recall,micro_precision," + feats).split(",")])

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,use_pretrained_embedding,bi-directional,hidden_size
0,0.837,0.822,0.853,True,True,128
1,0.834,0.81,0.858,True,True,128
2,0.833,0.809,0.857,True,True,128
3,0.832,0.817,0.848,True,True,128
4,0.831,0.818,0.845,True,True,128


In [83]:
df

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,use_pretrained_embedding,bi-directional,hidden_size
0,0.82129,0.820946,0.821635,True,True,128
1,0.820644,0.798228,0.844355,True,True,128
2,0.81991,0.813899,0.826011,True,True,128
3,0.818297,0.810073,0.82669,True,True,128
4,0.818031,0.804101,0.832453,True,True,128
5,0.813924,0.789302,0.840131,True,True,128
6,0.808406,0.786786,0.831248,True,True,128
7,0.806134,0.798195,0.814233,False,True,128
8,0.805967,0.79957,0.812466,False,True,128
9,0.805621,0.785846,0.826417,False,True,128


### Skin Cancer

In [82]:
model = "RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"
feats = "use_pretrained_embedding,bi-directional,hidden_size"
df = get_df_sorted_by_f1score("SC_TAGGING_VD_" + model, feats)
round_data(df.head(ROWS)[("micro_f1_score,micro_recall,micro_precision," + feats).split(",")])

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,use_pretrained_embedding,bi-directional,hidden_size
0,0.821,0.821,0.822,True,True,128
1,0.821,0.798,0.844,True,True,128
2,0.82,0.814,0.826,True,True,128
3,0.818,0.81,0.827,True,True,128
4,0.818,0.804,0.832,True,True,128


# Test Set Performance

## Window Based Classifier

### Coral Bleaching

In [68]:
params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "TEST_CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS),3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,dual,C,penalty,fit_intercept,multi_class,window_size
0,0.842,0.802,0.885,True,0.5,l2,True,ovr,9


### Skin Cancer

In [69]:
params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "TEST_SC_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS),3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,dual,C,penalty,fit_intercept,multi_class,window_size
0,0.814,0.779,0.853,True,0.5,l2,True,ovr,9


## CRF

### Coral Bleaching

In [70]:
cols = "micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size".split(",")
df = get_df_sorted_by_f1score("TEST_CB_TAGGING_VD_CRF_MOST_COMMON_TAG",
                         "feature_possible_states,feature_possible_transitions,c2,window_size".split(","))
round_data(df[cols].head(ROWS),3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size
0,0.84,0.803,0.881,False,True,0.1,9


## Skin Cancer

In [71]:
cols = "micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size".split(",")
df = get_df_sorted_by_f1score("TEST_SC_TAGGING_VD_CRF_MOST_COMMON_TAG",
                         "feature_possible_states,feature_possible_transitions,c2,window_size".split(","))
round_data(df[cols].head(ROWS),3)

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,feature_possible_states,feature_possible_transitions,c2,window_size
0,0.804,0.759,0.855,False,True,1.0,9


## HMM

### Coral Bleaching

In [72]:
params = "extractors".split(",")
collection = "TEST_CB_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,extractors
0,0.764,0.803,0.728,stemmed_unigrams


## Skin Cancer

In [73]:
params = "extractors".split(",")
collection = "TEST_SC_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS"

df = get_df_sorted_by_f1score(collection, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,extractors
0,0.675,0.731,0.628,stemmed_unigrams


## Averaged Perceptron

### Coral Bleaching

In [74]:
model = "AVG_PERCEPTRON_MOST_COMMON_TAG"
params = "tag_history,average_weights,tag_plus_word,tag_ngram_size,window_size".split(",")
df = get_df_sorted_by_f1score("TEST_CB_TAGGING_VD_" + model, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,tag_history,average_weights,tag_plus_word,tag_ngram_size,window_size
0,0.837,0.794,0.884,1,True,0,0,9


### Skin Cancer

In [75]:
model = "AVG_PERCEPTRON_MOST_COMMON_TAG"
params = "tag_history,average_weights,tag_plus_word,tag_ngram_size,window_size".split(",")
df = get_df_sorted_by_f1score("TEST_SC_TAGGING_VD_" + model, params)
round_data(df.head(ROWS))

Unnamed: 0,micro_f1_score,micro_recall,micro_precision,tag_history,average_weights,tag_plus_word,tag_ngram_size,window_size
0,0.814,0.773,0.86,1,True,0,0,9
