# TODO - need to fix the mongo data - 

In [1]:
# db.getCollection('CB_RE-RANKER_HYPER_PARAM_VD').find().forEach(function(doc){
#    db.getCollection('CB_RE-RANKER_HYPER_PARAM_VD_final').insert(doc); // start to replace
# });

# db.getCollection('CB_RE-RANKER__HYPER_PARAM_VD').find().forEach(function(doc){
#    db.getCollection('CB_RE-RANKER_HYPER_PARAM_VD_final').insert(doc); // start to replace
# });


## **AND Repeat with training data**

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
#import seaborn as sns
import pymongo
from pprint import pprint
import dill

from Settings import Settings
from FindFiles import find_files
from results_procesor import ResultsProcessor, metrics_to_df, __MICRO_F1__

# Load Mongo DB Collection

In [3]:
client = pymongo.MongoClient()
db = client.metrics_causal_model

In [4]:
def group_by(df, bycols, agg_map):
    """

    @param df:      DataFrame
    @param bycols:  str or list
                        Column(s) to group by
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean"]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed
    """
    grps = []
    if type(bycols) == str:
        bycols = [bycols]

    if type(agg_map) == dict:
        agg_map = agg_map.items()

    for k,v in agg_map:
        grp = df[bycols + [k]].groupby(bycols, ).agg(v)
        grp.reset_index(inplace=True)
        grp["%s(%s)" % (v,k)] = grp[k]
        del grp[k]
        grps.append(grp)

    m = grps[0]
    for grp in grps[1:]:
        m = pd.merge(m, grp, on=bycols, how="inner")
    return m

In [5]:
from bson.son import SON # needed to ensure dictionary is ordered (python default is not)
import hashlib

def hash_feats(fts):
    vals = fts.values
    joined = "|".join(map(lambda s: str(s),vals)).encode('utf-8') 
    return hashlib.sha224(joined).hexdigest()

def get_df_sorted_by_f1score(collection, params=None, filter_cols=True):
    if not params:
        params = []
    if type(params) == str:
        params = params.split(",")
    
    project = {
            "weighted_f1_score":"$WEIGHTED_MEAN_CONCEPT_CODES.f1_score",
            "macro_f1_score":   "$MACRO_F1",
            "micro_f1_score":  "$MICRO_F1.f1_score",
            "micro_recall":    "$MICRO_F1.recall",
            "micro_precision": "$MICRO_F1.precision",
    
    # PARAMETERS            
            "window_size":    "$parameters.window_size",
            "feats":          "$parameters.extractors",
            "count": {        "$size" : "$parameters.extractors" },
            "asof" :          "$asof",
            "_id":1
    }
    
    # No count for HMM
    if "_hmm" in collection.lower():
        del project["count"]
    
    for param in params:
        project[param] = "$parameters." + param

    feats_pipeline = [{
        "$project": project
    },
    {
        "$match":{
            "micro_f1_score": { "$exists" : True }        
        }
    },
    {
        "$sort":{
            "micro_f1_score": -1
        }
    },
    ]
    
    rows = [row for row in db[collection].aggregate(feats_pipeline)]
    df = pd.DataFrame(rows).sort_values("micro_f1_score", ascending=False)
    if params:
        df["hs_params"] = df[params].apply(hash_feats, axis=1)
        
    if filter_cols:
        cols = ["micro_f1_score", "micro_recall" ,"micro_precision", "macro_f1_score" ] + params
        return df[cols]
    return df

In [6]:
def get_window_classifier_results(collections):
    dfs = []
    for col in collections:
        print(col)
        df = dict(get_df_sorted_by_f1score(col).iloc[0,:])
        df["Collection_" + col] = col
        dfs.append(df)
    return pd.DataFrame(dfs).sort_values("micro_f1_score", ascending=False)

In [7]:
def is_a_float(val):
    return "float" in str(type(val))

def round_data(df, places=3):
    df_copy = df.copy()
    fmt_str = "{0:." + str(places) + "f}"
    cols = set([v for v in df_copy.columns.values])
    for c in cols:
        df_copy[c] = df[c].apply(lambda d: fmt_str.format(d) if is_a_float(d) else d)  
    return df_copy

# Hyper Parameter Tuning Results

In [8]:
# Rows to print - df.head
ROWS = 5

- To get the TD and VD metrics, change the collection names below (VD->TD)

## RNN Anaphor Tagger Hyper Parameter Tuning

In [9]:
params = "use_pretrained_embedding,bi-directional,num_rnns,hidden_size".split(",") # merge_mode,

### Coral Bleaching

### Train

In [10]:
collection = "CB_TAGGING_TD_RNN_BINARY_FIXED"

df = get_df_sorted_by_f1score(collection, params)

df = round_data(df.head(ROWS),3)
df[(df.num_rnns == 2) & (df.hidden_size == 256)]

KeyError: 'micro_f1_score'

### Validation

In [None]:
collection = "CB_TAGGING_VD_RNN_BINARY_FIXED"

df = get_df_sorted_by_f1score(collection, params)
    
round_data(df.head(ROWS),3)

### Skin Cancer

### <span style='color:red; font-weight:bold'> For SC - Use the Metrics from the PREDICTION FILES - they don't align with the mongo results </span>

### Train

In [None]:
#params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "SC_TAGGING_TD_RNN_BINARY_FIXED"

df = get_df_sorted_by_f1score(collection, params)
#df = df[df.dual==True]
#df = df[df.C==0.5]
#df = df[df.penalty =='l2']
df = round_data(df.head(10),3)
# 1 RNN was better for SC dataset
df[(df.num_rnns == 1) & (df.hidden_size == 256)]

### Test

In [None]:
#params = "dual,C,penalty,fit_intercept,multi_class,window_size".split(",")
collection = "SC_TAGGING_VD_RNN_BINARY_FIXED"

df = get_df_sorted_by_f1score(collection, params)
#df = df[df.dual==True]
#df = df[df.C==0.5]
#df = df[df.penalty =='l2']
round_data(df.head(10),3)

# Test Set Performance

In [None]:
ANAPHORA = "Anaphor"
ana_tags = [ANAPHORA]

METRICS_COLS = ["code","f1_score", "precision", "recall", "accuracy", "data_points"]

settings = Settings()

### Coral Bleaching

In [None]:
# load essays
DATASET = "CoralBleaching"
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
anaphor_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"

# optimal predictions
pattern_cb = "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
files = find_files(anaphor_predictions_folder, pattern_cb)
with open(files[0], "rb") as f:
    cb_ana_tagged_test  = dill.load(f)
len(cb_ana_tagged_test)

In [None]:
cols = ['code', 'f1_score', 'recall','precision', 'accuracy', 'data_points']

In [None]:
test_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(cb_ana_tagged_test, ana_tags)
test_df = metrics_to_df(test_metrics)
test_df=test_df[np.isin(test_df["code"], [ANAPHORA, __MICRO_F1__])]
df = round_data(test_df[METRICS_COLS],3)
df[cols]

### Skin Cancer

In [None]:
DATASET = "SkinCancer"

root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
anaphor_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"

# optimal predictions (see results above)
pattern_sc = "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-1_use_pretrained_embedding-True.dill"
files = find_files(anaphor_predictions_folder, pattern_sc)
with open(files[0], "rb") as f:
    sc_ana_tagged_test  = dill.load(f)
len(sc_ana_tagged_test)

In [None]:
test_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(sc_ana_tagged_test, ana_tags)
test_df = metrics_to_df(test_metrics)
test_df=test_df[np.isin(test_df["code"], [ANAPHORA, __MICRO_F1__])]
round_data(test_df[METRICS_COLS],4)