MODEL PERFORMANCE & ANALYSIS
- F1 Score by p-thresholds
- Feature Importance

In [1]:
import datetime
import os
import pickle
import pandas as pd
import altair as alt
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error
from common.paths import Paths
from common.utils.estimations import f1_score_weighted_returns
from sklearn.metrics import f1_score

In [2]:
ex = 'ex2022-03-27_095928-ethusd'

In [3]:
with open(os.path.join(Paths.trade_model, ex, 'boosters.p'), 'rb') as f:
    boosters = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'pred_label_val.p'), 'rb') as f:
    pred_label_val = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'pred_label_ho.p'), 'rb') as f:
    pred_label_ho = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'label_val.p'), 'rb') as f:
    ps_label = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'label_ho.p'), 'rb') as f:
    ps_label_ho = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'return_val.p'), 'rb') as f:
    ps_label_return_val = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'return_ho.p'), 'rb') as f:
    ps_label_return_ho = pickle.load( f)
ps_label_return_val.name = 'return'
ps_label_return_ho.name = 'return'

In [62]:
df2check = pred_label_val
ps_return = ps_label_return_val

In [5]:
# this is a bad metric due to rounding of scores
from sklearn.metrics import f1_score
print('VALIDATION')
for i, side in enumerate(['short', 'flat', 'long']):
    print(f"{side}: {f1_score(np.where(pred_label_val['label'] == i, 1, 0), pred_label_val[side].round().values)}")
print('HOLDOUT')
for i, side in enumerate(['short', 'flat', 'long']):
    print(f"{side}: {f1_score(np.where(pred_label_ho['label'] == i, 1, 0), pred_label_ho[side].round().values)}")

VALIDATION
short: 0.0
flat: 0.9449074124771405
long: 0.0
HOLDOUT
short: 0.0
flat: 0.9504346499923746
long: 0.0


In [6]:
def normed_feature_importance(boosters) -> pd.Series:
    # sum of importances will be 1
    from common.utils.util_func import get_model_fscore
    importances = [get_model_fscore(booster) for booster in boosters]
    df_imp = pd.DataFrame(importances).mean(axis=0).sort_values(ascending=False)
    df_imp = df_imp / df_imp.sum()
    return df_imp.sort_values(ascending=False)

In [7]:
ps_imp = normed_feature_importance(boosters)
df_imp = pd.DataFrame(ps_imp.cumsum()).reset_index().rename(columns={'index': 'feature', 0: 'score'})

  from pandas import MultiIndex, Int64Index


In [8]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 600,
  "data": {
    "values": df_imp.to_dict('records')
  },
  "mark": "bar",
  "encoding": {
    "x": {"field": "feature", "type": "ordinal",
          "sort": "y"},
      "y": {"field": "score", "type": "quantitative"},
  },
})

In [9]:
feat_keep = df_imp[df_imp['score'] < 0.99]['feature'].to_list()

In [11]:
print(len(feat_keep))
print(feat_keep)

100
['_measurement-trade_bars|_field-imbalance_size|asset-adausd|exchange-bitfinex|information-imbalance|unit-usd|unit_size-1500|aggWindow-256|aggAggregator-sum', '_measurement-trade_bars|_field-sequence_direction|asset-solusd|exchange-bitfinex|information-sequence|unit-usd|unit_size-3000|aggWindow-256|aggAggregator-sum', '_measurement-trade_bars|_field-imbalance_size|asset-adausd|exchange-bitfinex|information-imbalance|unit-usd|unit_size-1500|aggWindow-1024|aggAggregator-sum', '_measurement-trade_bars|_field-imbalance_size|asset-adausd|exchange-bitfinex|information-imbalance|unit-adausd|unit_size-1000|aggWindow-512|aggAggregator-sum', '_measurement-trade_bars|_field-sequence_direction|asset-adausd|exchange-bitfinex|information-sequence|unit-tick|unit_size-30|aggWindow-1024|aggAggregator-sum', '_measurement-order_book|_field-count_net|asset-ethusd|delta_size_ratio-0.5|exchange-bitfinex|information-bid_buy_count_imbalance_net|unit-size_ewm_sum|aggWindow-1024|aggAggregator-min', '_measur

MODEL SCORES

In [22]:
df2check['short'].max()

0.21423810652663003

In [15]:
min_short = 0.30
min_long = 0.30
ix_pos_long = np.where((df2check['long'] > df2check['short']) & (df2check['long'] > min_long))[0]
ix_pos_short = np.where((df2check['short'] > df2check['long']) & (df2check['short'] > min_short))[0]
ix_pos = ix_pos_long.tolist() + ix_pos_short.tolist()

#ix_neg_long = np.array(list(set(np.where(label2check > threshold)[0]).difference(ix_pos)))
#ix_neg_short = np.array(list(set(np.where(label2check < (2-threshold))[0]).difference(ix_pos)))
#ix_neg = np.array(ix_neg_long.tolist() + ix_neg_short.tolist())

Correlate Confidence and Return

In [68]:
side = 'long'
q = 0.9
iloc = df2check.reset_index().index[df2check[side] > df2check[side].quantile(q)]
df = pd.concat((df2check.iloc[iloc], ps_return.iloc[iloc]), axis=1)
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 600,
    "title": "Short preg against return",
    "description": "Ideally error towards zero with lower spread",
  "data": {
    "values": df[[side, "return"]].sample(1000).to_dict('records')
  },
  "mark": {
      "type": "point",
    "size": 1,
  },
  "encoding": {
    "x": {"field": side, "type": "quantitative", "title": "Preds Short", "scale": {"domain": [df[side].min(), df[side].max()]}},
    "y": {"field": "return", "type": "quantitative", "title": "Return", "scale": {"domain": [0.95, 1.05]}},
#       "color": {"field": "B"},
  },
})

In [51]:
ps_label_return_ho[ix_pos_long].mean()

0.9992912519346546

In [8]:
f1_score_weighted_returns(ix_pos_long, ix_pos_short, ix_neg, df_wide['Q-0.5'], label2check)

Weighted F1 Score: 0.10903248609238438


PREDICTION ERRORS

In [9]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 700,
    "data": {
        "values": df_long.sample(1000).to_dict('records')
      },
    "mark": {"type": "line", "tooltip": True},
    "encoding": {
            "x": {"field": "index", "type": "temporal"},
            "y": {"field": "value", "type": "quantitative", "scale": {"domain": [0.985, 1.015]}},
            "color": {"field": "cat"}
  },
})

In [10]:
for quantile in pred_label_val.keys():
    print(f'QUANTILE: {quantile}')
    print(f"VALIDATION: "
          f"MAE: {mean_absolute_error(pred_label_val[quantile].iloc[:, 0], pred_label_val[quantile]['label'])} "
          f"MSE: {mean_squared_error(pred_label_val[quantile].iloc[:, 0], pred_label_val[quantile]['label'])}")
    print(f"HOLDOUT: "
          f"MAE: {mean_absolute_error(pred_label_ho[quantile].iloc[:, 0], pred_label_ho[quantile]['label'])} "
          f"MSE: {mean_squared_error(pred_label_ho[quantile].iloc[:, 0], pred_label_ho[quantile]['label'])}")
    print(f"RETURN == 1: "
          f"MAE: {mean_absolute_error(np.ones(len(pred_label_ho[quantile])), pred_label_ho[quantile]['label'])} "
          f"MSE: {mean_squared_error(np.ones(len(pred_label_ho[quantile])), pred_label_ho[quantile]['label'])}")

QUANTILE: 0.1
VALIDATION: MAE: 0.009731231291461825 MSE: 0.00013420873181948828
HOLDOUT: MAE: 0.013125575221637579 MSE: 0.00022320699674800194
RETURN == 1: MAE: 0.005701243744073382 MSE: 7.202781170024736e-05
QUANTILE: 0.5
VALIDATION: MAE: 0.0065367191767678684 MSE: 6.688001684383511e-05
HOLDOUT: MAE: 0.0065427650752758835 MSE: 8.68992002673856e-05
RETURN == 1: MAE: 0.005701243744073382 MSE: 7.202781170024736e-05
QUANTILE: 0.9
VALIDATION: MAE: 0.007573369606306947 MSE: 8.377127783130546e-05
HOLDOUT: MAE: 0.006664719722744587 MSE: 8.035628634893426e-05
RETURN == 1: MAE: 0.005701243744073382 MSE: 7.202781170024736e-05


In [11]:
lst = []
for quantile in pred_label_val.keys():
    lst.append({'stage': 'VAL',
                'quantile': quantile,
                'MAE': mean_absolute_error(pred_label_val[quantile].iloc[:, 0], pred_label_val[quantile]['label']),
                'MSE': mean_squared_error(pred_label_val[quantile].iloc[:, 0], pred_label_val[quantile]['label'])
                })
    lst.append({'stage': 'HO',
                'quantile': quantile,
                'MAE': mean_absolute_error(pred_label_ho[quantile].iloc[:, 0], pred_label_ho[quantile]['label']),
                'MSE': mean_squared_error(pred_label_ho[quantile].iloc[:, 0], pred_label_ho[quantile]['label'])
                })
    lst.append({'stage': 'HO - BASELINE PRED=1',
                'quantile': quantile,
                'MAE': mean_absolute_error(np.ones(len(pred_label_ho[quantile])), pred_label_ho[quantile]['label']),
                'MSE': mean_squared_error(np.ones(len(pred_label_ho[quantile])), pred_label_ho[quantile]['label'])
                })

In [12]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
  "data": {
    "values": lst
  },
  "mark": "line",
  "encoding": {
    "x": {"field": "quantile", "type": "ordinal"},
    "y": {"field": "MAE", "type": "quantitative"},
      "color": {"field": "stage"},
  },
})

PREDICTION ERRORS by confidence. Smaller confidence, lower error ?

In [13]:
df=pd.concat([df[[0]] for df in pred_label_ho.values()] + [pred_label_ho[0.1]["label"]], axis=1)
df.columns=list(pred_label_ho.keys()) + ['label']
df['confidence_spread'] = df[0.9] - df[0.1]

In [14]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "title": "Confidence Spread over time",
  "width": 600,
  "data": {
    "values": df.reset_index()[['index', 'confidence_spread']].sample(1000).to_dict('records')
  },
  "mark": "line",
  "encoding": {
    "x": {"field": "index", "type": "temporal"},
    "y": {"field": "confidence_spread", "type": "quantitative"},
  },
})

In [15]:
df['mid'] = df['0.5']
df['low'] = df['0.1']
df['up'] = df['0.9']

KeyError: '0.5'

In [None]:
quantile_mae = {}
quantiles = list(range(1, 10))
for quantile in quantiles:
    threshold_high = df['0.5'].quantile(quantile/10)
    threshold_low = df['0.5'].quantile((quantile-1)/10)
    ix = np.where((df['0.5'] > threshold_low) & (df['0.5'] < threshold_high))[0]
    quantile_mae[round(threshold_high - (threshold_high - threshold_low)/2, 3)] = mean_absolute_error(df.iloc[ix]['0.5'], df.iloc[ix]['label'])

In [None]:
# For this one want to have count sample as well
# instead 
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
    "title": "Quantile of Predicted return",
    "description": "Ideally error towards zero with lower spread",
  "data": {
    "values": [{'quantile': k, 'Loss': v} for k, v in quantile_mae.items()]
  },
  "mark": "line",
  "encoding": {
    "x": {"field": "quantile", "type": "ordinal", "title": "Quantile of Predicted return"},
    "y": {"field": "Loss", "type": "quantitative", "title": "MAE"},
  },
})

In [None]:
quantile_mae = {}
quantiles = list(range(1, 10))
for quantile in quantiles:
    threshold_high = df['confidence_spread'].quantile(quantile/10)
    threshold_low = df['confidence_spread'].quantile((quantile-1)/10)
    ix = np.where((df['confidence_spread'] > threshold_low) & (df['confidence_spread'] < threshold_high))[0]
    quantile_mae[quantile] = mean_absolute_error(df.iloc[ix]['0.5'], df.iloc[ix]['label'])

In [None]:
df.columns = [str(c) for c in df.columns]
df["Loss"] = df["0.5"] - df['label']

In [None]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
    "title": "Regression error against confidence spread of 0.1 and 0.9 line",
    "description": "Ideally error towards zero with lower spread",
  "data": {
    "values": df[["Loss", "confidence_spread"]].sample(1000).to_dict('records')
  },
  "mark": {
      "type": "point",
    "size": 1,  
  },
  "encoding": {
    "x": {"field": "confidence_spread", "type": "quantitative", "title": "Confidence spread"},
    "y": {"field": "Loss", "type": "quantitative", "title": "0.5 Regression Error"
#           "scale": {"domain": [0.95, 1]}
         },
#       "color": {"field": "B"},
  },
})

In [None]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
   "title": "Tranches of Confidence Spread against 0.5 MAE Loss.",
    "description": "Ideally -x**2 mirror parabolic",
  "data": {
    "values": [{'quantile': k, 'Loss': v} for k, v in quantile_mae.items()]
  },
  "mark": "line",
  "encoding": {
    "x": {"field": "quantile", "type": "ordinal", "title": "Quantile of regression confidence spread"},
    "y": {"field": "Loss", "type": "quantitative", "title": "Loss of 0.5 qt regression model"
#           "scale": {"domain": [0.95, 1]}
         },
#       "color": {"field": "stage"},
  },
})

How does the error correlate significant high return events?

In [None]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
    "title": "Future Return against Regression Error. Ideally a horizontal line thinning out on ends.",
  "data": {
    "values": df[["Loss", "label"]].sample(1000).to_dict('records')
  },
  "mark": {
      "type": "point", 
      "size": 1,  
  },
  "encoding": {
    "x": {"field": "label", "type": "quantitative", "title": "Future Return", "scale": {"domain": [0.95, 1.05]}},
    "y": {"field": "Loss", "type": "quantitative", "title": "0.5 Regression Error"},
  },
})

In [None]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
    "title": "Future Return against Predicted. Should ideally be a circle...",
  "data": {
    "values": df[["mid", "label"]].sample(1000).to_dict('records')
  },
  "mark": {
      "type": "point", 
      "size": 1,  
  },
  "encoding": {
    "x": {"field": "label", "type": "quantitative", "title": "Future Return", "scale": {"domain": [0.95, 1.05]}},
    "y": {"field": "mid", "type": "quantitative", "title": "Predicted", "scale": {"domain": [0.95, 1.05]}},
  },
})

In [None]:
alt.Chart.from_dict({
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "width": 600,
    "title": "Future Return against Confidence Range. Best if a reverse parabola",
  "data": {
    "values": df[["confidence_spread", "label"]].sample(1000).to_dict('records')
  },
  "mark": {
      "type": "point", 
      "size": 1,  
  },
  "encoding": {
    "x": {"field": "label", "type": "quantitative", "title": "Future Return", "scale": {"domain": [0.95, 1.05]}},
    "y": {"field": "confidence_spread", "type": "quantitative", "title": "confidence_spread"},
  },
})

More questions. Hypothesis. P values. confidence intervals such as:
Given a predicted return increase of [x1,.. xn], how likely is that the return goes over [y1, .. yn]
Basically, is anything better than guess, at least marginally?

**Mini Backtest**
- Entry:
    - Predicted return </> thresholds
    - Confidence spread < its mean
- Exit conditions:
    - Trailing Stop Loss
    - Timeout
    - Predictions suggest reversal
    - Maybe profit taking ...
Plot Graph 2 images. Prices and PnL Series. Each couple layers (buy sell points)

Feature Importance

In [None]:
# from common.utils.util_func import get_model_fscore
# importances = [get_model_fscore(booster) for booster in self.boosters]
# res = pd.DataFrame(importances).mean(axis=0).sort_values(ascending=False)
# logger.info(res)

In [None]:
from common.utils.util_func import get_model_fscore

importances = [get_model_fscore(booster) for booster in boosters]
res = pd.DataFrame(importances).mean(axis=0).sort_values(ascending=False)

In [None]:
res.plot()

F1 Scores

In [None]:
print(res.iloc[:10])

VALIDATION F1

In [None]:
with open(os.path.join(Paths.trade_model, ex, 'preds.p'), 'rb') as f:
    preds = pickle.load(f)
with open(os.path.join(Paths.trade_model, ex, 'label.p'), 'rb') as f:
    label = pickle.load(f)

In [None]:
f1 = preds.merge(label, how='inner', right_index=True, left_index=True)

HOLDOUT SET VALIDATION

In [None]:
from layers.predictions.predict import Predict

In [None]:
start = datetime.datetime(2022, 2, 17)
end = datetime.datetime(2022, 3, 1)
f1_ho = Predict(boosters, start, end)

In [None]:
# with open(os.path.join(Paths.trade_model, ex, 'f1_ho.p'), 'rb') as f:
#     f1_ho = pickle.load(f)
for i, side in enumerate(['short', 'flat', 'long']):
    print(f"{side}: {f1_score(np.where(f1_ho['label'] == i, 1, 0), f1_ho[side].round().values)}")