# Explore Results

In [174]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [175]:
from pathlib import Path
import pandas as pd
import os
import joblib
import pickle
from src.sklearn_utils import predict_fn, plot_one_vs_rest_success_rates
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

In [176]:
artifacts_dir = Path(os.path.abspath('')).parent / 'artifacts'
model_path = artifacts_dir / 'gradient_boosting-20220704-153024.joblib'
model = joblib.load(model_path)

In [177]:
model.cv_results_

{'mean_fit_time': array([ 21.69707355,  64.43889155, 132.12126284]),
 'std_fit_time': array([ 0.50096234,  1.39458765, 49.39470151]),
 'mean_score_time': array([0.06359849, 0.0473938 , 0.04681373]),
 'std_score_time': array([0.01985951, 0.00117783, 0.00224644]),
 'param_clf__base_estimator__max_depth': masked_array(data=[2, 4, 6],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__base_estimator__max_depth': 2},
  {'clf__base_estimator__max_depth': 4},
  {'clf__base_estimator__max_depth': 6}],
 'split0_test_score': array([0.07306397, 0.07205387, 0.07474747]),
 'split1_test_score': array([0.05938567, 0.05699659, 0.05836177]),
 'split2_test_score': array([0.06622517, 0.06390728, 0.06523179]),
 'split3_test_score': array([0.05785953, 0.05785953, 0.05719064]),
 'split4_test_score': array([0.0620339 , 0.06372881, 0.06237288]),
 'mean_test_score': array([0.06371365, 0.06290922, 0.06358091]),
 'std_test_score': array([0.00546919, 0

In [178]:
mlb = pickle.load(open(artifacts_dir / 'gradient_boosting-20220704-153024-binarizer.pkl', 'rb'))

In [179]:
df = pd.read_csv(artifacts_dir / 'gradient_boosting-20220704-153024-test.csv')
# convert strings in column to list. e.g. "['A', 'B', 'C']" -> ['A', 'B', 'C'], "['Hello', 'World']" -> ['Hello', 'World']
df['sectors_list'] = df['sectors_list'].apply(lambda x: eval(x))
df["sectors_list"] = df["sectors_list"].apply(
    lambda x: [tag for tag in x if tag in mlb.classes_.tolist()]
)

In [180]:
y_true, y_pred, hl = predict_fn(df, mlb, model)

In [181]:
naive_path = artifacts_dir / 'naive-stratified.joblib'
naive_model = joblib.load(naive_path)

In [182]:
y_pred_naive = naive_model.predict(df)

In [183]:
plot_one_vs_rest_success_rates(y_true, y_pred, y_pred_naive, classes=mlb.classes_)

# Confusion Matrices

In [184]:
# convert multi-label data structure to binary data structure
# y_true_bin = pd.DataFrame(y_true).apply(lambda x: mlb.transform(x).tolist())
# y_true

In [185]:
# dd=pd.DataFrame(y_true)
# # convert 1s in each column to the column number
# # y_true_bin = dd.apply(lambda x: x.index(1)).values.tolist()
# dd

In [186]:
# y_true

In [187]:
# y_pred

In [188]:
# mlb.classes_.tolist()

In [189]:
# multilabel_confusion_matrix(mlb.inverse_transform(y_true), mlb.inverse_transform(y_pred))

In [190]:
# mlb.classes_[0]

In [194]:
mlbcm=multilabel_confusion_matrix(y_true, y_pred)
figs = []
for ix, matrix in enumerate(mlbcm):
    matrix = matrix/matrix.sum(axis=0) # Normalise
    x = y = [mlb.classes_[ix],'other']
    matrix_text = [[str(y) for y in x] for x in matrix]
    # change display rounding
    matrix_text = [[str(round(float(y), 2)) for y in x] for x in matrix_text]
    # set up figure
    fig = ff.create_annotated_heatmap(matrix, x=x, y=y, annotation_text=matrix_text, colorscale='Viridis')

    # add title
    fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                      xaxis = dict(title='Predicted'),
                      yaxis = dict(title='Actual')
                     )

    # add custom xaxis title
    fig.add_annotation(dict(font=dict(color="black",size=14),
                            x=0.5,
                            y=-0.15,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))

    figs.append(fig)







# # change each element of z to type string for annotations
# cm_text = [[str(y) for y in x] for x in cm]
# # change display rounding
# cm_text = [[str(round(y, 2)) for y in x] for x in cm]

In [195]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# fig = make_subplots(rows=5, cols=2)
# for i, figure in enumerate(figs):
#     fig.add_trace(figure)
#     fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
#                       xaxis = dict(title='Predicted'),
#                       yaxis = dict(title='Actual')
#                      )
# fig.show()


In [196]:
for fig in figs:
    fig.show()

In [151]:
def create_confusion_matrix_multilabel(y_true, y_pred, classes):
    true = y_true.argmax(axis=1)
    pred = y_pred.argmax(axis=1)
    classes_dict = {i: label for i, label in enumerate(classes)}
    true = [classes_dict[i] for i in true]
    pred = [classes_dict[i] for i in pred]
    cm = confusion_matrix(true, pred, labels=mlb.classes_, normalize='true')
    return cm


true = y_true.argmax(axis=1)
pred = y_pred.argmax(axis=1)
classes=mlb.classes_
# convert to dictionary of class labels
classes_dict = {i: label for i, label in enumerate(classes)}
# map class labels to integers
true = [classes_dict[i] for i in true]
pred = [classes_dict[i] for i in pred]
cm = confusion_matrix(true, pred, labels=mlb.classes_, normalize='true')
x = y = mlb.classes_.tolist()
# change each element of z to type string for annotations
cm_text = [[str(y) for y in x] for x in cm]
# change display rounding
cm_text = [[str(round(y, 2)) for y in x] for x in cm]

In [155]:
cm

array([[0.95081967, 0.01639344, 0.03278689, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.24528302, 0.49056604, 0.1509434 , 0.01886792, 0.        ,
        0.        , 0.        , 0.0754717 , 0.        , 0.01886792],
       [0.07303371, 0.        , 0.88202247, 0.        , 0.00561798,
        0.00561798, 0.01123596, 0.01123596, 0.        , 0.01123596],
       [0.125     , 0.125     , 0.125     , 0.625     , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.14285714, 0.        , 0.14285714, 0.        , 0.71428571,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.05405405, 0.        , 0.05405405, 0.        , 0.        ,
        0.86486486, 0.        , 0.        , 0.02702703, 0.        ],
       [0.25      , 0.        , 0.08333333, 0.        , 0.        ,
        0.        , 0.66666667, 0.        , 0.        , 0.        ],
       [0.08333333, 0.        , 0.25     

In [113]:
# set up figure
fig = ff.create_annotated_heatmap(cm, x=x, y=y, annotation_text=cm_text, colorscale='Viridis')

# add title
fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                  xaxis = dict(title='Predicted'),
                  yaxis = dict(title='Actual')
                 )

# add custom xaxis title
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))