In [1]:
cd ../..

/Users/shanekercheval/repos/data-science-template


In [2]:
# Execute the shared settings script; %run loads the names it defines into this notebook.
# NOTE(review): `hlp` and `pd` are used below but never imported in a visible cell —
# presumably this script provides them; confirm.
%run "source/config/notebook_settings.py"

# Load Data

In [4]:
# Saved experiment results, addressed relative to the current working directory.
file_name = 'artifacts/models/experiments/multi-model-BayesSearchCV-2022-03-18-12-15-35.yaml'
# Rebuild the experiment-results object from that YAML file; every tuning cell below reads it.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(
    yaml_file_name=file_name,
)

---

# Hyper-Param Tuning - Cross Validation Results

## Best Scores/Params

In [5]:
results.best_score

0.774092779234226

In [6]:
results.best_params

{'model': 'LogisticRegression()',
 'C': 0.13184996310179986,
 'imputer': "SimpleImputer(strategy='median')",
 'scaler': 'StandardScaler()',
 'pca': 'None',
 'encoder': 'OneHotEncoder()'}

In [None]:
# Best trial per model type: rank trials inside each model group by "roc_auc Mean"
# (highest first, ties broken by row order), then show every group's rank-1 row.
df = results.to_formatted_dataframe(return_style=False, include_rank=True).assign(
    model_rank=lambda trials: (
        trials.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
    )
)
df[df["model_rank"] == 1]

In [None]:
results.to_formatted_dataframe(return_style=True,
                               include_rank=True,
                               num_rows=1000)

In [None]:
results.to_formatted_dataframe(query='model == "RandomForestClassifier()"', include_rank=True)

In [None]:
results.to_formatted_dataframe(query='model == "LogisticRegression()"', include_rank=True)

## BayesSearchCV Performance Over Time

In [None]:
results.plot_performance_across_trials(facet_by='model').show()

In [None]:
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()

---

## Variable Performance Over Time

In [None]:
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()

---

## Scatter Matrix

In [None]:
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()

---

## Variable Performance - Numeric

In [None]:
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
                                        height=800)

In [None]:
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()

---

## Variable Performance - Non-Numeric

In [None]:
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()

---

In [None]:
results.plot_score_vs_parameter(
    query='model == "RandomForestClassifier()"',
    parameter='max_features',
    size='max_depth',
    color='encoder',
)

---

In [None]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )

In [None]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )

---

# Best Model - Test Set Performance

In [7]:
# Load the best-estimator pickle stored next to the experiment YAML (same timestamped stem).
# Note that `file_name` is reassigned here and no longer points at the YAML file.
# NOTE(review): presumably read_pickle wraps pickle loading, which can execute arbitrary
# code — only open artifacts produced by this project; confirm.
file_name = 'artifacts/models/experiments/multi-model-BayesSearchCV-2022-03-18-12-15-35_best_estimator.pkl'
best_estimator = hlp.utility.read_pickle(file_name)

In [9]:
x_test = pd.read_pickle('artifacts/data/processed/x_test.pkl')
x_test.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
521,<0,18.0,existing paid,radio/tv,3190.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,24.0,none,own,1.0,skilled,1.0,none,yes
737,<0,18.0,existing paid,new car,4380.0,100<=X<500,1<=X<4,3.0,male single,none,4.0,car,35.0,none,own,1.0,unskilled resident,2.0,yes,yes
740,<0,24.0,all paid,new car,2325.0,100<=X<500,4<=X<7,2.0,male single,none,3.0,car,32.0,bank,own,1.0,skilled,1.0,none,yes
660,>=200,12.0,existing paid,radio/tv,1297.0,<100,1<=X<4,3.0,male mar/wid,none,4.0,real estate,23.0,none,rent,1.0,skilled,1.0,none,yes
411,no checking,33.0,critical/other existing credit,used car,7253.0,<100,4<=X<7,3.0,male single,none,2.0,car,35.0,none,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes


In [10]:
y_test = hlp.utility.read_pickle('artifacts/data/processed/y_test.pkl')
y_test[0:10]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [11]:
# Score every test row, keeping only column index 1 of the predict_proba output.
# NOTE(review): presumably column 1 is the positive class (label 1) — confirm against
# best_estimator.classes_.
test_predictions = best_estimator.predict_proba(x_test)[:, 1]
# Preview the first ten scores.
test_predictions[0:10]

array([0.403314  , 0.49843118, 0.59686198, 0.32004122, 0.0857983 ,
       0.35470202, 0.08364243, 0.45742452, 0.09325265, 0.13118732])

In [None]:
# Bundle the test labels and predicted scores into the evaluator used by the cells below.
# NOTE(review): score_threshold=0.37 is hard-coded with no rationale in this notebook —
# record how it was chosen (presumably from the threshold curves further down) or
# lift it into a named constant.
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_test,
    predicted_scores=test_predictions,
    score_threshold=0.37
)

In [None]:
evaluator.plot_actual_vs_predict_histogram()

In [None]:
evaluator.plot_confusion_matrix()

In [None]:
evaluator.all_metrics_df(return_style=True,
                         dummy_classifier_strategy=['prior', 'constant'],
                         round_by=3)

In [None]:
evaluator.plot_roc_auc_curve().show()

In [None]:
evaluator.plot_precision_recall_auc_curve().show()

In [None]:
evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.7)).show()

In [None]:
evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6)).show()

In [None]:
evaluator.calculate_lift_gain(return_style=True)

---