-
Notifications
You must be signed in to change notification settings - Fork 0
/
results_handling.py
271 lines (221 loc) · 9.23 KB
/
results_handling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""
This module provides a CrossValidationResults class which can be used to summarize the results of CrossValidation.perform().
"""
import operator
from pprint import pformat

import numpy as np
import pandas as pd
def add_summary_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Append "mean", "median" and "std" rows to a DataFrame.

    Every statistic is computed on a deep copy of the data taken before any
    row is appended, so a statistic never includes a previously appended
    summary row (e.g. the median is not influenced by the "mean" row).

    The rows are added to the passed DataFrame itself (in place) and the
    same object is returned.

    Args:
      df: pd.DataFrame: Input data.

    Returns:
      (pd.DataFrame): The input data with the three summary rows appended.
    """
    # Frozen view of the original values; all statistics are derived from it.
    snapshot = df.copy(deep=True)
    # The row label is identical to the name of the pandas reduction used.
    for stat_name in ("mean", "median", "std"):
        df.loc[stat_name] = getattr(snapshot, stat_name)(skipna=True)
    return df
class CrossValidationResults(dict):
    """A summary of the results of CrossValidation.perform().

    Cross validate returns a dictionary of results for each model with the form:
    ```python
    {
        "model_name_1": {
            "model": [model_1_fold_1, model_1_fold_2, ...],
            "parameters": [params_1_fold_1, params_1_fold_2, ...],
            "metrics": [
                {
                    "metric_1_fold_1": metric_value_1_fold_1,
                    "metric_2_fold_1": metric_value_2_fold_1,
                    ...
                },
                {
                    "metric_1_fold_2": metric_value_1_fold_2,
                    "metric_2_fold_2": metric_value_2_fold_2,
                    ...
                },
            ],
            "y_pred": [y_pred_1_fold_1, y_pred_1_fold_2, ...],
            "y_test": [y_test_1_fold_1, y_test_1_fold_2, ...],
            "y_pred_train": [y_pred_train_1_fold_1, y_pred_train_1_fold_2, ...],
            "y_train": [y_train_1_fold_1, y_train_1_fold_2, ...],
        },
        "model_name_2": {
            ...
        },
        ...
    }
    ```
    A model entry may additionally carry a `"folds_by_metrics"` key holding the
    same metric values already grouped by metric name
    (`{"metric_1": [value_fold_1, value_fold_2, ...], ...}`). When it is
    missing, this grouping is derived from `"metrics"`.

    This class is a wrapper around this dictionary which provides a summary of the results.

    - `_make_summary` computes the mean, median and standard deviation of the metrics for each model.
    - `_make_summary` is called the first time the summary property is accessed and the result is cached.
    - `get_best_model_by_metric` returns the fitted model of the fold with the best metric value.

    Properties:
        summary (pd.DataFrame): Summary of the results.
    """

    def __init__(self, results_dict):
        super().__init__(results_dict)
        # Built lazily on first access of the `summary` property.
        self._summary = None
        # TODO add some kind of id or description

    def __repr__(self):
        # A plain-dict copy is formatted on purpose: pprint falls back to
        # repr() for dict subclasses that override __repr__, which would
        # re-enter this method endlessly.
        return f"CrossValidationResults {pformat(dict(self))}"

    @property
    def summary(self):
        """Property that returns a pandas dataframe with the fold values, mean, median and standard deviation of the metrics for each model."""
        if self._summary is None:
            self._summary = self._make_summary()
        return self._summary

    def _folds_by_metrics(self, model_name):
        """Returns the metric values of one model grouped by metric name.

        Reorders the data from
        ```python
        "metrics": [
            {"metric_1": value_1_fold_1, "metric_2": value_2_fold_1, ...},
            {"metric_1": value_1_fold_2, "metric_2": value_2_fold_2, ...},
        ],
        ```
        to the form
        ```python
        {
            "metric_1": [value_1_fold_1, value_1_fold_2, ...],
            "metric_2": [value_2_fold_1, value_2_fold_2, ...],
        }
        ```
        A precomputed `"folds_by_metrics"` entry is used as is when present,
        and so is a `"metrics"` entry that already is a dict.

        Args:
          model_name (str): The name of the model.

        Returns:
          (dict): Mapping of metric name to the list of per-fold values.
        """
        model_results = self[model_name]
        if "folds_by_metrics" in model_results:
            return model_results["folds_by_metrics"]

        metrics = model_results["metrics"]
        if isinstance(metrics, dict):
            return metrics

        # Metric names in first-seen order across all folds.
        metric_names = list(
            dict.fromkeys(name for fold_metrics in metrics for name in fold_metrics)
        )
        # A metric missing in a fold becomes NaN so that list positions keep
        # matching the fold ids.
        return {
            name: [fold_metrics.get(name, np.nan) for fold_metrics in metrics]
            for name in metric_names
        }

    def _make_summary(self):
        """Creates pandas dataframe with the fold values, mean, median and standard deviation of the metrics for each model.

        Columns: model names
        Multiindex from tuples: (fold id, metric)

        The per-fold values of every metric are taken from `_folds_by_metrics`
        and extended by the rows added by `add_summary_stats`.

        Returns:
          (pd.DataFrame): The summary for all models.
        """
        model_dfs = []
        for model in self:
            metrics_dfs = []
            for metric, fold_values in self._folds_by_metrics(model).items():
                # collect the values for each fold for single metric
                values_df = pd.DataFrame(fold_values, columns=[metric])
                values_df.index.name = "Fold"
                values_df = add_summary_stats(values_df)
                metrics_dfs.append(values_df)

            # concatenate the dataframes for each metric
            model_df = pd.concat(metrics_dfs, axis=1)

            # reshape to long format and add model name as column
            new_df = model_df.reset_index().melt(
                id_vars="Fold", var_name="Metric", value_name=model
            )
            # set the index to fold and metric
            new_df = new_df.set_index(["Fold", "Metric"])
            model_dfs.append(new_df)

        # concatenate the dataframes for each model
        summary_df = pd.concat(model_dfs, axis=1)
        return summary_df

    def get_best_model_by_metric(
        self, model_name=None, metric_name="mse", direction="min"
    ):
        """Returns the model with the best metric value for the given metric.

        Direction can be "min" or "max" and determines whether the best model is the one with the lowest or highest metric value.
        E.g. for MSE, direction should be "min" and for R2, direction should be "max".

        If `model_name` is None, the folds of all models are searched,
        otherwise only the folds of the named model.

        Args:
          model_name (str): Name of the model (Default value = None)
          metric_name (str): Name for the metric (Default value = "mse")
          direction (str): Minimize or maximize. (Default value = "min")

        Returns:
          (object): The model with the best metric value for the given metric.

        Raises:
          AssertionError: If direction is neither "min" nor "max".
          KeyError: If the model or the metric is not part of the results.
        """
        assert direction in [
            "min",
            "max",
        ], f"direction must be 'min' or 'max', got {direction}"

        arg_func = np.argmin if direction == "min" else np.argmax
        op_func = operator.lt if direction == "min" else operator.gt

        best_model_name = None
        best_metric_value = None
        best_metric_index = None

        candidates = self if model_name is None else [model_name]
        for candidate in candidates:
            metric_values = self._folds_by_metrics(candidate)[metric_name]
            # best fold of this model
            metric_index = arg_func(metric_values)
            metric_value = metric_values[metric_index]

            # strict comparison: on ties the first model found is kept
            if best_metric_value is None or op_func(metric_value, best_metric_value):
                best_metric_value = metric_value
                best_metric_index = metric_index
                best_model_name = candidate

        return self[best_model_name]["model"][best_metric_index]

    def get_predictions(self, model_name, fold_id):
        """Returns the predictions for the given model and fold.

        Args:
          model_name (str): The name of the model.
          fold_id (int): The id of the fold.

        Returns:
          (array-like): The predictions for the given model and fold.
        """
        return self[model_name]["y_pred"][fold_id]

    def get_true_values(self, model_name, fold_id):
        """Returns the true values for the given model and fold.

        Args:
          model_name (str): The name of the model.
          fold_id (int): The id of the fold.

        Returns:
          (array-like): The true values for the given model and fold.
        """
        return self[model_name]["y_test"][fold_id]

    def get_training_predictions(self, model_name, fold_id):
        """Returns the training predictions for the given model and fold.

        Args:
          model_name (str): The name of the model.
          fold_id (int): The id of the fold.

        Returns:
          (array-like): The training predictions for the given model and fold.
        """
        return self[model_name]["y_pred_train"][fold_id]

    def get_training_true_values(self, model_name, fold_id):
        """Returns the training true values for the given model and fold.

        Args:
          model_name (str): The name of the model.
          fold_id (int): The id of the fold.

        Returns:
          (array-like): The training true values for the given model and fold.
        """
        return self[model_name]["y_train"][fold_id]

    def get_params(self, model_name, fold_id):
        """Returns the parameters for the given model and fold.

        If the model has no "parameters" entry, returns None for every fold id
        and will not raise an error.

        Args:
          model_name (str): The name of the model.
          fold_id (int): The id of the fold.

        Returns:
          (dict): The parameters for the given model and fold.
        """
        parameters = self[model_name].get("parameters")
        if parameters is None:
            # No parameters were stored for this model at all.
            return None
        return parameters[fold_id]
class MergedSummary(CrossValidationResults):
    """Combined view on the summaries of two cross validation results.

    Both result objects are stored as attributes and their `summary`
    dataframes are joined column-wise each time `summary` is accessed.

    NOTE: the initializer of the base class is not invoked, so the instance
    itself holds no dictionary entries; only `summary` is meaningful.
    """

    def __init__(
        self,
        cv_results_1,
        cv_results_2,
    ):
        self.cv_results_1, self.cv_results_2 = cv_results_1, cv_results_2

    @property
    def summary(self):
        """Property that returns the merged summary dataframe of both results."""
        return self._merge()

    def _merge(self):
        """Merges the two summaries dataframes into one.

        Returns:
          (pd.DataFrame): Both summaries concatenated along the column axis.
        """
        summaries = [self.cv_results_1.summary, self.cv_results_2.summary]
        return pd.concat(summaries, axis=1)

    def __add__(self, other):
        # Adding merged summaries is not supported yet.
        raise NotImplementedError
# Script entry-point guard: currently a no-op, running this module directly does nothing.
if __name__ == "__main__":
    pass