/
model_utils.py
161 lines (144 loc) · 6.46 KB
/
model_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import numpy as np
import matplotlib.pyplot as plt
import dython.nominal as nominal
from scipy import interp
from sklearn.metrics import roc_curve, auc
from dython._private import convert
def _display_plot():
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
def binary_roc_graph(y_true, y_pred, **kwargs):
"""
This function plots a ROC graph of a binary-class predictor. AUC calculation are presented as-well.
Data can be either: (1) one dimensional, where the values of y_true represent the true class and y_pred the
predicted probability of that class, or (2) two-dimensional, where each line in y_true is a one-hot-encoding
of the true class and y_pred holds the predicted probabilities of each class.
For example, consider a data-set of two data-points where the true class of the first line is class 0, which
was predicted with a probability of 0.6, and the second line's true class is 1, with predicted probability of
0.8. In the first configuration, the input will be: y_true = [0,1], y_pred = [0.6,0.8]. In the second
configuration, the input will be: y_true = [[1,0],[0,1]], y_pred = [[0.6,0.4],[0.2,0.8]].
Based on sklearn examples (as was seen on April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
Parameters
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
kwargs : any key-value pairs
Different options and configurations
"""
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1:
y_t = y_true
y_p = y_pred
else:
y_t = [np.argmax(x) for x in y_true]
y_p = [x[1] for x in y_pred]
fpr, tpr, _ = roc_curve(y_t, y_p)
auc_score = auc(fpr,tpr)
color = kwargs.get('color','darkorange')
lw = kwargs.get('lw', 2)
ls = kwargs.get('ls','-')
ms = kwargs.get('ms', 10)
fmt = kwargs.get('fmt','.2f')
if 'class_label' in kwargs:
class_label = ': {}'.format(kwargs['class_label'])
else:
class_label = ''
if kwargs.get('new_figure',True):
plt.figure()
plt.plot(fpr, tpr, color=color, lw=lw, ls=ls, label='ROC curve{class_label} (AUC = {auc:{fmt}})'
.format(class_label=class_label,auc=auc_score,fmt=fmt))
if kwargs.get('show_graphs',True):
_display_plot()
if kwargs.get('return_pr',False):
return {'fpr': fpr, 'tpr': tpr}
def _plot_macro_roc(fpr, tpr, n, **kwargs):
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n
fpr_macro = all_fpr
tpr_macro = mean_tpr
auc_macro = auc(fpr_macro, tpr_macro)
fmt = kwargs.get('fmt', '.2f')
lw = kwargs.get('lw', 2)
plt.plot(fpr_macro, tpr_macro, label='ROC curve: macro (AUC = {auc:{fmt}})'.format(auc=auc_macro,fmt=fmt),
color='navy', ls=':', lw=lw)
def roc_graph(y_true, y_pred, micro=True, macro=True, **kwargs):
"""
Plot a ROC graph of predictor's results (inclusding AUC scores), where each row of y_true and y_pred
represent a single example.
If there are 1 or two columns only, the data is treated as a binary classification, in which
the result is similar to the `binary_roc_graph` method, see its documentation for more information.
If there are more then 2 columns, each column is considered a unique class, and a ROC graph and AUC
score will be computed for each. A Macro-ROC and Micro-ROC are computed and plotted too by default.
Based on sklearn examples (as was seen on April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
**Example:** See `roc_graph_example` under `dython.examples`
Parameters
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
micro : Boolean, default = True
Whether to calculate a Micro ROC graph (not applicable for binary cases)
macro : Boolean, default = True
Whether to calculate a Macro ROC graph (not applicable for binary cases)
kwargs : any key-value pairs
Different options and configurations
"""
all_fpr = list()
all_tpr = list()
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1 or y_pred.shape[1] <= 2:
return binary_roc_graph(y_true, y_pred, **kwargs)
else:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
n = y_pred.shape[1]
plt.figure()
kwargs['new_figure'] = False
kwargs['show_graphs'] = False
kwargs['return_pr'] = True
for i in range(0,n):
pr = binary_roc_graph(y_true[:,i], y_pred[:,i],
color=colors[i % len(colors)],class_label=i, **kwargs)
all_fpr.append(pr['fpr'])
all_tpr.append(pr['tpr'])
if micro:
binary_roc_graph(y_true.ravel(), y_pred.ravel(), ls=':',
color='deeppink', class_label='micro', **kwargs)
if macro:
_plot_macro_roc(all_fpr,all_tpr,n)
_display_plot()
def random_forest_feature_importance(forest, features, **kwargs):
"""
Given a trained `sklearn.ensemble.RandomForestClassifier`, plot the different features based on their
importance according to the classifier, from the most important to the least.
Parameters
----------
forest : sklearn.ensemble.RandomForestClassifier
A trained `RandomForestClassifier`
features : list
A list of the names of the features the classifier was trained on, ordered by the same order the appeared
in the training data
kwargs : any key-value pairs
Different options and configurations
"""
return sorted(zip(map(lambda x: round(x, kwargs.get('precision',4)), forest.feature_importances_), features),
reverse=True)