-
Notifications
You must be signed in to change notification settings - Fork 0
/
easyensemble.py
152 lines (113 loc) · 4.56 KB
/
easyensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from __future__ import division, print_function

import copy

import numpy as np
# Public API: the only name exported by a wildcard import of this module.
__all__ = ["EasyEnsemble"]
class EasyEnsemble(object):
    """EasyEnsemble: average many classifiers trained on balanced undersamples.

    For every resample, the majority class is randomly undersampled (without
    replacement) down to the size of the minority class, the result is stacked
    with all minority samples, and an independent copy of each supplied
    classifier is trained on that balanced set. Predictions average the class
    probabilities of all trained models.

    Note: the model is only applicable to binary classification problems where
    the labels are 0/1.

    Parameters
    ----------
    resamples : int
        Number of balanced resamples to draw when building classifiers.

    classifiers : list
        Classifier templates to train on each resample. A single classifier
        (not wrapped in a list) is also accepted. Each classifier must provide
        ``fit(X, y)`` and ``predict_proba(X)``.

    verbose : boolean
        Whether to print training progress.

    Attributes
    ----------
    trained_models : list
        Fitted models, available after ``fit``.

    training_complete : boolean
        True once ``fit`` has finished successfully.

    Notes
    -----
    The number of trained models is ``resamples * len(classifiers)``.
    Example: 10 resamples with 3 classifiers trains and stores 30 models.
    """

    def __init__(self, resamples=100, classifiers=None, verbose=False):
        self.resamples = resamples

        # Force classifiers to be a list
        if classifiers is None:
            classifiers = []
        elif isinstance(classifiers, tuple):
            classifiers = list(classifiers)
        elif not isinstance(classifiers, list):
            classifiers = [classifiers]
        self.classifiers = classifiers

        self.verbose = verbose
        self.trained_models = []
        self.training_complete = False

    def _check_is_fitted(self):
        """Raise AssertionError unless ``fit`` has produced at least one model."""
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O`` while keeping the exception type callers already see.
        if not self.training_complete or not getattr(self, "trained_models", None):
            raise AssertionError("Need to train models first using .fit() method")

    def fit(self, X=None, y=None):
        """Train the EasyEnsemble model.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix of training data for input into classifiers.

        y : 1d array-like
            Vector of ground truth labels (0/1) of training data.

        Returns
        -------
        None
            The trained models are stored in ``self.trained_models``.

        Raises
        ------
        ValueError
            If X or y is missing, their lengths differ, no classifier was
            supplied, ``resamples`` is smaller than 1, or y does not contain
            exactly the labels 0 and 1.
        AssertionError
            If the smallest label in y is not 0.
        """
        if X is None or y is None:
            raise ValueError("Both X and y are required to train the model")
        if not self.classifiers:
            raise ValueError("At least one classifier is required to train the model")
        if self.resamples < 1:
            raise ValueError("resamples must be at least 1, not %d" % self.resamples)

        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same number of samples (%d != %d)"
                % (len(X), len(y))
            )

        # Check to make sure labels start at 0
        lowest_label = np.min(y)
        if lowest_label != 0:
            raise AssertionError(
                "Minimum class label should be 0, not %d. "
                "Fix labels before training model" % lowest_label
            )

        # Count samples per class; cast to int because bincount rejects floats
        class_counts = np.bincount(y.astype('int'))
        if len(class_counts) != 2:
            raise ValueError("y must contain exactly the two class labels 0 and 1")

        # With two classes the majority is simply "the other one". Using
        # argmax here would pick the same class as argmin when counts tie.
        min_class = int(np.argmin(class_counts))
        maj_class = 1 - min_class
        n_min = class_counts[min_class]
        n_maj = class_counts[maj_class]

        # Split data into minority and majority classes
        min_idx = np.where(y == min_class)[0]
        maj_idx = np.where(y == maj_class)[0]
        X_min, y_min = X[min_idx], y[min_idx]
        X_maj, y_maj = X[maj_idx], y[maj_idx]

        # Undersample the majority class (size == n_min, without replacement)
        # and train one fresh model per classifier on each balanced set
        self.training_complete = False
        trained = []
        for i in range(self.resamples):
            # Print training iteration every 10 samples
            if self.verbose and (i + 1) % 10 == 0:
                print('Training sample %d/%d\n' % (i + 1, self.resamples))

            idx_boot = np.random.choice(np.arange(n_maj), size=n_min, replace=False)

            # Create balanced training set
            X_balanced = np.vstack((X_min, X_maj[idx_boot]))
            y_balanced = np.concatenate((y_min, y_maj[idx_boot]))

            for template in self.classifiers:
                # Fit a copy: refitting the template itself would leave every
                # ensemble entry pointing at one object that only remembers
                # the last resample.
                model = copy.deepcopy(template)
                model.fit(X_balanced, y_balanced)
                trained.append(model)

        self.trained_models = trained
        self.training_complete = True

    def predict(self, X=None):
        """Predict class labels on testing data.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix of testing data.

        Returns
        -------
        y_hat : 1d array-like
            Vector of predicted class labels (index of the class with the
            highest averaged probability).

        Raises
        ------
        AssertionError
            If the model has not been trained with ``fit``.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X=None):
        """Predict class probabilities on testing data.

        Note: assumes every classifier has a built-in ``predict_proba`` method.

        Parameters
        ----------
        X : 2d array-like
            Feature matrix of testing data.

        Returns
        -------
        y_hat : 2d array-like
            Averaged class probabilities; each row has one column per class.

        Raises
        ------
        AssertionError
            If the model has not been trained with ``fit``.
        """
        self._check_is_fitted()

        # Sum the per-model probabilities, then normalize by the model count
        total = None
        for model in self.trained_models:
            probs = np.asarray(model.predict_proba(X))
            total = probs if total is None else total + probs
        return total / len(self.trained_models)