-
Notifications
You must be signed in to change notification settings - Fork 182
/
missing_values_cleaner.py
192 lines (152 loc) · 6.21 KB
/
missing_values_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import numpy as np
from scipy import stats
from skmultiflow.transform.base_transform import StreamTransform
from skmultiflow.utils import FastBuffer, get_dimensions
class MissingValuesCleaner(StreamTransform):
    """ Fill missing values with some defined value.

    Provides a simple way to replace missing values in data samples with some
    value. The imputation value can be set via a set of imputation strategies.

    Parameters
    ----------
    missing_value: int, float or list (Default: numpy.nan)
        Missing value (or list of missing values) to replace. NaN entries
        are always treated as missing, whatever this parameter is set to.

    strategy: string (Default: 'zero')
        The strategy adopted to find the missing value replacement. It can
        be one of the following: 'zero', 'mean', 'median', 'mode', 'custom'.

    window_size: int (Default: 200)
        Defines the window size for the 'mean', 'median' and 'mode' strategies.

    new_value: int (Default: 1)
        This is the replacement value in case the chosen strategy is 'custom'.
        It is also the fallback for the 'mean', 'median' and 'mode' strategies
        while the window holds no valid observation for a feature.

    Raises
    ------
    ValueError
        If `strategy` is not one of the supported strategies.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.transform.missing_values_cleaner import MissingValuesCleaner
    >>> # Setting up a stream
    >>> stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/"
    ...                     "streaming-datasets/master/covtype.csv")
    >>> # Setting up the filter to substitute values -47 by the median of the
    >>> # last 10 samples
    >>> cleaner = MissingValuesCleaner(-47, 'median', 10)
    >>> X, y = stream.next_sample(10)
    >>> X[9, 0] = -47
    >>> # We will use this list to keep track of values
    >>> data = []
    >>> # Iterate over the first 9 samples, to build a sample window
    >>> for i in range(9):
    ...     X_transf = cleaner.partial_fit_transform([X[i].tolist()])
    ...     data.append(X_transf[0][0])
    >>>
    >>> # Transform last sample. The first feature should be replaced by the list's
    >>> # median value
    >>> X_transf = cleaner.partial_fit_transform([X[9].tolist()])
    >>> np.median(data)

    Notes
    -----
    A missing value in a sample can be coded in many different ways, but the
    most common one is to use numpy's NaN, which is why that is the default
    missing value parameter.

    The user should choose the correct substitution strategy for their use
    case, as each strategy has its pros and cons. The strategy can be chosen
    from a set of predefined strategies, which are: 'zero', 'mean', 'median',
    'mode', 'custom'.

    Notice that `MissingValuesCleaner` can actually be used to replace arbitrary
    values.

    `transform` itself appends every sample it processes to the sliding
    window, so calling `partial_fit` and then `transform` on the same data
    stores that data twice.
    """

    # Strategies whose replacement is computed from the sliding window.
    _WINDOW_STRATEGIES = ('mean', 'median', 'mode')
    # Every strategy accepted by the constructor.
    _STRATEGIES = ('zero',) + _WINDOW_STRATEGIES + ('custom',)

    def __init__(self, missing_value=np.nan, strategy='zero', window_size=200, new_value=1):
        super().__init__()
        # Always keep the missing markers as a list so membership tests work
        # the same way for one marker or several.
        if isinstance(missing_value, list):
            self.missing_value = missing_value
        else:
            self.missing_value = [missing_value]
        self.strategy = strategy
        self.window_size = window_size
        self.window = None
        self.new_value = new_value
        self.__configure()

    def __configure(self):
        """ Validate the strategy and create the window when one is needed. """
        if self.strategy not in self._STRATEGIES:
            # Fail fast: an unknown strategy would otherwise make
            # `_get_substitute` return None and silently corrupt the data.
            raise ValueError("Unknown strategy '{}'. Valid options are: {}"
                             .format(self.strategy, ', '.join(self._STRATEGIES)))
        if self.strategy in self._WINDOW_STRATEGIES:
            self.window = FastBuffer(max_size=self.window_size)

    def _is_missing(self, value):
        """ _is_missing

        Tells whether a single cell must be treated as missing.

        Parameters
        ----------
        value: object
            The cell content.

        Returns
        -------
        bool
            True if `value` is one of the configured missing markers or NaN.
        """
        if value in self.missing_value:
            return True
        try:
            # NaN never compares equal to itself, so the membership test above
            # cannot be relied upon to detect it.
            return bool(np.isnan(value))
        except TypeError:
            # Non-numeric content (e.g. a string) cannot be NaN.
            return False

    def transform(self, X):
        """ transform

        Does the transformation process in the samples in X.

        For the 'mean', 'median' and 'mode' strategies each sample is also
        appended to the sliding window before its missing entries are
        replaced.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.
            Missing entries are replaced in place.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data (the same object as `X`).
        """
        r, c = get_dimensions(X)
        uses_window = self.strategy in self._WINDOW_STRATEGIES
        for i in range(r):
            if uses_window:
                # Store a real copy: slicing a numpy row only yields a view,
                # which the in-place replacement below would silently modify.
                self.window.add_element([np.array(X[i])])
            for j in range(c):
                if self._is_missing(X[i][j]):
                    X[i][j] = self._get_substitute(j)
        return X

    def _get_substitute(self, column_index):
        """ _get_substitute

        Computes the replacement for a missing value.

        Window based statistics only consider valid observations: entries of
        the window that are themselves missing (NaN or one of the configured
        markers) are ignored.

        Parameters
        ----------
        column_index: int
            The index from the column where the missing value was found.

        Returns
        -------
        int or float
            The replacement.

        Raises
        ------
        ValueError
            If the configured strategy is unknown.
        """
        if self.strategy == 'zero':
            return 0
        if self.strategy == 'custom':
            return self.new_value
        if self.strategy not in self._WINDOW_STRATEGIES:
            # Only reachable if `strategy` was changed after construction.
            raise ValueError("Unknown strategy '{}'.".format(self.strategy))

        if self.window.is_empty():
            return self.new_value

        column = np.array(self.window.get_queue())[:, column_index]
        observed = np.array([value for value in column if not self._is_missing(value)])
        if observed.size == 0:
            # Nothing valid to compute a statistic from.
            return self.new_value

        if self.strategy == 'mean':
            return np.mean(observed)
        if self.strategy == 'median':
            return np.median(observed)

        # 'mode': np.unique returns sorted values, so on ties argmax picks
        # the smallest most frequent value, always as a scalar.
        values, counts = np.unique(observed, return_counts=True)
        return values[np.argmax(counts)]

    def partial_fit_transform(self, X, y=None):
        """ partial_fit_transform

        Partially fits the model and then apply the transform to the data.

        This delegates to `transform`, which already adds the samples to the
        sliding window before replacing their missing values.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.
        """
        return self.transform(X)

    def partial_fit(self, X, y=None):
        """ partial_fit

        Partial fits the model.

        Adds the samples in X to the sliding window when the strategy relies
        on one; does nothing otherwise.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        MissingValuesCleaner
            self
        """
        if self.strategy in self._WINDOW_STRATEGIES:
            # Copy so later changes to the caller's data do not leak into the
            # window, and promote a single 1-D sample to one row so that it
            # is stored as a sample rather than as individual scalars.
            samples = np.atleast_2d(np.array(X))
            self.window.add_element(samples)
        return self