/
column.py
209 lines (163 loc) · 6.67 KB
/
column.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import logging
import numpy
import pandas
from pandas.api.types import is_categorical_dtype
__all__ = ['categorical_to_numeric', 'encode_categorical', 'standardize']
def _apply_along_column(array, func1d, **kwargs):
    """Apply `func1d` to every column of `array`.

    Parameters
    ----------
    array : pandas.DataFrame or numpy.ndarray
        Data whose columns are processed one at a time.
    func1d : callable
        Function that receives a single column.
    **kwargs
        Extra keyword arguments forwarded to `func1d`.

    Returns
    -------
    result
        Output of ``DataFrame.apply`` for data frames, otherwise the
        output of ``numpy.apply_along_axis`` along axis 0.
    """
    if not isinstance(array, pandas.DataFrame):
        # plain arrays: axis 0 walks down the rows, i.e. one column per call
        return numpy.apply_along_axis(func1d, 0, array, **kwargs)
    return array.apply(func1d, **kwargs)
def standardize_column(series_or_array, with_std=True):
    """Center a single numeric column and optionally scale it.

    Parameters
    ----------
    series_or_array : pandas.Series or numpy.ndarray
        One column of data.
    with_std : bool, optional, default: True
        If ``True``, the centered values are divided by the sample
        standard deviation (``ddof=1``).

    Returns
    -------
    column
        A floating point copy with the mean subtracted (and scaled if
        requested) when the dtype is numeric; otherwise the input object
        itself, untouched.
    """
    if not issubclass(series_or_array.dtype.type, numpy.number):
        # non-numeric data is passed through as-is
        return series_or_array

    centered = series_or_array.astype(float)
    centered -= series_or_array.mean()
    if with_std:
        centered /= series_or_array.std(ddof=1)
    return centered
def standardize(table, with_std=True):
    """
    Perform Z-Normalization on each numeric column of the given table.

    For a pandas.DataFrame, only numeric columns are changed and every
    other column is left as it was. For a numpy.ndarray, values are only
    changed when the array has a numeric dtype, and the result then has
    floating point dtype.

    Parameters
    ----------
    table : pandas.DataFrame or numpy.ndarray
        Data to standardize.

    with_std : bool, optional, default: True
        If ``False`` data is only centered and not converted to unit variance.

    Returns
    -------
    normalized : pandas.DataFrame or numpy.ndarray
        Table with numeric columns normalized.
        Categorical columns in the input table remain unchanged.
    """
    is_frame = isinstance(table, pandas.DataFrame)
    # only data frames can carry columns of category dtype
    categorical = table.select_dtypes(include=['category']).columns if is_frame else []

    normalized = _apply_along_column(table, standardize_column, with_std=with_std)

    # apply() may turn category dtype into object, so restore those columns
    # from the input; see https://github.com/pydata/pandas/issues/9573
    for name in categorical:
        normalized[name] = table[name].copy()

    return normalized
def _encode_categorical_series(series, allow_drop=True):
    """One-hot encode a single series, leaving out the first level.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    allow_drop : bool, optional, default: True
        Forwarded to `_get_dummies_1d`; controls whether a column with
        fewer than two levels is dropped.

    Returns
    -------
    encoded : pandas.DataFrame, pandas.Series or None
        ``None`` if the column was dropped, an empty-valued series with the
        same index, name and dtype if there are no levels at all, the
        input series if it has a single level and dropping is disabled,
        otherwise a data frame with one ``name=level`` column for each
        level except the first.
    """
    dummies = _get_dummies_1d(series, allow_drop=allow_drop)
    if dummies is None:
        # the helper decided to drop this variable
        return None

    enc, levels = dummies
    if enc is None:
        return pandas.Series(index=series.index, name=series.name, dtype=series.dtype)

    n_levels = enc.shape[1]
    if n_levels == 1 and not allow_drop:
        return series

    # the first level acts as the reference and gets no column of its own
    names = ["{}={}".format(series.name, levels[key]) for key in range(1, n_levels)]
    return pandas.DataFrame(enc[:, 1:], columns=names, index=series.index)
def encode_categorical(table, columns=None, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode. A single series is
        accepted as well, provided it has `object` or `category` dtype.

    columns : list-like, optional, default: None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.

    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
        For series input, the result of encoding that one series is
        returned, which is ``None`` if the series was dropped.

    Raises
    ------
    TypeError
        If `table` is a series whose dtype is neither `category` nor `object`.
    """
    def _is_categorical_or_object(series):
        # Compare dtypes directly instead of reading ``dtype.char``: that
        # attribute is NumPy-specific, so extension dtypes would raise.
        dtype = series.dtype
        return isinstance(dtype, pandas.CategoricalDtype) or dtype == object

    if isinstance(table, pandas.Series):
        if not _is_categorical_or_object(table):
            raise TypeError("series must be of categorical dtype, but was {}".format(table.dtype))
        return _encode_categorical_series(table, **kwargs)

    if columns is None:
        # default: every column holding categories or generic objects
        # (``items`` replaces ``iteritems``, which pandas 2.0 removed)
        columns_to_encode = {name for name, series in table.items()
                             if _is_categorical_or_object(series)}
    else:
        columns_to_encode = set(columns)

    items = []
    for name, series in table.items():
        if name in columns_to_encode:
            series = _encode_categorical_series(series, **kwargs)
            if series is None:
                # column was dropped by the encoder
                continue
        items.append(series)

    # concat columns of tables
    new_table = pandas.concat(items, axis=1, copy=False)
    return new_table
def _get_dummies_1d(data, allow_drop=True):
    """Build the indicator matrix for one categorical variable.

    Parameters
    ----------
    data : pandas.Series
        Values to encode; its ``name`` is used in the log message.
    allow_drop : bool, optional, default: True
        If ``True``, a variable with fewer than two levels is dropped:
        a warning is logged and ``None`` is returned.

    Returns
    -------
    result : None or tuple
        ``None`` if the variable was dropped, ``(None, levels)`` if there
        are no levels at all, otherwise ``(matrix, levels)`` where
        `matrix` has one row per value and one column per level; rows
        belonging to missing values are filled with NaN.
    """
    # going through Categorical avoids inconsistent NaN handling
    categorical = pandas.Categorical(data)
    categories = categorical.categories
    n_levels = len(categories)

    # everything is NaN, or there is just a single level
    if allow_drop and n_levels < 2:
        logger = logging.getLogger(__package__)
        logger.warning(
            "dropped categorical variable '%s', because it has only %d values", data.name, n_levels)
        return None

    if n_levels == 0:
        return None, categories

    codes = categorical.codes
    indicators = numpy.eye(n_levels).take(codes, axis=0)
    # code -1 stands for a missing value; take() wrapped it around to the
    # last level, so overwrite those rows with NaN (GH4446)
    indicators[codes == -1] = numpy.nan
    return indicators, categories
def categorical_to_numeric(table):
    """Encode categorical columns to numeric by converting each category to
    an integer value.

    Columns of `category` dtype are replaced by their category codes.
    Columns of `object` dtype are cast to 64-bit integers if every value
    allows it; otherwise each distinct non-missing value is mapped to its
    rank in sorted order. Boolean columns become 0/1 integers. All other
    columns are returned as they are.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode.

    Returns
    -------
    encoded : pandas.DataFrame or pandas.Series
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    def transform(column):
        dtype = column.dtype
        # isinstance check instead of the deprecated is_categorical_dtype
        if isinstance(dtype, pandas.CategoricalDtype):
            return column.cat.codes
        # ``dtype == object`` also works for extension dtypes, which do not
        # offer the NumPy-only ``dtype.char`` attribute
        if dtype == object:
            try:
                nc = column.astype(numpy.int64)
            except ValueError:
                # numpy.sort returns a sorted copy, so the array handed out
                # by unique() is never modified in place
                classes = numpy.sort(column.dropna().unique(), kind="mergesort")
                codes = numpy.arange(classes.shape[0], dtype=numpy.int64)
                # explicit lookup instead of replace(): the result is numeric
                # without relying on replace() downcasting object results;
                # missing values have no entry and therefore stay NaN
                nc = column.map(dict(zip(classes, codes)))
            return nc
        if dtype == bool:
            return column.astype(numpy.int64)
        return column

    if isinstance(table, pandas.Series):
        return pandas.Series(transform(table), name=table.name, index=table.index)
    return table.apply(transform, axis=0, result_type='expand')