/
test_preprocessing.py
103 lines (79 loc) · 3.51 KB
/
test_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from collections import OrderedDict
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import pytest
from sksurv.preprocessing import OneHotEncoder
def _encoded_data(data):
    """Build the reference one-hot encoding of ``data``.

    Columns exposing the ``.cat`` accessor are expanded into float64
    indicator columns named ``"<column>=<category>"``, one per category
    except the first. All other columns are passed through unchanged.
    The output keeps the column order of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Frame holding the pass-through and indicator columns.
    """
    expected = []
    # DataFrame.items() replaces iteritems(), which was deprecated in
    # pandas 1.5 and removed in pandas 2.0.
    for nam, col in data.items():
        if hasattr(col, "cat"):
            # The first category is skipped, so k categories yield
            # k - 1 indicator columns.
            for cat in col.cat.categories[1:]:
                name = '{}={}'.format(nam, cat)
                s = pd.Series(col == cat, dtype=np.float64)
                expected.append((name, s))
        else:
            expected.append((nam, col))

    expected_data = pd.DataFrame.from_dict(OrderedDict(expected))
    return expected_data
@pytest.fixture
def create_data():
    """Fixture returning a factory for seeded mixed-type test frames.

    The returned callable accepts ``n_samples`` (default 117) and yields a
    tuple ``(data, expected)``: ``data`` has five numeric columns
    ``N0``..``N4`` followed by four categorical columns, and ``expected``
    is ``_encoded_data(data)``.
    """
    def _create_data(n_samples=117):
        rng = np.random.RandomState(51365192)

        # The numeric block is drawn first, then each categorical column in
        # the listed order, so the seeded stream is consumed in a fixed order.
        numeric_names = ["N%d" % i for i in range(5)]
        numeric_part = pd.DataFrame(rng.rand(n_samples, 5),
                                    columns=numeric_names)

        # (column name, binomial trials, success probability, category labels)
        categorical_specs = [
            ("binary_1", 1, 0.6, ["Yes", "No"]),
            ("binary_2", 1, 0.376, ["East", "West"]),
            ("trinary", 2, 0.76, ["Green", "Blue", "Red"]),
            ("many", 5, 0.47, ["One", "Two", "Three", "Four", "Five", "Six"]),
        ]
        categorical_columns = OrderedDict()
        for column, trials, prob, labels in categorical_specs:
            codes = rng.binomial(trials, prob, n_samples)
            categorical_columns[column] = pd.Categorical.from_codes(codes, labels)
        categorical_part = pd.DataFrame(categorical_columns)

        data = pd.concat((numeric_part, categorical_part), axis=1)
        expected = _encoded_data(data)
        return data, expected

    return _create_data
class TestOneHotEncoder(object):
    """Tests for ``OneHotEncoder`` on frames mixing numeric and categorical columns.

    Frame comparisons use the public ``pd.testing`` API rather than
    ``pandas.util.testing``, which was deprecated in pandas 1.0 and removed
    in pandas 2.0.
    """

    @staticmethod
    def test_fit(create_data):
        """Fitting records the categorical feature names, encoded columns and categories."""
        data, expected_data = create_data()

        t = OneHotEncoder().fit(data)

        assert t.feature_names_.tolist() == ['binary_1', 'binary_2', 'trinary', 'many']
        assert set(t.encoded_columns_) == set(expected_data.columns)

        # NOTE(review): dict equality on pandas Index values only works when
        # both sides hold the very same Index objects (identity short-circuit);
        # presumably the encoder stores `.cat.categories` as-is -- confirm.
        assert t.categories_ == {k: data[k].cat.categories
                                 for k in ['binary_1', 'binary_2', 'trinary', 'many']}

    @staticmethod
    def test_fit_transform(create_data):
        """``fit_transform`` output equals the reference encoding."""
        data, expected_data = create_data()

        actual_data = OneHotEncoder().fit_transform(data)
        pd.testing.assert_frame_equal(actual_data, expected_data)

    @staticmethod
    def test_transform(create_data):
        """A fitted encoder transforms a new frame, also when its columns are reordered."""
        data, _ = create_data()

        t = OneHotEncoder().fit(data)

        # Transform a frame with a different number of samples than was fitted.
        data, expected_data = create_data(165)
        actual_data = t.transform(data)
        pd.testing.assert_frame_equal(actual_data, expected_data)

        # Move the categorical columns in between the numeric ones; the
        # result must still equal the same expected frame.
        data = pd.concat((data.iloc[:, :2], data.iloc[:, 5:], data.iloc[:, 2:5]), axis=1)
        actual_data = t.transform(data)
        pd.testing.assert_frame_equal(actual_data, expected_data)

    @staticmethod
    def test_transform_other_columns(create_data):
        """``transform`` raises ``ValueError`` naming fitted features absent from the input."""
        data, _ = create_data()

        t = OneHotEncoder().fit(data)
        data, _ = create_data(125)

        # One categorical column renamed.
        data_renamed = data.rename(columns={"binary_1": "renamed_1"})
        with pytest.raises(ValueError, match=r"1 features are missing from data: \['binary_1'\]"):
            t.transform(data_renamed)

        # One categorical column dropped.
        data_dropped = data.drop('trinary', axis=1)
        with pytest.raises(ValueError, match=r"1 features are missing from data: \['trinary'\]"):
            t.transform(data_dropped)

        # Two categorical columns renamed.
        data_renamed = data.rename(columns={"binary_1": "renamed_1", "many": "too_many"})
        with pytest.raises(ValueError, match=r"2 features are missing from data: \['binary_1', 'many'\]"):
            t.transform(data_renamed)