-
Notifications
You must be signed in to change notification settings - Fork 0
/
discretize.py
69 lines (53 loc) · 2.07 KB
/
discretize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import numpy as np
import utils as util
import sys
def makeRanges(data, continuousValuedColumns):
    """Build the lower and upper bound of every column that will be binned.

    Every column in ``continuousValuedColumns`` first receives the default
    range ``[0, max(10, observed maximum)]``. A handful of columns are then
    overridden with their own fixed bounds (see the inline comments).

    Args:
        data: Table indexed by column name; ``data[col].max()`` must be
            available for every continuous column as well as for ``'age'``,
            ``'age_o'`` and ``'interests_correlate'``.
        continuousValuedColumns: Iterable of the column names to cover.

    Returns:
        A ``(startValues, endValues)`` pair of dicts, each keyed by column
        name.
    """
    lows, highs = {}, {}

    def assign(name, low, high):
        # Record both bounds of one column in a single step.
        lows[name] = low
        highs[name] = high

    # Default range: start at 0, end at the observed maximum but never
    # below 10.
    for name in continuousValuedColumns:
        assign(name, 0, max(10, data[name].max()))

    # Age columns start at 18 and end at the observed maximum, never
    # below 58.
    for name in ('age', 'age_o'):
        lows[name] = 18
    for name in ('age', 'age_o'):
        highs[name] = max(58, data[name].max())

    # Columns listed in util.psParticipants / util.psPartners are pinned to
    # the fixed range [0, 1] regardless of the observed values.
    for group in (util.psParticipants, util.psPartners):
        for name in group:
            assign(name, 0, 1)

    # The correlation column starts at -1 and ends at the observed maximum,
    # never below 1.
    assign('interests_correlate', -1,
           max(1, data['interests_correlate'].max()))

    return lows, highs
def continuousToBinConverter(data, columns, binCount):
    """Replace every continuous column of ``data`` with equal-width bin labels.

    The continuous columns are ``columns`` minus every name listed in
    ``util.notContinuousValuedColumns``. For each of them, the range returned
    by ``makeRanges`` is split into ``binCount`` equal-width bins and the
    column is replaced, at the same position, by the categorical bin index
    (``0`` .. ``binCount - 1``) computed with ``pandas.cut``. The lowest edge
    is inclusive (``include_lowest=True``).

    Args:
        data: DataFrame holding the columns to discretize. It is not modified
            in place; a new frame is returned.
        columns: List of column names, in the frame's column order.
        binCount: Number of bins per column; a positive integer.

    Returns:
        A ``(data, continuousValuedColumns)`` tuple: the frame with the
        binned columns, and the list of column names that were binned.

    Raises:
        ValueError: If ``binCount`` is smaller than 1, or if a name in
            ``util.notContinuousValuedColumns`` is missing from ``columns``.
    """
    if binCount < 1:
        raise ValueError("binCount must be a positive integer")

    continuousValuedColumns = list(columns)
    for col in util.notContinuousValuedColumns:
        continuousValuedColumns.remove(col)

    startValues, endValues = makeRanges(data, continuousValuedColumns)

    # The labels are the same for every column: one integer per bin.
    labels = list(range(binCount))

    for col in continuousValuedColumns:
        # linspace guarantees exactly binCount + 1 edges and a last edge equal
        # to the upper bound. Building the edges with arange and a float step
        # can yield one edge too many (label-count mismatch in pd.cut) or a
        # last edge slightly below the upper bound (maximum value -> NaN).
        edges = np.linspace(startValues[col], endValues[col], binCount + 1)
        binnedColumn = pd.cut(data[col], bins=edges, labels=labels,
                              include_lowest=True)

        # Swap the raw column for its binned version without moving it.
        position = columns.index(col)
        data = data.drop([col], axis=1)
        data.insert(position, col, binnedColumn)
        columns = list(data.columns)

    return data, continuousValuedColumns
if __name__ == "__main__":
    # Script entry point: read the dataset, bin its continuous columns into
    # 5 bins, print the per-bin counts of every binned column, and write the
    # result to CSV. When util.final is set, the input and output paths come
    # from the command line; otherwise fixed test file names are used.
    source_path = sys.argv[1] if util.final else 'test_dating.csv'
    columns, data = util.readFile(source_path)

    data, continuous_columns = continuousToBinConverter(data, columns, 5)

    # One line per binned column: "<name>: [count of bin 0, count of bin 1, ...]"
    for name in continuous_columns:
        print(name + ': ', end='')
        per_bin_counts = data[name].value_counts().sort_index()
        print(per_bin_counts.tolist())

    target_path = sys.argv[2] if util.final else 'test_dating-binned.csv'
    data.to_csv(target_path)