-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
100 lines (76 loc) · 2.95 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
#import utils as util
def stripQuotes(data, stripColumns):
    """Remove every single-quote character (') from the given columns, in place.

    Works on whole columns at once instead of visiting cells one by one, so
    missing values and non-string cells are left untouched rather than
    raising, and repeated index labels are handled correctly.

    Args:
        data: pandas DataFrame; modified in place.
        stripColumns: iterable of column labels holding text values.

    Returns:
        int: number of cells that contained at least one quote and were changed.
    """
    changeCount = 0
    for col in stripColumns:
        column = data[col]
        # na=False: missing / non-string cells count as "no quote present".
        hasQuote = column.str.contains("'", regex=False, na=False)
        if not hasQuote.any():
            continue
        cleaned = column.str.replace("'", "", regex=False)
        # Keep the original value wherever no quote was found, so cells that
        # are not strings are never overwritten by the cleaned column.
        data[col] = column.where(~hasQuote, cleaned)
        changeCount += int(hasQuote.sum())
    return changeCount
def toLowerCase(data, toLowerColumns):
    """Convert the text in the given columns to lower case, in place.

    Works on whole columns at once; missing values and non-string cells are
    left untouched instead of raising.

    Args:
        data: pandas DataFrame; modified in place.
        toLowerColumns: iterable of column labels holding text values.

    Returns:
        int: number of cells whose value actually changed.
    """
    changeCount = 0
    for col in toLowerColumns:
        original = data[col]
        lowered = original.str.lower()
        # `lowered` is missing for NaN / non-string cells; excluding those
        # guarantees such cells are never overwritten.
        changed = lowered.notna() & (lowered != original)
        if not changed.any():
            continue
        data[col] = original.where(~changed, lowered)
        changeCount += int(changed.sum())
    return changeCount
def encodeLabels(data, encodeColumns, printLabels):
    """Replace categorical values with integer codes, in place.

    For each column the distinct non-missing values are sorted and numbered
    0..k-1 in that order; every cell is then replaced by its code. Missing
    values are not given a code and remain missing.

    Args:
        data: pandas DataFrame; modified in place.
        encodeColumns: iterable of column labels to encode.
        printLabels: dict of column label -> value of interest. Accepted for
            interface compatibility; it is not used by the encoding. The code
            of such a value can be looked up in the returned mapping, e.g.
            ``mapping[col].get(value, -1)``.

    Returns:
        dict: ``{column: {original value: integer code}}`` for every encoded
        column.
    """
    labelMapping = {}
    for col in encodeColumns:
        if col in labelMapping:
            # Column listed twice: it is already encoded, re-mapping the
            # integer codes would wipe it out.
            continue
        uniqueValues = data[col].dropna().unique()
        labelMapping[col] = {
            value: code for code, value in enumerate(sorted(uniqueValues))
        }
        # Whole-column lookup: independent of the index labels, so frames
        # with repeated index labels are encoded row by row correctly.
        data[col] = data[col].map(labelMapping[col])
    return labelMapping
def normalizeColumns(data, psParticipants, psPartners):
    """Rescale two groups of columns so that each group sums to 1 per row.

    Within each group, every value is divided by the row's total over that
    group. A row whose group contains a missing value gets a missing total
    (and therefore missing results) for that group; a zero total yields
    NaN/inf, as plain floating-point division does.

    Args:
        data: pandas DataFrame; modified in place.
        psParticipants: column labels forming the first group.
        psPartners: column labels forming the second group.
    """
    # Compute every group's result from the untouched input before writing
    # anything back, so a column appearing in both groups is scaled from its
    # original values in each.
    normalized = []
    for group in (psParticipants, psPartners):
        cols = list(group)
        if not cols:
            continue
        values = data[cols]
        # skipna=False: a missing cell makes the whole row total missing
        # instead of being silently treated as 0.
        rowTotals = values.sum(axis=1, skipna=False)
        normalized.append((cols, values.div(rowTotals, axis=0)))
    for cols, frame in normalized:
        # Column replacement (not per-cell writes) lets integer columns
        # become float cleanly.
        data[cols] = frame
import sys
if __name__ == "__main__":
    # `util` is used throughout this script but the module-level
    # `import utils as util` is commented out, which makes the script fail
    # with NameError. Bind it here so the entry point is runnable.
    import utils as util

    if util.final:
        # Final run: input and output paths come from the command line.
        if len(sys.argv) < 3:
            sys.exit("usage: {0} <input_csv> <output_csv>".format(sys.argv[0]))
        columns, data = util.readFile(sys.argv[1], None)
    else:
        # Test run: fixed input file.
        columns, data = util.readFile('test_dataset.csv')

    # Answer to question 1.i
    stripQuotes(data, ['race', 'race_o', 'field'])

    # Answer to question 1.ii
    toLowerCase(data, ['field'])

    # Answer to question 1.iii
    printLabels = {'gender': 'male',
                   'race': 'European/Caucasian-American',
                   'race_o': 'Latino/Hispanic American',
                   'field': 'law'}
    encodeLabels(data, ['gender', 'race', 'race_o', 'field'], printLabels)

    # Answer to question 1.iv
    normalizeColumns(data, util.psParticipants, util.psPartners)

    if util.final:
        # Final run: write to the path given on the command line.
        data.to_csv(sys.argv[2])
    else:
        data.to_csv('test_dating.csv')