-
Notifications
You must be signed in to change notification settings - Fork 0
/
createhdf5database.py
176 lines (156 loc) · 6.05 KB
/
createhdf5database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import numpy as np
import time
import timeit
from random import shuffle
import h5py
import os
import pickle
class H5Database():
    '''
    Create an HDF5 database holding separate train/test samples and labels.

    Samples are accumulated in an in-memory buffer and written to the HDF5
    file in chunks of ``buffer_size`` to limit the number of I/O calls.

    Params:
        hdf5path: path to output hdf5 file
        data_shape: shape of a single sample, (n_rows, n_cols)
        buffer_size: number of samples buffered before each write
        train_length: total training samples
        test_length: total test samples
    '''
    def __init__(self, hdf5path, data_shape, buffer_size, train_length, test_length):
        self.db = h5py.File(hdf5path, mode='w')
        self.buffer_size = buffer_size
        self.train_length = train_length
        self.test_length = test_length
        self.sample_shape = data_shape
        self.train_db = None
        self.test_db = None
        self.train_label_db = None
        self.test_label_db = None
        # Write cursor into the currently active (train or test) dataset.
        self.idxs = {"index": 0}
        self.sample_buffer = []
        self.label_buffer = []
        self.create_hdf5_datasets()
        self.datasettype = "train"

    # Context-manager support so callers may write ``with H5Database(...) as db:``
    # and get the file closed even if an exception is raised mid-build.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def initialize_dataset(self, datasettype):
        '''Select the active dataset ("train" or "test") and reset the write cursor.'''
        self.idxs['index'] = 0
        self.datasettype = datasettype

    def _target_datasets(self):
        '''Return the (samples, labels) HDF5 datasets for the active dataset type.'''
        if self.datasettype == 'train':
            return self.train_db, self.train_label_db
        return self.test_db, self.test_label_db

    def flushbuffers(self):
        '''Write any remaining buffered samples/labels to disk and clear the buffers.'''
        if self.sample_buffer:  # skip a useless zero-length write
            sample_db, label_db = self._target_datasets()
            self._write_buffer(sample_db, self.sample_buffer)
            self._write_buffer(label_db, self.label_buffer)
            # BUGFIX: advance the write cursor so a later add() cannot
            # overwrite the rows that were just flushed.
            self.idxs['index'] += len(self.label_buffer)
        self._clean_buffers()

    def close(self):
        '''Close the underlying HDF5 file.'''
        print("Closing database")
        self.db.close()

    def create_hdf5_datasets(self):
        '''
        Create fixed-size train and test datasets (samples + integer labels).
        '''
        ROWS, COLS = self.sample_shape
        self.train_label_db = self.db.create_dataset("trainlabels", (self.train_length,), maxshape=None, dtype="int")
        self.test_label_db = self.db.create_dataset("testlabels", (self.test_length,), maxshape=None, dtype="int")
        self.train_db = self.db.create_dataset("trainsamples", shape=(self.train_length, ROWS, COLS), dtype="float")
        self.test_db = self.db.create_dataset("testsamples", shape=(self.test_length, ROWS, COLS), dtype="float")

    def add(self, label, sample):
        '''
        Buffer one (label, sample) pair; write the buffer to hdf5 when full.
        '''
        self.sample_buffer.append(sample)
        self.label_buffer.append(label)
        if len(self.sample_buffer) == self.buffer_size:
            sample_db, label_db = self._target_datasets()
            self._write_buffer(sample_db, self.sample_buffer)
            self._write_buffer(label_db, self.label_buffer)
            self.idxs['index'] += len(self.label_buffer)
            self._clean_buffers()

    def _write_buffer(self, dataset, buf):
        '''
        Write the buffered items into ``dataset`` starting at the write cursor.
        '''
        print("Writing Buffer {}".format(dataset))
        start = self.idxs['index']
        end = start + len(buf)
        dataset[start:end] = buf

    def _clean_buffers(self):
        '''
        Reset both sample and label buffers.
        '''
        self.sample_buffer = []
        self.label_buffer = []
def getlabel(label):
    '''
    Map a data-folder name to its integer class label.

    "Folder1" -> 0, "Folder2" -> 1, any other name -> 2.
    '''
    folder_to_class = {"Folder1": 0, "Folder2": 1}
    return folder_to_class.get(label, 2)
def builddatabase(datasamples, databasepath):
    '''
    Create an hdf5 database from csv sample files.

    Input:
        datasamples: list of paths to csv sample files; the class label is
            derived from the third '/'-separated path component (see getlabel).
        databasepath: path of the hdf5 file to create.

    Side effects: writes the hdf5 database file and saves the per-feature
    training mean to 'train_mean.npy'.

    NOTE(review): samples that fail to parse are skipped, so the fixed-size
    datasets may contain trailing all-zero rows — verify downstream readers
    tolerate this.
    '''
    # Column indices to keep from each csv file.
    features = [2, 3, 4, 5, 6, 9, 29, 32, 33, 34, 35, 37, 38, 42, 43, 44, 45, 46]
    timelength = 256  # number of trailing rows kept from each csv
    trainsplit = int(len(datasamples) * 0.75)  # 75/25 train/test split
    traindatasamples = datasamples[:trainsplit]
    testdatasamples = datasamples[trainsplit:]
    dataset = H5Database(databasepath, (timelength, len(features)),
                         25, len(traindatasamples),
                         len(testdatasamples))
    train_mean = np.zeros((1, len(features)), dtype=float)
    sample_count = 0
    features_sum = np.zeros((1, len(features)), dtype=float)
    print('######## Creating Train Dataset #########')
    dataset.initialize_dataset("train")
    for index, sample in enumerate(traindatasamples):
        print('Working on file:{} Complete : {:.2f}'.format(sample, 100.*index/len(traindatasamples)))
        try:
            data = np.genfromtxt(sample, delimiter=",")[-timelength:, features]
            labelname = sample.split("/")[2]
            label = getlabel(labelname)
            # BUGFIX: accumulate mean statistics only after the sample has
            # fully parsed, so samples that raise no longer skew the mean.
            features_sum += data.sum(0)
            sample_count += data.shape[0]
            train_mean = features_sum / sample_count
            dataset.add(label, data)
        except Exception as err:  # narrowed from bare except: keep Ctrl-C working
            print('Exception in processing : {} ({})'.format(sample, err))
            continue
    np.save('train_mean', train_mean)
    dataset.flushbuffers()
    print('######## Creating Test Dataset #########')
    dataset.initialize_dataset("test")
    for index, sample in enumerate(testdatasamples):
        print('Working on file:{} Complete : {:.2f}'.format(sample, 100.*index/len(testdatasamples)))
        try:
            data = np.genfromtxt(sample, delimiter=",")[-timelength:, features]
            labelname = sample.split("/")[2]
            label = getlabel(labelname)
            dataset.add(label, data)
        except Exception as err:  # narrowed from bare except
            print('Exception in processing : {} ({})'.format(sample, err))
            continue
    dataset.flushbuffers()
    dataset.close()
if __name__ == '__main__':
    # Collect every sample file under rootpath/<classfolder>/ and shuffle
    # before builddatabase performs its sequential train/test split.
    datasamples = []
    database = 'mydatabase.h5'
    rootpath = './dataset'
    datapaths = os.listdir(rootpath)  # fix: use rootpath instead of repeating the literal
    for datapath in datapaths:
        currentpath = os.path.join(rootpath, datapath)
        for sample in os.listdir(currentpath):
            samplepath = os.path.join(currentpath, sample)
            datasamples.append(samplepath)
    shuffle(datasamples)
    builddatabase(datasamples, database)