In [1]:
import numpy as np
import pandas as pd
import random
from random import shuffle

In [2]:
# Sample / label layout:
#   * every sample is a 4-hour window of 48 consecutive time steps;
#   * its label is the reading 30 minutes past that window, i.e. 270 minutes
#     (54 time steps) counted from the start of the window.
# The first label is therefore the 54th reading (index 53); the 53 readings
# before it are dropped from the label set.
x_window = 48
drop_t = 53

# Root data folder: sensor CSVs are read from its data_csvs/ sub-folder and the
# generated .npy splits are written directly into it.
# NOTE: this name shadows the built-in dir(); it is kept because other cells use it.
dir = '/home/grads/s/sharaj/sensor_proj/data/'

# Function that builds the data samples and their labels for one sensor file. Windows starting in the last 53 time steps have no label, so no sample is created for them.

def create_x_y(file_name, data_dir=None, label_offset=None, window=None):
    """Build the sliding-window samples and labels for one sensor file.

    Reads ``<data_dir>/data_csvs/<file_name>.csv`` as a single column of
    readings, strips thousands separators, turns non-numeric entries (such as
    'LOW') into NaN and fills them from the next reading (then from the
    previous one for any NaN left at the end of the file).

    Sample ``i`` is ``readings[i:i + window]`` and its label is
    ``readings[i + label_offset]``.

    Parameters
    ----------
    file_name : int or str
        Name of the CSV file without the ``.csv`` extension.
    data_dir : str, optional
        Root data folder. Defaults to the notebook-level ``dir``.
    label_offset : int, optional
        Index of the first reading used as a label. Defaults to the
        notebook-level ``drop_t``.
    window : int, optional
        Number of readings per sample. Defaults to the notebook-level
        ``x_window``.

    Returns
    -------
    x_matrix : numpy.ndarray
        Float array of shape ``(n_samples, window)``.
    sensor_label : numpy.ndarray
        1-D array of length ``n_samples``; empty when the file holds no more
        than ``label_offset`` readings.

    Raises
    ------
    ValueError
        If samples exist but ``window > label_offset + 1``, because the last
        windows would then run past the end of the file.
    """
    import os  # local import: not provided by the notebook's import cell

    # Fall back to the notebook-level configuration when nothing is overridden.
    if data_dir is None:
        data_dir = dir
    if label_offset is None:
        label_offset = drop_t
    if window is None:
        window = x_window

    csv_path = os.path.join(data_dir, 'data_csvs', str(file_name) + '.csv')
    frame = pd.read_csv(csv_path, names=['sensor_read'])

    # Drop thousands separators, then coerce text entries such as 'LOW' to NaN.
    as_text = frame['sensor_read'].astype(str).str.replace(',', '')
    frame['sensor_read'] = pd.to_numeric(as_text, errors='coerce')

    # Replace NaN entries by the next value, then by the previous one.
    frame = frame.bfill().ffill()

    # Taking the column directly always yields a 1-D array; np.squeeze would
    # collapse a one-row file to a 0-d array that cannot be sliced.
    readings = frame['sensor_read'].values

    sensor_label = readings[label_offset:]
    n_samples = sensor_label.shape[0]

    if n_samples and window > label_offset + 1:
        raise ValueError(
            'window (%d) must not exceed label_offset + 1 (%d)'
            % (window, label_offset + 1)
        )

    # Gather every window in one indexing operation instead of growing the
    # matrix row by row with np.vstack (which recopies it on each iteration).
    starts = np.arange(n_samples)[:, np.newaxis]
    steps = np.arange(window)[np.newaxis, :]
    x_matrix = readings[starts + steps].astype(float, copy=False)

    return x_matrix, sensor_label
    

In [3]:
# Create train and test sets by splitting whole sensor files using a random state.

train_set_files=90  # number of sensor files assigned to the training set


def _stack_sensor_files(file_ids):
    """Load every file in ``file_ids`` and stack their samples and labels once."""
    # The empty leading parts keep the float dtype and the (0, x_window) /
    # (0, 1) shapes when ``file_ids`` is empty.
    x_parts = [np.empty([0, x_window])]
    y_parts = [np.empty([0, 1])]
    for f in file_ids:
        x, y = create_x_y(f)
        x_parts.append(x)
        y_parts.append(y.reshape(-1, 1))
    # One stack per output instead of recopying the accumulated array per file.
    return np.vstack(x_parts), np.concatenate(y_parts)


def train_test_split(Random_state, n_files=113):
    """Split the sensor files into train and test sets and stack their samples.

    File ids ``1..n_files`` are shuffled after seeding the global ``random``
    generator with ``Random_state``. The first ``train_set_files`` ids form
    the training set and the remaining ids form the test set, so all samples
    from one file land on the same side of the split.

    Parameters
    ----------
    Random_state : int, float, str, bytes or None
        Seed handed to ``random.seed``; it must be a type that function accepts.
    n_files : int, optional
        Number of sensor files, numbered from 1. Defaults to 113.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Sample matrices with ``x_window`` columns and label columns of
        shape ``(n, 1)``.
    """
    random.seed(Random_state)
    indx = list(range(1, n_files + 1))
    shuffle(indx)

    x_train, y_train = _stack_sensor_files(indx[:train_set_files])
    x_test, y_test = _stack_sensor_files(indx[train_set_files:])

    return x_train, y_train, x_test, y_test
    


In [4]:


# Build one split with random state 10 and store the four arrays under the
# 'old_check' tag in the data folder.
seed = 'old_check'
x_train, y_train, x_test, y_test = train_test_split(10)
for _prefix, _array in (('x_train_', x_train), ('y_train_', y_train),
                        ('x_test_', x_test), ('y_test_', y_test)):
    np.save(dir + _prefix + str(seed) + '.npy', _array)

In [20]:
# Create several train/test splits across the sensor measurement files.
n_trials=5

# `randint` is not imported on its own in this notebook (only `shuffle` is),
# so it is called through the `random` module.
seed_indx = [(random.randint(1, 100), random.randint(1, 100)) for _ in range(n_trials)]

for seed in seed_indx:
    # random.seed() raises TypeError for tuples on Python 3.11+, so the split is
    # seeded with the tuple's text form. That text is also the file-name tag,
    # which means a saved split can be regenerated from its file name.
    x_train, y_train, x_test, y_test = train_test_split(str(seed))
    np.save(dir+'x_train_'+str(seed)+'.npy',x_train)
    np.save(dir+'y_train_'+str(seed)+'.npy',y_train)
    np.save(dir+'x_test_'+str(seed)+'.npy',x_test)
    np.save(dir+'y_test_'+str(seed)+'.npy',y_test)
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)


(1564103, 48)
(1564103, 1)
(383793, 48)
(383793, 1)
(1573979, 48)
(1573979, 1)
(373917, 48)
(373917, 1)
(1579303, 48)
(1579303, 1)
(368593, 48)
(368593, 1)
(1562669, 48)
(1562669, 1)
(385227, 48)
(385227, 1)
