In [15]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import lightgbm as lgb

class UJIDataset(torch.utils.data.Dataset):
    def __init__(self, root, train = True, transform = None, target_transform = None, download = False):
        self.root = root
        dir_path = self.root + '/UJIndoorLoc'
        zip_path = self.root + '/uji_uil.zip'
        dataset_training_file = dir_path + '/trainingData.csv'
        dataset_validation_file = dir_path + '/validationData.csv'
        # Load independent variables (WAPs values)
        if train:
            dataset_file = dataset_training_file
        else:
            dataset_file = dataset_validation_file
        file = open(dataset_file, 'r')
        # Load labels
        label = file.readline()
        label = label.split(',')
        # Load independent variables
        file_load = np.loadtxt(file, delimiter = ',', skiprows = 1)
        #file_load_label = np.loadtxt(file, delimiter = ',')
        #data = np.genfromtxt(file, dtype = float, delimiter = ',', names = True)
        # RSSI values
        self.x = file_load[:, 0 : 520]
        # Load dependent variables
        self.y = file_load[:, 520 : 524]
        # Divide labels into x and y
        self.x_label = label[0 : 520]
        self.x_label = np.concatenate([self.x_label, label[524: 529]])
        self.y_label = label[520 : 524]
        # Regularization of independent variables
        self.x[self.x == 100] = np.nan    # WAP not detected
        self.x = self.x + 104             # Convert into positive values
        self.x = self.x / 104             # Regularize into scale between 0 and 1
        # Building ID, Space ID, Relative Position, User ID, Phone ID and Timestamp respectively
        self.x = np.concatenate([self.x, file_load[:, 524 : 529]], axis = 1)
        file.close()
        # Reduce the number of dependent variables by combining building number and floor into one variable: area
        self.area = self.y[:, 3] * 5 + self.y[:, 2]
    def to_tensor(self):
        self.x = torch.from_numpy(self.x).float()
        self.y = torch.from_numpy(self.y).float()
        self.area = torch.from_numpy(self.area).float()
    def nan_to_zero(self):
        self.x = np.nan_to_num(self.x)
    # Return the target instance (row)
    def __getitem__(self, index_row):
        return self.x[index_row, :], self.y[index_row, :]
    # Return the number of instances (the number of rows)
    def __len__(self, dim = 0):
        return int(self.x.size()[0])

Import dataset with lightgbm

In [3]:
a = torch.zeros([2,2])
print(a)
b = torch.ones([2,2])
c = torch.cat((a, b), dim = 1)
print(c)

tensor([[0., 0.],
        [0., 0.]])
tensor([[0., 0., 1., 1.],
        [0., 0., 1., 1.]])


In [16]:
# Load training dataset
dataset_train = UJIDataset('./data', train = True)
dataset_validate = UJIDataset('./data', train = False)
#gb_train_data = lgb.Dataset(

In [4]:
print(dataset_train.x_label)

['WAP001' 'WAP002' 'WAP003' 'WAP004' 'WAP005' 'WAP006' 'WAP007' 'WAP008'
 'WAP009' 'WAP010' 'WAP011' 'WAP012' 'WAP013' 'WAP014' 'WAP015' 'WAP016'
 'WAP017' 'WAP018' 'WAP019' 'WAP020' 'WAP021' 'WAP022' 'WAP023' 'WAP024'
 'WAP025' 'WAP026' 'WAP027' 'WAP028' 'WAP029' 'WAP030' 'WAP031' 'WAP032'
 'WAP033' 'WAP034' 'WAP035' 'WAP036' 'WAP037' 'WAP038' 'WAP039' 'WAP040'
 'WAP041' 'WAP042' 'WAP043' 'WAP044' 'WAP045' 'WAP046' 'WAP047' 'WAP048'
 'WAP049' 'WAP050' 'WAP051' 'WAP052' 'WAP053' 'WAP054' 'WAP055' 'WAP056'
 'WAP057' 'WAP058' 'WAP059' 'WAP060' 'WAP061' 'WAP062' 'WAP063' 'WAP064'
 'WAP065' 'WAP066' 'WAP067' 'WAP068' 'WAP069' 'WAP070' 'WAP071' 'WAP072'
 'WAP073' 'WAP074' 'WAP075' 'WAP076' 'WAP077' 'WAP078' 'WAP079' 'WAP080'
 'WAP081' 'WAP082' 'WAP083' 'WAP084' 'WAP085' 'WAP086' 'WAP087' 'WAP088'
 'WAP089' 'WAP090' 'WAP091' 'WAP092' 'WAP093' 'WAP094' 'WAP095' 'WAP096'
 'WAP097' 'WAP098' 'WAP099' 'WAP100' 'WAP101' 'WAP102' 'WAP103' 'WAP104'
 'WAP105' 'WAP106' 'WAP107' 'WAP108' 'WAP109' 'WAP1

In [11]:
dataset_train.nan_to_zero()
dataset_validate.nan_to_zero()
dataset_train.to_tensor()
dataset_validate.to_tensor()

In [17]:
print(dataset_train.y)

[[-7.53662120e+03  4.86493423e+06  2.00000000e+00  1.00000000e+00]
 [-7.51915240e+03  4.86494953e+06  2.00000000e+00  1.00000000e+00]
 [-7.52457040e+03  4.86493409e+06  2.00000000e+00  1.00000000e+00]
 ...
 [-7.51684150e+03  4.86488929e+06  3.00000000e+00  1.00000000e+00]
 [-7.53732190e+03  4.86489578e+06  3.00000000e+00  1.00000000e+00]
 [-7.53616580e+03  4.86489786e+06  3.00000000e+00  1.00000000e+00]]


In [18]:
x_train, x_validate, y_train, y_validate = sklearn.model_selection.train_test_split(dataset_train.x, dataset_train.y, test_size = 0.2, random_state = 42)