In [1]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import lightgbm as lgb

class UJIDataset(torch.utils.data.Dataset):
    def __init__(self, root, train = True, transform = None, target_transform = None, download = False):
        self.root = root
        dir_path = self.root + '/UJIndoorLoc'
        zip_path = self.root + '/uji_uil.zip'
        dataset_training_file = dir_path + '/trainingData.csv'
        dataset_validation_file = dir_path + '/validationData.csv'
        # Load independent variables (WAPs values)
        if train:
            dataset_file = dataset_training_file
        else:
            dataset_file = dataset_validation_file
        file = open(dataset_file, 'r')
        # Load labels
        label = file.readline()
        label = label.split(',')
        # Load independent variables
        file_load = np.loadtxt(file, delimiter = ',', skiprows = 1)
        #file_load_label = np.loadtxt(file, delimiter = ',')
        #data = np.genfromtxt(file, dtype = float, delimiter = ',', names = True)
        # RSSI values
        self.x = file_load[:, 0 : 520]
        # Load dependent variables
        self.y = file_load[:, 520 : 524]
        # Divide labels into x and y
        self.x_label = label[0 : 520]
        self.x_label = np.concatenate([self.x_label, label[524: 529]])
        self.y_label = label[520 : 524]
        # Regularization of independent variables
        self.x[self.x == 100] = np.nan    # WAP not detected
        self.x = self.x + 104             # Convert into positive values
        self.x = self.x / 104             # Regularize into scale between 0 and 1
        # Building ID, Space ID, Relative Position, User ID, Phone ID and Timestamp respectively
        self.x = np.concatenate([self.x, file_load[:, 524 : 529]], axis = 1)
        file.close()
        # Reduce the number of dependent variables by combining building number and floor into one variable: area
        area = self.y[:, 3] * 5 + self.y[:, 2]
        self.y = np.column_stack((self.y, area))
    def to_tensor(self):
        self.x = torch.from_numpy(self.x).float()
        self.y = torch.from_numpy(self.y).float()
        self.area = torch.from_numpy(self.area).float()
    def nan_to_zero(self):
        self.x = np.nan_to_num(self.x)
    # Return the target instance (row)
    def __getitem__(self, index_row):
        return self.x[index_row, :], self.y[index_row, :]
    # Return the number of instances (the number of rows)
    def __len__(self, dim = 0):
        return int(self.x.size()[0])

In [2]:
# Calculate Euclidean distance (unit: meter) between two coordinates in EPSG:3857 
def euclidean_distance(latitude_1, longitude_1, latitude_2, longitude_2):
    return np.sqrt((latitude_1 - latitude_2)**2 + (longitude_1 - longitude_2)**2)

Import dataset with lightgbm

In [3]:
a = torch.zeros([2,2])
print(a)
b = torch.ones([2,2])
c = torch.cat((a, b), dim = 1)
print(c)

tensor([[0., 0.],
        [0., 0.]])
tensor([[0., 0., 1., 1.],
        [0., 0., 1., 1.]])


In [4]:
# Load training dataset
dataset_train = UJIDataset('./data', train = True)
dataset_test = UJIDataset('./data', train = False)
#gb_train_data = lgb.Dataset(

In [5]:
#dataset_train.nan_to_zero()
#dataset_validate.nan_to_zero()
#dataset_train.to_tensor()
#dataset_validate.to_tensor()


In [6]:
print(dataset_train.y)

[[-7.53662120e+03  4.86493423e+06  2.00000000e+00  1.00000000e+00
   7.00000000e+00]
 [-7.51915240e+03  4.86494953e+06  2.00000000e+00  1.00000000e+00
   7.00000000e+00]
 [-7.52457040e+03  4.86493409e+06  2.00000000e+00  1.00000000e+00
   7.00000000e+00]
 ...
 [-7.51684150e+03  4.86488929e+06  3.00000000e+00  1.00000000e+00
   8.00000000e+00]
 [-7.53732190e+03  4.86489578e+06  3.00000000e+00  1.00000000e+00
   8.00000000e+00]
 [-7.53616580e+03  4.86489786e+06  3.00000000e+00  1.00000000e+00
   8.00000000e+00]]


In [7]:
x_train, x_validate, y_train, y_validate = sklearn.model_selection.train_test_split(dataset_train.x, dataset_train.y, test_size = 0.2, random_state = 42)

In [8]:
print(y_train)

[[-7.68013460e+03  4.86493149e+06  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-7.37280290e+03  4.86484930e+06  4.00000000e+00  2.00000000e+00
   1.40000000e+01]
 [-7.37707827e+03  4.86484296e+06  0.00000000e+00  2.00000000e+00
   1.00000000e+01]
 ...
 [-7.40199620e+03  4.86479002e+06  0.00000000e+00  2.00000000e+00
   1.00000000e+01]
 [-7.36838610e+03  4.86476941e+06  3.00000000e+00  2.00000000e+00
   1.30000000e+01]
 [-7.64097620e+03  4.86501066e+06  0.00000000e+00  0.00000000e+00
   0.00000000e+00]]


In [9]:
dataset_train_reg_long_lgb = lgb.Dataset(x_train, label = y_train[:, 0])
dataset_validate_reg_long_lgb = lgb.Dataset(x_validate, label = y_validate[:, 0])
dataset_train_reg_lat_lgb = lgb.Dataset(x_train, label = y_train[:, 1])
dataset_validate_reg_lat_lgb = lgb.Dataset(x_validate, label = y_validate[:, 1])
dataset_train_cat_floor_lgb = lgb.Dataset(x_train, label = y_train[:, 2])
dataset_validate_cat_floor_lgb = lgb.Dataset(x_validate, label = y_validate[:, 2])
dataset_train_cat_building_lgb = lgb.Dataset(x_train, label = y_train[:, 3])
dataset_validate_cat_building_lgb = lgb.Dataset(x_validate, label = y_validate[:, 3])
dataset_train_cat_area_lgb = lgb.Dataset(x_train, label = y_train[:, 4])
dataset_validate_cat_area_lgb = lgb.Dataset(x_validate, label = y_validate[:, 4])

In [29]:
params_reg = {'learning_rate': 0.001,
              'num_boost_round': 1000,
              'max_depth': -1,
              'boosting': 'rf', 
              'objective': 'regression', 
              'metric': 'rmse', 
              #'is_training_metric': True, 
              #'num_leaves': 144,  
              #'feature_fraction': 0.9, 
              'bagging_fraction': 0.9, 
              'bagging_freq': 5,
              #'early_stopping_round': 100,
              #'seed':2018
             }
params_cat_building = {'learning_rate': 0.002,
              'num_boost_round': 500,
              'max_depth': 16,
              'boosting': 'rf',
              'objective': 'multiclass',
              'num_class': 3,
              'metric': 'multi_logloss',
              #'is_training_metric': True,
              'num_leaves': 144,
              #'feature_fraction': 0.97,
              'bagging_fraction': 0.7,
              'bagging_freq': 5,
              #'early_stopping_round': 100,
             }
params_cat_floor = {'learning_rate': 0.002,
              'num_boost_round': 500,
              'max_depth': 16,
              'boosting': 'rf',
              'objective': 'multiclass',
              'num_class': 5,
              'metric': 'multi_logloss',
              #'is_training_metric': True,
              'num_leaves': 144,
              #'feature_fraction': 0.97,
              'bagging_fraction': 0.7,
              'bagging_freq': 5,
              #'early_stopping_round': 100,
             }
params_cat_area = {'learning_rate': 0.002,
              'num_boost_round': 500,
              'max_depth': 16,
              'boosting': 'rf',
              'objective': 'multiclass',
              'num_class': 15,
              'metric': 'multi_logloss',
              #'is_training_metric': True,
              'num_leaves': 144,
              #'feature_fraction': 0.97,
              'bagging_fraction': 0.7,
              'bagging_freq': 5,
              #'early_stopping_round': 100,
             }

In [30]:
# feval(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
#def multiclass_score(preds, train_data):
#    label = train_data.get_label()
#    return 'multiclass_score', np.mean(label == preds), True

#def regression_score(preds, train_data):
#    label = train_data.get_label()
#    return 'regression_score', np.mean(euclidean_distance(label, 0, preds, 0)), False

In [31]:
model_reg_long_lgb = lgb.train(params_reg, dataset_train_reg_long_lgb, dataset_validate_reg_long_lgb, verbose_eval = 100)# early_stopping_rounds = 100)
model_reg_lat_lgb = lgb.train(params_reg, dataset_train_reg_lat_lgb, dataset_validate_reg_lat_lgb, verbose_eval = 100)#, early_stopping_rounds = 100)
model_cat_floor_lgb = lgb.train(params_cat_floor, dataset_train_cat_floor_lgb, dataset_validate_cat_floor_lgb, verbose_eval = 100)#, early_stopping_rounds = 100)
model_cat_building_lgb = lgb.train(params_cat_building, dataset_train_cat_building_lgb, dataset_validate_cat_building_lgb, verbose_eval = 100)#, early_stopping_rounds = 100)
model_cat_area_lgb = lgb.train(params_cat_area, dataset_train_cat_area_lgb, dataset_validate_cat_area_lgb, verbose_eval = 100)#, early_stopping_rounds = 100)

In [32]:
predict_long_train = model_reg_long_lgb.predict(x_train)
predict_long_validate = model_reg_long_lgb.predict(x_validate)
predict_lat_train = model_reg_lat_lgb.predict(x_train)
predict_lat_validate = model_reg_lat_lgb.predict(x_validate)
predict_floor_train = model_cat_floor_lgb.predict(x_train)
predict_floor_validate = model_cat_floor_lgb.predict(x_validate)
predict_building_train = model_cat_building_lgb.predict(x_train)
predict_building_validate = model_cat_building_lgb.predict(x_validate)
predict_area_train = model_cat_area_lgb.predict(x_train)
predict_area_validate = model_cat_area_lgb.predict(x_validate)

In [33]:
#predict_long_test = model_reg_long_lgb.predict(dataset_test.x)
#predict_lat_test = model_reg_lat_lgb.predict(dataset_test.x)
#predict_floor_test = model_cat_floor_lgb.predict(dataset_test.x)
#predict_building_test = model_cat_building_lgb.predict(dataset_test.x)
#predict_area_test = model_cat_area_lgb.predict(dataset_test.x)

In [34]:
#mse_long = sklearn.metrics.mean_squared_error(y_validate[:, 0], predict_long_test)
#mse_lat = sklearn.metrics.mean_squared_error(y_validate[:, 1], predict_lat_test)
#mse_floor = sklearn.metrics.mean_squared_error(y_validate[:, 2], predict_floor_test)
#mse_building = sklearn.metrics.mean_squared_error(y_validate[:, 3], predict_building_test)

In [35]:
print(y_train[:, 1])
print(predict_lat_train)

[4864931.4921   4864849.3028   4864842.961439 ... 4864790.0184
 4864769.4062   4865010.6595  ]
[4864932.74729769 4864836.59820129 4864835.67615176 ... 4864843.17548275
 4864777.75223911 4864997.52198648]


In [36]:
print(y_train[:, 0])
print(predict_long_train)
print(y_validate[:, 0])
print(predict_long_validate)
print(y_train[:, 2])
print(predict_floor_train.argmax(axis = 1))
print(y_validate[:, 2])
print(predict_floor_validate.argmax(axis = 1))

[-7680.1346     -7372.8029     -7377.07826999 ... -7401.9962
 -7368.3861     -7640.9762    ]
[-7650.22033727 -7373.9548471  -7371.8573325  ... -7396.09189039
 -7355.18673302 -7637.50597817]
[-7511.5215     -7535.39365217 -7348.8982     ... -7408.69525072
 -7445.55787384 -7390.7612    ]
[-7513.98022784 -7527.57676579 -7351.12691394 ... -7425.06455527
 -7425.06455527 -7373.70904444]
[0. 4. 0. ... 0. 3. 0.]
[0 4 0 ... 0 3 0]
[2. 2. 3. ... 2. 0. 3.]
[2 2 3 ... 2 0 3]


In [37]:
print(len(predict_floor_validate))

3988


In [38]:
error_building_train = []
error_building_validate = []
error_floor_train = []
error_floor_validate = []
error_area_train = []
error_area_validate = []
predict_floor_argmax_train = predict_floor_train.argmax(axis = 1)
predict_floor_argmax_validate = predict_floor_validate.argmax(axis = 1)
predict_building_argmax_train = predict_building_train.argmax(axis = 1)
predict_building_argmax_validate = predict_building_validate.argmax(axis = 1)
predict_area_argmax_train = predict_area_train.argmax(axis = 1)
predict_area_argmax_validate = predict_area_validate.argmax(axis = 1)
for i in range(len(predict_floor_train)):
    if predict_floor_argmax_train[i] != y_train[i, 2]:
        error_floor_train.append(i)
    if predict_building_argmax_train[i] != y_train[i, 3]:
        error_building_train.append(i)
    if predict_area_argmax_train[i] != y_train[i, 4]:
        error_area_train.append(i)
for i in range(len(predict_floor_validate)):
    if predict_floor_argmax_validate[i] != y_validate[i, 2]:
        error_floor_validate.append(i)
    if predict_building_argmax_validate[i] != y_validate[i, 3]:
        error_building_validate.append(i)
    if predict_area_argmax_validate[i] != y_validate[i, 4]:
        error_area_validate.append(i)

In [39]:
error_rate_floor_train = len(error_floor_train) / len(predict_floor_train)
print('error_rate_floor_train', error_rate_floor_train)
error_rate_floor_validate = len(error_floor_validate) / len(predict_floor_train)
print('error_rate_floor_validate', error_rate_floor_validate)
error_rate_building_train = len(error_building_train) / len(predict_building_train)
print('error_rate_building_train', error_rate_building_train)
error_rate_building_validate = len(error_building_validate) / len(predict_building_train)
print('error_rate_buildilng_validate', error_rate_building_validate)
error_rate_area_train = len(error_area_train) / len(predict_area_train)
print('error_rate_area_train', error_rate_area_train)
error_rate_area_validate = len(error_area_validate) / len(predict_area_validate)
print('error_rate_area_validate', error_rate_area_validate)

error_rate_floor_train 0.014609982442939554
error_rate_floor_validate 0.004201153749686481
error_rate_building_train 0.0008151492350137948
error_rate_buildilng_validate 0.0006270378730875344
error_rate_area_train 0.0218209179834462
error_rate_area_validate 0.03009027081243731


In [40]:
print(len(predict_long_train))

15948


In [41]:
error_distance_train = []
error_distance_validate = []
for i in range(len(predict_long_train)):
    error_distance_train.append(euclidean_distance(predict_lat_train[i], predict_long_train[i], y_train[i, 1], y_train[i, 0]))
for i in range(len(predict_long_validate)):
    error_distance_validate.append(euclidean_distance(predict_lat_validate[i], predict_long_validate[i], y_validate[i, 1], y_validate[i, 0]))
error_mean_distance_train = np.mean(np.stack(error_distance_train)).item()
error_max_distance_train = np.max(np.stack(error_distance_train)).item()
error_min_distance_train = np.min(np.stack(error_distance_train)).item()
error_std_distance_train = np.std(np.stack(error_distance_train)).item()
error_var_distance_train = np.var(np.stack(error_distance_train)).item()
print('error_mean_distance_train: ', error_mean_distance_train)
print('error_max_distance_train: ', error_max_distance_train)
print('error_min_distance_train: ', error_min_distance_train)
print('error_std_distance_train: ', error_std_distance_train)
print('error_var_distance_train: ', error_var_distance_train)
error_mean_distance_validate = np.mean(np.stack(error_distance_validate)).item()
error_max_distance_validate = np.max(np.stack(error_distance_validate)).item()
error_min_distance_validate = np.min(np.stack(error_distance_validate)).item()
error_std_distance_validate = np.std(np.stack(error_distance_validate)).item()
error_var_distance_validate = np.var(np.stack(error_distance_validate)).item()
print('error_mean_distance_validate: ', error_mean_distance_validate)
print('error_max_distance_validate: ', error_max_distance_validate)
print('error_min_distance_validate: ', error_min_distance_validate)
print('error_std_distance_validate: ', error_std_distance_validate)
print('error_var_distance_validate: ', error_var_distance_validate)

error_mean_distance_train:  14.419452822140073
error_max_distance_train:  159.48982321538642
error_min_distance_train:  0.33754468490026307
error_std_distance_train:  10.925451303028265
error_var_distance_train:  119.365486174842
error_mean_distance_validate:  14.64772223117856
error_max_distance_validate:  159.48982321538642
error_min_distance_validate:  0.37801860729130154
error_std_distance_validate:  11.413470507630771
error_var_distance_validate:  130.2673090285574
