In [1]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import lightgbm as lgb

class UJIDataset(torch.utils.data.Dataset):
    def __init__(self, root, train = True, transform = None, target_transform = None, download = False):
        self.root = root
        dir_path = self.root + '/UJIndoorLoc'
        zip_path = self.root + '/uji_uil.zip'
        dataset_training_file = dir_path + '/trainingData.csv'
        dataset_validation_file = dir_path + '/validationData.csv'
        # Load independent variables (WAPs values)
        if train:
            dataset_file = dataset_training_file
        else:
            dataset_file = dataset_validation_file
        file = open(dataset_file, 'r')
        # Load labels
        label = file.readline()
        label = label.split(',')
        # Load independent variables
        file_load = np.loadtxt(file, delimiter = ',', skiprows = 1)
        #file_load_label = np.loadtxt(file, delimiter = ',')
        #data = np.genfromtxt(file, dtype = float, delimiter = ',', names = True)
        # RSSI values
        self.x = file_load[:, 0 : 520]
        # Load dependent variables
        self.y = file_load[:, 520 : 524]
        # Divide labels into x and y
        self.x_label = label[0 : 520]
        self.x_label = np.concatenate([self.x_label, label[524: 529]])
        self.y_label = label[520 : 524]
        # Regularization of independent variables
        self.x[self.x == 100] = np.nan    # WAP not detected
        self.x = self.x + 104             # Convert into positive values
        self.x = self.x / 104             # Regularize into scale between 0 and 1
        # Building ID, Space ID, Relative Position, User ID, Phone ID and Timestamp respectively
        self.x = np.concatenate([self.x, file_load[:, 524 : 529]], axis = 1)
        file.close()
        # Reduce the number of dependent variables by combining building number and floor into one variable: area
        area = self.y[:, 3] * 5 + self.y[:, 2]
        self.y = np.column_stack((self.y, area))
    def to_tensor(self):
        self.x = torch.from_numpy(self.x).float()
        self.y = torch.from_numpy(self.y).float()
        self.area = torch.from_numpy(self.area).float()
    def nan_to_zero(self):
        self.x = np.nan_to_num(self.x)
    # Return the target instance (row)
    def __getitem__(self, index_row):
        return self.x[index_row, :], self.y[index_row, :]
    # Return the number of instances (the number of rows)
    def __len__(self, dim = 0):
        return int(self.x.size()[0])

In [2]:
# Calculate Euclidean distance (unit: meter) between two coordinates in EPSG:3857 
def euclidean_distance(latitude_1, longitude_1, latitude_2, longitude_2):
    return np.sqrt((latitude_1 - latitude_2)**2 + (longitude_1 - longitude_2)**2)

In [3]:
# Load training dataset
dataset_train = UJIDataset('./data', train = True)
dataset_test = UJIDataset('./data', train = False)
#gb_train_data = lgb.Dataset(

In [4]:
x_train, x_validate, y_train, y_validate = sklearn.model_selection.train_test_split(dataset_train.x, dataset_train.y, test_size = 0.2, random_state = 42)
x_test1, x_test2, y_test1, y_test2 = sklearn.model_selection.train_test_split(dataset_test.x, dataset_test.y, test_size = 0.5, random_state = 42)

In [5]:
dataset_train_reg_long_lgb = lgb.Dataset(x_train, label = y_train[:, 0])
dataset_validate_reg_long_lgb = lgb.Dataset(x_validate, label = y_validate[:, 0])
dataset_train_reg_lat_lgb = lgb.Dataset(x_train, label = y_train[:, 1])
dataset_validate_reg_lat_lgb = lgb.Dataset(x_validate, label = y_validate[:, 1])
dataset_train_cat_floor_lgb = lgb.Dataset(x_train, label = y_train[:, 2])
dataset_validate_cat_floor_lgb = lgb.Dataset(x_validate, label = y_validate[:, 2])
dataset_train_cat_building_lgb = lgb.Dataset(x_train, label = y_train[:, 3])
dataset_validate_cat_building_lgb = lgb.Dataset(x_validate, label = y_validate[:, 3])
dataset_train_cat_area_lgb = lgb.Dataset(x_train, label = y_train[:, 4])
dataset_validate_cat_area_lgb = lgb.Dataset(x_validate, label = y_validate[:, 4])

In [6]:
params_lgbr = {
              'boosting_type': 'rf',
              'bagging_freq': 5,
              'bagging_fraction': 0.8,
              'num_leaves': 1440,  
              'learning_rate': 0.001,
              'max_depth': -1,
              'objective': 'regression'
              #'seed':2018
              }

params_lgbr_long_fit = {'eval_set': [(x_validate, y_validate[:, 0])],
                        'eval_names': ['evalset_long'],
                        'eval_metric': ['rmse'],
                        #'early_stoppping_rounds': [100],
                       }

params_lgbr_lat_fit = {'eval_set': [(x_validate, y_validate[:, 1])],
                        'eval_names': ['evalset_lat'],
                        'eval_metric': ['rmse'],
                        #'early_stoppping_rounds': [100],
                       }

params_lgbc = {'boosting_type': 'rf',
               'bagging_freq': 5,
               'bagging_fraction': 0.8,
               'num_leaves': 144,  
               'learning_rate': 0.002,
               'max_depth': -1,
               'objective': 'multiclass'
              }

params_lgbc_floor_fit = {'eval_set': [(x_validate, y_validate[:, 2])],
                         'eval_names': ['evalset_floor'],
                         'eval_metric': ['multi_logloss'],
                         #'early_stopping_rounds': [100],
                        }

params_lgbc_building_fit = {'eval_set': [(x_validate, y_validate[:, 3])],
                            'eval_names': ['evalset_building'],
                            'eval_metric': ['multi_logloss'],
                            #'early_stoppping_rounds': [100],
                           }



params_lgbc_area_fit = {'eval_set': [(x_validate, y_validate[:, 4])],
                        'eval_names': ['evalset_area'],
                        'eval_metric': ['multi_logloss'],
                        #'early_stopping_rounds': [100],
                       }

In [18]:
print(y_validate[:,4])

[ 7.  7. 13. ...  7.  5. 13.]


In [16]:
lgbr_long = lgb.LGBMRegressor(**params_lgbr)
lgbr_lat = lgb.LGBMRegressor(**params_lgbr)
lgbc_floor = lgb.LGBMClassifier(**params_lgbc)
lgbc_building = lgb.LGBMClassifier(**params_lgbc)
lgbc_area = lgb.LGBMClassifier(**params_lgbc)

model_reg_long_lgb_fit = lgbr_long.fit(X = x_train, y = y_train[:, 0], **params_lgbr_long_fit, early_stopping_rounds = 100)
model_reg_lat_lgb_fit = lgbr_lat.fit(X = x_train, y = y_train[:, 1], **params_lgbr_lat_fit, early_stopping_rounds = 100)
model_cat_floor_lgb_fit = lgbc_floor.fit(X = x_train, y = y_train[:, 2], **params_lgbc_floor_fit, early_stopping_rounds = 100)
model_cat_building_lgb_fit = lgbc_building.fit(X = x_train, y = y_train[:, 3], **params_lgbc_building_fit, early_stopping_rounds = 100)
model_cat_area_lgb_fit = lgbc_area.fit(X = x_train, y = y_train[:, 4], **params_lgbc_area_fit, early_stopping_rounds = 100)

[1]	evalset_long's rmse: 7.74875	evalset_long's l2: 60.0432
Training until validation scores don't improve for 100 rounds
[2]	evalset_long's rmse: 7.74875	evalset_long's l2: 60.0432
[3]	evalset_long's rmse: 7.74875	evalset_long's l2: 60.0432
[4]	evalset_long's rmse: 7.74875	evalset_long's l2: 60.0432
[5]	evalset_long's rmse: 7.74875	evalset_long's l2: 60.0432
[6]	evalset_long's rmse: 7.10206	evalset_long's l2: 50.4393
[7]	evalset_long's rmse: 6.78125	evalset_long's l2: 45.9853
[8]	evalset_long's rmse: 6.63028	evalset_long's l2: 43.9606
[9]	evalset_long's rmse: 6.57005	evalset_long's l2: 43.1655
[10]	evalset_long's rmse: 6.55901	evalset_long's l2: 43.0207
[11]	evalset_long's rmse: 6.41496	evalset_long's l2: 41.1517
[12]	evalset_long's rmse: 6.34208	evalset_long's l2: 40.2219
[13]	evalset_long's rmse: 6.31513	evalset_long's l2: 39.8809
[14]	evalset_long's rmse: 6.31773	evalset_long's l2: 39.9137
[15]	evalset_long's rmse: 6.33916	evalset_long's l2: 40.185
[16]	evalset_long's rmse: 6.24881

ValueError: y contains previously unseen labels: [4, 9]

In [15]:
predict_long_test1_fit = model_reg_long_lgb_fit.predict(x_test1)
predict_lat_test1_fit = model_reg_lat_lgb_fit.predict(x_test1)
predict_floor_test1_fit = model_cat_floor_lgb_fit.predict(x_test1)
predict_building_test1_fit = model_cat_building_lgb_fit.predict(x_test1)
predict_area_test1_fit = model_cat_area_lgb_fit.predict(x_test1)

In [13]:
print(predict_building_test1_fit)

[ 7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  8.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7. 14.  7.  7.  7.  7. 14.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7. 14.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  0.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  0.  7.  7.  7.  7.  7.
  7.  7.  8.  7.  7.  7.  7.  7.  7.  7.  7.  7. 14. 14.  3.  7.  7.  7.
  7.  7.  7. 14.  7.  7.  0.  7.  7.  7.  7.  7.  7. 14.  7.  8.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  8.  7.  7.  7.  7. 14.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  8.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7

In [9]:
error_building_test1_fit = []
error_floor_test1_fit = []
error_area_test1_fit = []
predict_floor_argmax_test1_fit = predict_floor_test1_fit.argmax(axis = 1)
predict_building_argmax_test1_fit = predict_building_test1_fit.argmax(axis = 1)
predict_area_argmax_test1_fit = predict_area_test1_fit.argmax(axis = 1)
for i in range(len(predict_floor_test1_fit)):
    if predict_floor_argmax_test1_fit[i] != y_test1[i, 2]:
        error_floor_test1_fit.append(i)
    if predict_building_argmax_test1_fit[i] != y_test1[i, 3]:
        error_building_test1_fit.append(i)
    if predict_area_argmax_test1_fit[i] != y_test1[i, 4]:
        error_area_test1_fit.append(i)

AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
error_rate_floor_test1_fit = len(error_floor_test1_fit) / len(predict_floor_test1_fit)
print('error_rate_floor_test1_fit: ', error_rate_floor_test1_fit)
error_rate_building_test1_fit = len(error_building_test1_fit) / len(predict_building_test1_fit)
print('error_rate_building_test1_fit: ', error_rate_building_test1_fit)
error_rate_area_test1_fit = len(error_area_test1_fit) / len(predict_area_test1_fit)
print('error_rate_area_test1_fit', error_rate_area_test1_fit)

In [None]:
error_distance_test1_fit = []
for i in range(len(predict_long_test1_fit)):
    error_distance_test1_fit.append(euclidean_distance(predict_lat_test1_fit[i], predict_long_test1_fit[i], y_test1[i, 1], y_test1[i, 0]))
error_mean_distance_test1_fit = np.mean(np.stack(error_distance_test1_fit)).item()
error_max_distance_test1_fit = np.max(np.stack(error_distance_test1_fit)).item()
error_min_distance_test1_fit = np.min(np.stack(error_distance_test1_fit)).item()
error_std_distance_test1_fit = np.std(np.stack(error_distance_test1_fit)).item()
error_var_distance_test1_fit = np.var(np.stack(error_distance_test1_fit)).item()
print('error_mean_distance_test1_fit: ', error_mean_distance_test1_fit)
print('error_max_distance_test1_fit: ', error_max_distance_test1_fit)
print('error_min_distance_test1_fit: ', error_min_distance_test1_fit)
print('error_std_distance_test1_fit: ', error_std_distance_test1_fit)
print('error_var_distance_test1_fit: ', error_var_distance_test1_fit)