In [None]:
# Python >= 3.5 is required
import sys
assert sys.version_info >= (3,5)

# Scikit-Learn >= 0.20 is required
import sklearn 
assert sklearn.__version__ >= "0.20"

# common imports
import pandas as pd
import numpy as np 
import glob
import os 
from numpy import genfromtxt

PROJECT_ROOT_DIR = "."
DATASET_PATH = os.path.join(PROJECT_ROOT_DIR, "data")

## Part 1. <font color=green>Data Loading</font>

### The dataset is split into 3 parts (train, validation and test) and stored in a directory located at DATASET_PATH
The data is stored in the csv format. Each split contains some number of csv files. Each csv file in the TRAIN and VALIDATION splits contains 223 columns: the first 220 represent the access points from which the sensor data was received; the last 3 are the ground truth values on the x,y,z location. Note, there is NO header with column names in any of the csv files.

In [None]:
# path to csv files
path_to_train_csvs = os.path.join(DATASET_PATH, "train")
path_to_val_csvs = os.path.join(DATASET_PATH, "val")
path_to_test_csvs = os.path.join(DATASET_PATH, "test")

In [None]:
path_to_train_csvs

'./data/train'

In [None]:
# read one of the train csv files, note it does not contain a header with column names
some_csv = pd.read_csv(os.path.join(path_to_train_csvs, "1.csv"), names = list(range(0, 220))+['x', 'y', 'z'])
some_csv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,213,214,215,216,217,218,219,x,y,z
0,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.016694,7.741971,1.801771
1,-35.0,-35.0,-35.0,-41.0,-41.0,-70.0,-54.0,-69.0,-100.0,-78.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.042433,7.929269,1.801771
2,-30.0,-36.0,-32.0,-40.0,-35.0,-61.0,-55.0,-57.0,-62.0,-79.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-0.039225,8.876047,1.801771
3,-41.0,-42.0,-43.0,-42.0,-41.0,-56.0,-53.0,-52.0,-64.0,-77.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.032879,9.865795,1.801771
4,-44.0,-44.0,-44.0,-35.0,-36.0,-53.0,-61.0,-61.0,-64.0,-77.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.629612,10.710077,1.801771


In [None]:
# read one of the train csv files, note it does not contain a header with column names
some_csv = pd.read_csv(os.path.join(path_to_train_csvs, "1.csv"), names = list(range(0, 220))+['x', 'y', 'z'])
some_csv.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,213,214,215,216,217,218,219,x,y,z
80,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,59.534868,37.415831,1.801771
81,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,60.95248,37.470104,1.801771
82,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,62.354246,37.5878,1.801771
83,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,63.527279,37.550523,1.801771
84,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,64.719473,37.583825,1.801771


Each csv file in the TEST split contains only 220 columns that represent the access points from which the sensor data was received; there are NO columns containing the ground truth values on the x,y,z location. 

In [None]:
# read one of the test csv files, note it does not contain a header with column names
some_csv = pd.read_csv(os.path.join(path_to_test_csvs, "1.csv"), header=None)
some_csv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,-87.0,-84.0,-84.0,-85.0,-82.0,-63.0,-59.0,-59.0,-74.0,-62.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
1,-85.0,-83.0,-84.0,-80.0,-77.0,-61.0,-64.0,-65.0,-69.0,-60.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
2,-86.0,-83.0,-85.0,-82.0,-82.0,-63.0,-56.0,-59.0,-79.0,-65.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
3,-100.0,-100.0,-100.0,-83.0,-83.0,-70.0,-64.0,-66.0,-84.0,-72.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
4,-100.0,-100.0,-100.0,-88.0,-86.0,-69.0,-78.0,-74.0,-85.0,-74.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0


### Task 1. Write a function called <font color=blue>build_feats</font>, that constructs only a feature matrix.
The function takes only one parameter, a string containing the path to the data (csv files)
and returns a feature matrix of type np.ndarray. Note, that the csv files are numbered in order from 1 to N, where N is the number of csv files in a split, so when you are accumulating the features, make sure to go through the csv files in that order.
The function will be used to build test data.

In [46]:
path_to_test_csvs

'./data/test'

In [193]:
def build_feats(path_to_csvs):
    
    my_data = genfromtxt(f'{path_to_csvs}/1.csv', delimiter=',')
    
    my_data2 = genfromtxt(f'{path_to_csvs}/2.csv', delimiter=',')
    
    feats = np.concatenate((my_data, my_data2), axis=0)

    
    files = os.path.join(path_to_csvs, "*.csv")

    files = glob.glob(files)
    
    files.remove(f'{path_to_csvs}/1.csv')
    files.remove(f'{path_to_csvs}/2.csv')
    
    for file in files:
        my_data = genfromtxt(file, delimiter=',')
        feats = np.concatenate((feats, my_data), axis=0)
    
    return feats

In [194]:


feats_test = build_feats(path_to_test_csvs)




# verify that the returned value is of type numpy.ndarray
assert(isinstance(feats_test, np.ndarray))



### Task 2. Write a function called <font color=blue>build_feats_targets</font>, that constructs a feature matrix and the corresponding target matrix. 
The function takes only one parameter, a string containing the path to the data (csv files),
and returns a tuple containing the feature matrix and the target matrix, both of type numpy.ndarray. Remember, the csv files contain both features and target values. The function should read the data from each csv file and concatenate the corresponding portions.

The function will be used to build train and validation data.


In [178]:
def build_feats_targets(path_to_csvs):
    
    my_data = genfromtxt(f'{path_to_csvs}/1.csv', delimiter=',')
    
    my_data2 = genfromtxt(f'{path_to_csvs}/2.csv', delimiter=',')
    
    data_set = np.concatenate((my_data, my_data2), axis=0)
    
    files = os.path.join(path_to_csvs, "*.csv")

    files = glob.glob(files)
    
    files.remove(f'{path_to_csvs}/1.csv')
    files.remove(f'{path_to_csvs}/2.csv')
    
    for file in files:
        my_data = genfromtxt(file, delimiter=',')
        data_set = np.concatenate((data_set, my_data), axis=0)

    feats = data_set[:, :-3]
    targets = data_set[:, -3:]
    
    
    
    return feats, targets

In [192]:
train_data = build_feats_targets(path_to_train_csvs)

# verify that the returned value is indeed a tuple
assert(isinstance(train_data, tuple))

feats_train, targets_train = train_data


# verify that the returned tuple elements are indeed numpy.ndarray
assert(isinstance(feats_train, np.ndarray))
assert(isinstance(targets_train, np.ndarray))


# verify dimensions of the returned feature matrix and a target matrix
# assert(feats_train.shape == (6049,220))
# assert(targets_train.shape == (6049,3))


In [180]:
val_data = build_feats_targets(path_to_val_csvs)

# test that the returned value is indeed a tuple
assert(isinstance(val_data, tuple))

feats_val, targets_val = val_data



# verify that the returned tuple elements are indeed numpy.ndarray
assert(isinstance(feats_val, np.ndarray))
assert(isinstance(targets_val, np.ndarray))

# verify dimensions of the returned feature matrix and a target matrix
# assert(feats_val.shape == (1976,220))
# assert(targets_val.shape == (1976,3))

## Part 2. <font color=green>Predicting of a user's coordinates using randrom forest regression</font>


Random forest is a supervised learning algorithm which can be utilized for both classification and regression problems. It combines multiple decision trees that train on various subsets of given data. The trees are run independently from each other, and in parallel. The output of the random forest is the mode of the class predictions or the mean of the value predictions, for classification and regression tasks respectively.

Today we will use a random forest regressor for indoor localization by predicting the x,y,z coordinate values of a user based on the 220 signal values available at that point.


See the documentation on the list of parameters to tune https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [181]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# can be updated to include a greater variety of parameters and their values to explore
param_grid = [
    {'bootstrap': [False], 'n_estimators': [10, 20], 'max_features': [5, 10]},
  ]

# please fix the value of random_state to 42 for reproducibility
forest_reg = RandomForestRegressor(n_jobs=10, random_state=42)

# we recommend using the exhaustive search over specified parameter values,
# but feel free to explore other approaches
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(feats_train, targets_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=10, random_state=42),
             param_grid=[{'bootstrap': [False], 'max_features': [5, 10],
                          'n_estimators': [10, 20]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [182]:
# you can check out the result details in a dataframe format
cvres = grid_search.cv_results_
pd.set_option("max_colwidth", 80)
df_cvres = pd.DataFrame(cvres)
df_cvres[["params", "mean_test_score", "std_test_score"]]


Unnamed: 0,params,mean_test_score,std_test_score
0,"{'bootstrap': False, 'max_features': 5, 'n_estimators': 10}",-3.190576,0.051042
1,"{'bootstrap': False, 'max_features': 5, 'n_estimators': 20}",-2.658822,0.134583
2,"{'bootstrap': False, 'max_features': 10, 'n_estimators': 10}",-2.710603,0.133709
3,"{'bootstrap': False, 'max_features': 10, 'n_estimators': 20}",-2.352558,0.1133


### <font color=red>NOTE !<font> 
Scikit-Learn’s cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), which is why the preceding code computes -scores before calculating the square root.

    
From Geron, A. (2019). Hands-on machine learning with Scikit-Learn, Keras and TensorFlow: concepts, tools, and techniques to build intelligent systems (2nd ed.). O’Reilly.

In [183]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("MSE={} for model parameters {}".format(np.sqrt(-mean_score), params))

MSE=1.7862182962404805 for model parameters {'bootstrap': False, 'max_features': 5, 'n_estimators': 10}
MSE=1.6305895373260344 for model parameters {'bootstrap': False, 'max_features': 5, 'n_estimators': 20}
MSE=1.6463908244888323 for model parameters {'bootstrap': False, 'max_features': 10, 'n_estimators': 10}
MSE=1.5338051096056087 for model parameters {'bootstrap': False, 'max_features': 10, 'n_estimators': 20}


In [184]:
# or simply retrieve the parameters that gave the best score
grid_search.best_params_

{'bootstrap': False, 'max_features': 10, 'n_estimators': 20}

### Task 3. Write a function called <font color=blue>mean_error_dist</font>
The function takes two parameters, the targets (or ground truth values) and the predictions,
and returns the mean error distance.


mean error distance $=\huge\frac{\sum_{i}^Ndist(p_i,\hat{p_i})}{N}$

Where $\large N$ is the total number of samples, $\large dist(p_i,\hat{p_i})$ is an Euclidean distance between i-th reference position $\large p_i$ and it's predicted position $\large \hat{p_i}$

In [185]:
def mean_error_dist(targets, preds):
    sum = 0
    for i in range(len(targets)):
        sum += np.linalg.norm(targets[i] - preds[i])
    dist = sum / len(targets)
        
    
    return dist


The target values for the test set are withheld. Instead, you are given the validation set to verify your best estimator.

In [186]:
from sklearn.metrics import mean_squared_error

forest_preds = grid_search.best_estimator_.predict(feats_val)

forest_med = mean_error_dist(targets_val, forest_preds)

forest_mse = mean_squared_error(targets_val, forest_preds)
forest_rmse = np.sqrt(forest_mse)

print("Random Forest. RMSE: {:.3f}, MSE: {:.3f}, MED: {:.3f}".format(forest_rmse, forest_mse, forest_med))

Random Forest. RMSE: 1.470, MSE: 2.162, MED: 1.997


### Feel free to experiment with the parameters or come up with a different approach.  Email us (<font color=blue>issai@nu.edu.kz</font>) the predictions of your best estimator on the test features AND your solutions to see how well you did !  The best mean error distance we got so far is 1.44.
<font color=red> Please don't forget that the target values of test set are stacked in the order of csv files. If your test features do not follow this order, your result will be ruined.</font>

In [187]:
name = "John" # change to your first name
surname = "Snow" # change to your lastname
forest_preds = grid_search.best_estimator_.predict(feats_test)

# email your csv file to issai@nu.edu.kz
pd.DataFrame(forest_preds).to_csv("{}_{}.csv".format(name, surname), header=None, index=None)