# CapStone Project
Name: Richard Valades
Updated: 2022.12.13

# The goal of this project is to create and evaluate 4 Machine Learning Models to make predictions on location based on wi-fi signal data 

In [2]:
# Based on the wi-fi signals project that was created in R.
# This project follows a similar process with the learnings from that project being applied in a Python environment.

# Import Packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Load the dataset
LocData = pd.read_csv('TrainingData.csv', index_col=False)
print(LocData.shape)

(19937, 529)


In [4]:
#Examine the features of the data
LocData.head()

Unnamed: 0,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP,WAP001,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,-7541.2643,4864920.778,2,1,106,2,2,23,1371713733,100,...,100,100,100,100,100,100,100,100,100,100
1,-7536.6212,4864934.225,2,1,106,2,2,23,1371713691,100,...,100,100,100,100,100,100,100,100,100,100
2,-7519.1524,4864949.532,2,1,103,2,2,23,1371714095,100,...,100,100,100,100,100,100,100,100,100,100
3,-7524.5704,4864934.093,2,1,102,2,2,23,1371713807,100,...,100,100,100,100,100,100,100,100,100,100
4,-7632.1436,4864982.217,0,0,122,2,11,13,1369909710,100,...,100,100,100,100,100,100,100,100,100,100


In [5]:
#Remove unnecessary features
LocData = LocData.drop(columns=['USERID','PHONEID','LONGITUDE','LATITUDE','TIMESTAMP'])

In [6]:
LocData.head()

Unnamed: 0,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,2,1,103,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,2,1,102,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,0,0,122,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


In [7]:
#Check data types
LocData.dtypes

FLOOR               int64
BUILDINGID          int64
SPACEID             int64
RELATIVEPOSITION    int64
WAP001              int64
                    ...  
WAP516              int64
WAP517              int64
WAP518              int64
WAP519              int64
WAP520              int64
Length: 524, dtype: object

In [8]:
#Change data type to string to combine columns for dependent variable
LocData = LocData.astype(str)

In [9]:
#Confirm data types
LocData.dtypes

FLOOR               object
BUILDINGID          object
SPACEID             object
RELATIVEPOSITION    object
WAP001              object
                     ...  
WAP516              object
WAP517              object
WAP518              object
WAP519              object
WAP520              object
Length: 524, dtype: object

In [10]:
#combine columns to create unique location ID as depedent variable
LocData['POSITIONID'] = LocData['FLOOR'] + LocData['BUILDINGID'] + LocData['SPACEID'] + LocData['RELATIVEPOSITION']

In [11]:
#Confirm new object is created
LocData.head()

Unnamed: 0,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,...,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,POSITIONID
0,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
1,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
2,2,1,103,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211032
3,2,1,102,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211022
4,0,0,122,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,1222


In [12]:
#Change data types back to integer for modeling
LocData = LocData.astype(np.int64)

In [13]:
#Confirm data types
LocData.dtypes

FLOOR               int64
BUILDINGID          int64
SPACEID             int64
RELATIVEPOSITION    int64
WAP001              int64
                    ...  
WAP517              int64
WAP518              int64
WAP519              int64
WAP520              int64
POSITIONID          int64
Length: 525, dtype: object

In [14]:
#Confirm features
LocData.head()

Unnamed: 0,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,...,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,POSITIONID
0,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
1,2,1,106,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
2,2,1,103,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211032
3,2,1,102,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211022
4,0,0,122,2,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,1222


In [15]:
# Verify shape of data after feature engineering
LocData.shape

(19937, 525)

In [16]:
print(LocData)

       FLOOR  BUILDINGID  SPACEID  RELATIVEPOSITION  WAP001  WAP002  WAP003  \
0          2           1      106                 2     100     100     100   
1          2           1      106                 2     100     100     100   
2          2           1      103                 2     100     100     100   
3          2           1      102                 2     100     100     100   
4          0           0      122                 2     100     100     100   
...      ...         ...      ...               ...     ...     ...     ...   
19932      3           1        1                 2     100     100     100   
19933      1           2      140                 2     100     100     100   
19934      3           1       13                 2     100     100     100   
19935      3           1      113                 2     100     100     100   
19936      3           1      112                 2     100     100     100   

       WAP004  WAP005  WAP006  ...  WAP512  WAP513 

In [17]:
# Subset data to one building
LocData = LocData[LocData['BUILDINGID'] == 1]

In [18]:
# Confirm only building 1 is present in dataset
print(LocData)

       FLOOR  BUILDINGID  SPACEID  RELATIVEPOSITION  WAP001  WAP002  WAP003  \
0          2           1      106                 2     100     100     100   
1          2           1      106                 2     100     100     100   
2          2           1      103                 2     100     100     100   
3          2           1      102                 2     100     100     100   
5          2           1      105                 2     100     100     100   
...      ...         ...      ...               ...     ...     ...     ...   
19930      3           1        4                 2     100     100     100   
19932      3           1        1                 2     100     100     100   
19934      3           1       13                 2     100     100     100   
19935      3           1      113                 2     100     100     100   
19936      3           1      112                 2     100     100     100   

       WAP004  WAP005  WAP006  ...  WAP512  WAP513 

In [19]:
LocData.shape

(5196, 525)

In [20]:
# Remove unnecessary features
LocData = LocData.drop(columns=['FLOOR','BUILDINGID','SPACEID','RELATIVEPOSITION'])

In [21]:
# Confirm dataset after feature elimination
LocData.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,POSITIONID
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211062
2,100,100,100,100,100,100,100,-97,100,100,...,100,100,100,100,100,100,100,100,100,211032
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211022
5,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,211052


# Set seed and create objects for Train/Test Sets

In [22]:
#Set random sead
seed = 123

In [23]:
# Select Features and Dependent variable
Y_Location = LocData['POSITIONID']
X_Location = LocData.iloc[:,0:520]

In [25]:
# Check that IV columns are correct
X_Location.head()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP511,WAP512,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,100,100,-97,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
5,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


# Create Train and Test Sets

In [26]:
#Split data into training and test sets with 70/30 split

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_Location,
                                            Y_Location,
                                            test_size = .30,
                                            random_state = seed)

print(X_Train.shape, X_Test.shape)
print(Y_Train.shape, Y_Test.shape)

(3637, 520) (1559, 520)
(3637,) (1559,)


# Create Models

In [27]:
# select classification algorithms 

algos = []
algos.append(('Random Forest Classifier', RandomForestClassifier()))
algos.append(('Gradient Boosting Classifier', GradientBoostingClassifier()))
algos.append(('Decision Tree Classifier', DecisionTreeClassifier()))
algos.append(('KNeighbors Classifier', KNeighborsClassifier()))  

In [29]:
# build models

results = []
names = []

for name, model in algos:
    result = cross_val_score(model, X_Train, Y_Train, cv=4)
    names.append(name)
    results.append(result)

In [30]:
# evaluate results

for i in range(len(names)):
    print(names[i],results[i].mean())

Random Forest Classifier 0.8325541894848825
Gradient Boosting Classifier 0.5691527944003192
Decision Tree Classifier 0.6406399980657407
KNeighbors Classifier 0.5993934283538244


# Validate

In [31]:
#Select best model(s) to tune and validate with the test set

bestAlgo = RandomForestClassifier()
bestAlgopred = bestAlgo.fit(X_Train, Y_Train).predict(X_Test)
print(classification_report(Y_Test, bestAlgopred))
print(confusion_matrix(Y_Test, bestAlgopred))

              precision    recall  f1-score   support

         111       1.00      1.00      1.00         3
         112       0.83      1.00      0.91         5
         132       1.00      0.86      0.92         7
         162       1.00      1.00      1.00         3
         191       0.75      1.00      0.86         3
         192       0.75      0.75      0.75         8
        1101       0.20      0.50      0.29         2
        1102       1.00      0.88      0.93         8
        1122       1.00      1.00      1.00         3
        1132       1.00      0.33      0.50         3
        1142       0.78      1.00      0.88         7
        1152       1.00      1.00      1.00         5
        1162       1.00      0.86      0.92         7
        1172       1.00      1.00      1.00         3
        1182       0.62      0.62      0.62         8
        2121       1.00      0.75      0.86         4
        2122       1.00      1.00      1.00         3
        2131       1.00    

# Predict

In [32]:
predictions = bestAlgo.predict(X_Test)

In [33]:
print(predictions)

[  2192  12032 212032 ... 111071  11192   1101]
