In [50]:
#necessary Libraries
import numpy as np
import pandas as pd
import time
import pprint

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
# magic word for producing visualizations in notebook
%matplotlib inline

#Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA 
from scipy.sparse import lil_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Models
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

#Scoring Metrics
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import accuracy_score

In [51]:
# Load the raw access-point readings from CSV into `ap_data`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine as-is — consider a configurable
# data directory.
file_path = r'C:\Users\Thesis2.0\django_thesis\KNN Algorithm\ap_data_2.csv'
ap_data = pd.read_csv(file_path)

In [52]:
# Report the dimensions of the loaded frame: column count first, then row count.
num_rows, num_cols = ap_data.shape
for template, count in (
    ('Number of columns: {}', num_cols),
    ('Number of rows: {}', num_rows),
):
    print(template.format(count))

Number of columns: 9
Number of rows: 11816


In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments describe() reports only the numeric columns, which is
# why text columns such as 'mac_address' do not appear in the output.
ap_data.describe()

Unnamed: 0,channel,signal_strength,floorid,latitude,longitude
count,11816.0,11816.0,11816.0,11816.0,11816.0
mean,2433.421378,-70.242891,137.27065,7.06578,125.596436
std,17.130114,11.85718,39.880917,4e-05,3.4e-05
min,2412.0,-104.0,1.0,7.065631,125.59635
25%,2417.0,-79.0,119.0,7.065763,125.596407
50%,2437.0,-70.0,143.0,7.065786,125.596433
75%,2447.0,-61.0,166.0,7.065806,125.596465
max,2462.0,-8.0,189.0,7.065841,125.596502


In [54]:
# Keep the column labels as a NumPy array and display them as the cell output.
col_names = ap_data.columns.values
col_names

array(['mac_address', 'ssid', 'channel', 'source', 'signal_strength',
       'floorid', 'latitude', 'longitude', 'timestamp'], dtype=object)

In [55]:
# Count missing values per column (NaN/None are counted by isnull()).
missing_values_count = ap_data.isnull().sum()
#uncomment this if you want to see the count of missing data per column
#missing_values_count

# Total number of cells (rows * columns). DataFrame.size replaces np.product,
# which is deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = ap_data.size
total_missing = missing_values_count.sum()

# percent of data that is missing
missing_percent = (total_missing/total_cells) * 100

print('Percent of missing data = {}%'.format(missing_percent))

Percent of missing data = 0.0%


In [56]:
# Strip every 'C' character from the 'ssid' strings. The 'ssid' column is
# overwritten; no new column is added and 'source' is not touched.
ap_data['ssid'] = ap_data['ssid'].str.replace('C', '')

# Convert 'ssid' to a numeric dtype. errors='coerce' turns any value that still
# fails to parse into NaN instead of raising.
# NOTE(review): the missing-value check sits in an earlier cell, so NaNs
# introduced by this coercion would not be counted there.
ap_data['ssid'] = pd.to_numeric(ap_data['ssid'], errors='coerce')

In [57]:
# Collect the distinct values of the identifying columns in one pass,
# in the same order as the names on the left-hand side.
unique_mac_address, unique_ssid, unique_channel, unique_floorid = (
    ap_data[column].unique()
    for column in ("mac_address", "ssid", "channel", "floorid")
)

# Print each set of distinct values with its label.
print('Unique Mac Address : {}'.format(unique_mac_address))
print('Unique SSID : {}'.format(unique_ssid))
print('Unique Channel : {}'.format(unique_channel))
print('Unique Floor ID : {}'.format(unique_floorid))


Unique Mac Address : ['6A:BD:12:5B:D6:64' 'F6:CE:87:F2:06:21' 'A2:89:5E:B6:E7:58'
 '02:9D:2F:8D:49:90' 'FE:47:AD:D7:13:E2' '56:3A:A2:F8:0C:63'
 '7A:6B:C2:5A:7B:88' '52:39:94:90:76:D2' '1E:03:B6:E0:9E:3C'
 '56:DE:9D:83:4D:C6' 'B6:6A:AD:C1:CF:19' 'E6:4C:39:FC:36:8B'
 'BE:7E:CC:35:1C:46' '7A:44:1F:B5:90:E3' '22:95:8E:C1:1D:93']
Unique SSID : [11  5  7  6  1  3  8 10  2  9  4]
Unique Channel : [2462 2457 2452 2422 2412 2417 2437 2432 2447 2442 2427]
Unique Floor ID : [121 101 102 103 104 105 106 107 108 109 110 111 112 117 118 119 120 127
 128 129 130 137 138 139 140 147 148 149 150 157 158 159 160 167 168 169
 170 122 123 124 125 126 131 132 133 134 135 136 141 142 143 144 145 146
 151 152 153 154 155 156 161 162 163 164 165 166 171 172 173 174 175 176
 177 178 179 180 181 182 183 184 185 186 187 188 189   1   2   3   4   5
   6   7   8   9  10  11  12  13  14  15  16  17  18]


In [None]:
# Preprocess the training data set

In [95]:
# Alias the loaded frame as the training data. This binds a second name to the
# same DataFrame object (no copy is made, nothing is read from disk), so
# in-place changes made through either name are visible through both.
trainingData = ap_data

# Dump the frame as plain text (long frames are shown with the middle elided).
print(trainingData)

             mac_address  ssid  channel source  signal_strength  floorid  \
0      6A:BD:12:5B:D6:64    11     2462   cap1              -61      121   
1      6A:BD:12:5B:D6:64    11     2462   cap1              -52      121   
2      6A:BD:12:5B:D6:64    11     2462   cap1              -53      121   
3      6A:BD:12:5B:D6:64    11     2457   cap1              -51      121   
4      6A:BD:12:5B:D6:64    11     2462   cap1              -52      121   
...                  ...   ...      ...    ...              ...      ...   
11811  22:95:8E:C1:1D:93     2     2457   cap2              -65       18   
11812  22:95:8E:C1:1D:93     2     2462   cap1              -82       18   
11813  FE:47:AD:D7:13:E2     1     2447   cap2              -66       18   
11814  22:95:8E:C1:1D:93     2     2462   cap2              -65       18   
11815  7A:44:1F:B5:90:E3    11     2412   cap2              -81       18   

       latitude   longitude         timestamp  
0      7.065750  125.596484  11/11/2023

In [None]:
import pandas as pd
from itertools import product

# Reshape the long-format `trainingData` (one row per reading, tagged with a
# 'source' of cap1/cap2/cap3) into one wide row per
# (ssid, floorid, channel_cap1, channel_cap2, channel_cap3) combination.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic. Because the frame is built from records, the numeric
# columns get inferred numeric dtypes instead of `object`.

CAP_SOURCES = ('cap1', 'cap2', 'cap3')
COMBINATION_COLUMNS = [
    'mac_address', 'ssid', 'latitude', 'longitude', 'floorid', 'timestamp',
    'channel_cap1', 'channel_cap2', 'channel_cap3',
    'signal_strength_cap1', 'signal_strength_cap2', 'signal_strength_cap3'
]

# Find unique SSIDs and floor IDs in the dataset
unique_ssids = trainingData['ssid'].unique()
unique_floor_ids = trainingData['floorid'].unique()

combination_records = []

for ssid, floor_id in product(unique_ssids, unique_floor_ids):
    # Readings for this specific SSID on this specific floor
    specific_ssid_floor_data = trainingData[(trainingData['ssid'] == ssid) & (trainingData['floorid'] == floor_id)]

    # No readings for this pair means no channel combinations; skip the work.
    if specific_ssid_floor_data.empty:
        continue

    # For each cap: the distinct channels seen, and the FIRST signal strength
    # recorded on each channel (same value the per-row `.iloc[0]` lookup gave).
    channels_per_cap = []
    first_signal_per_cap = []
    for cap in CAP_SOURCES:
        cap_data = specific_ssid_floor_data[specific_ssid_floor_data['source'] == cap]
        channels_per_cap.append(cap_data['channel'].unique())

        first_signal = {}
        for channel, strength in zip(cap_data['channel'], cap_data['signal_strength']):
            first_signal.setdefault(channel, strength)  # keep the first occurrence only
        first_signal_per_cap.append(first_signal)

    # Cartesian product of the per-cap channels. If any cap has no readings the
    # product is empty and this (ssid, floor) pair contributes no rows.
    for combination in product(*channels_per_cap):
        combination_records.append({
            'mac_address': specific_ssid_floor_data['mac_address'].iloc[0],
            'ssid': ssid,
            'latitude': specific_ssid_floor_data['latitude'].iloc[0],
            'longitude': specific_ssid_floor_data['longitude'].iloc[0],
            'floorid': floor_id,
            'timestamp': specific_ssid_floor_data['timestamp'].iloc[0],
            'channel_cap1': combination[0],
            'channel_cap2': combination[1],
            'channel_cap3': combination[2],
            'signal_strength_cap1': first_signal_per_cap[0][combination[0]],
            'signal_strength_cap2': first_signal_per_cap[1][combination[1]],
            'signal_strength_cap3': first_signal_per_cap[2][combination[2]],
        })

# Passing `columns` fixes the column order and yields a correctly-shaped empty
# frame when no combination was found.
combinations_df = pd.DataFrame(combination_records, columns=COMBINATION_COLUMNS)

# Print the resulting DataFrame with all combinations
print(combinations_df)


In [90]:
def clean_data(df):
    """
    Rescale the signal-strength columns and drop identifier columns.

    The three columns at positions 9, 10 and 11 (the signal-strength columns of
    the combinations frame) are remapped so that larger means stronger:
      * values <= 0 (dBm readings) become value + 105, e.g. -104 -> 1, 0 -> 105
      * values  > 0 become value - 100, so the "not detected" marker 100 -> 0

    The input frame is NOT modified; the work is done on a copy, so calling the
    function again on the same input gives the same result.

    INPUT: DataFrame whose columns 9..11 hold signal strengths and which
           contains 'mac_address' and 'timestamp' columns
    OUTPUT: new DataFrame with rescaled signals and without 'mac_address' and
            'timestamp'
    RAISES: KeyError if 'mac_address' or 'timestamp' is missing
    """
    # Copy first: the original version edited the caller's frame in place, so
    # a second run re-shifted the signals and failed on the already-dropped columns.
    cleaned = df.copy()

    # Positional slice of the three signal-strength columns.
    signal = cleaned.iloc[:, 9:12]
    cleaned.iloc[:, 9:12] = np.where(signal <= 0, signal + 105, signal - 100)

    # Identifier columns are not used as model features.
    return cleaned.drop(columns=['mac_address', 'timestamp'])

In [91]:
#Apply Cleaning
# NOTE(review): this rebinds `trainingData` (until now the long-format readings
# frame) to the cleaned wide combinations frame. The combinations cell reads
# trainingData['source'], a column the wide frame does not have, so re-running
# that cell after this one will fail — consider a distinct name here.

trainingData  = clean_data(combinations_df)

# Persist the cleaned frame next to the notebook (row index is not written).
trainingData.to_csv('trainingData.csv', index=False)

In [82]:
import pandas as pd

def preprocess_data(df):
    """
    Split a cleaned wide frame into model features and one-hot floor targets.

    INPUT: DataFrame containing 'longitude', 'latitude', 'floorid' and, for
           each of cap1..cap3, a 'channel_capN' and 'signal_strength_capN' column
    OUTPUT: (X, y)
        X: every other input column, followed by one column per (cap, channel)
           named 'signal_strength_capN_channel_capN_<channel>'. It holds the
           cap's signal strength on rows where that cap is on <channel>, else 0.
        y: dummy-encoded 'floorid' (one 'floorid_<id>' column per floor present)
    """
    # (channel column, signal column) for each capture source, in output order
    cap_columns = (
        ('channel_cap1', 'signal_strength_cap1'),
        ('channel_cap2', 'signal_strength_cap2'),
        ('channel_cap3', 'signal_strength_cap3'),
    )

    # Features start as everything except the location/floor columns ...
    X = df.drop(['longitude', 'latitude', 'floorid'], axis=1)
    # ... and the raw channel / signal columns, which are replaced below.
    X = X.drop([channel_col for channel_col, _ in cap_columns], axis=1)
    X = X.drop([signal_col for _, signal_col in cap_columns], axis=1)

    # Every channel seen on any cap, in ascending order
    seen_channels = set()
    for channel_col, _ in cap_columns:
        seen_channels |= set(df[channel_col].unique())
    ordered_channels = sorted(seen_channels)

    # Cap-major, channel-minor: signal strength masked by a 0/1 channel indicator
    for channel_col, signal_col in cap_columns:
        for channel in ordered_channels:
            on_channel = (df[channel_col] == channel).astype(int)
            X[f'{signal_col}_{channel_col}_{channel}'] = df[signal_col] * on_channel

    # Targets: one indicator column per floor id
    y = pd.get_dummies(data=df[['floorid']], columns=['floorid'])

    return X, y


In [76]:
#Apply preprocessing
# Produces the module-level `X` (features) and `y` (one-hot floor targets)
# that split_data() later reads as globals.
# NOTE(review): the saved output of this cell is a TypeError traceback, and the
# execution counts around it are out of order — confirm the notebook survives
# Restart Kernel -> Run All.

X, y = preprocess_data(trainingData)


#y.to_csv('y_values.csv', index=False)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [65]:
# Dump the feature frame as plain text.
print(X)


# Name of the CSV file the features are written to
# (relative to the notebook's working directory).
output_file = 'preprocessed_data.csv'

# Save the DataFrame to a CSV file
X.to_csv(output_file, index=False)


     ssid  signal_strength_cap1_channel_cap1_2412  \
0      11                                       0   
1      11                                       0   
2      11                                       0   
3      11                                       0   
4      11                                       0   
...   ...                                     ...   
2486    4                                       0   
2487    4                                       0   
2488    4                                       0   
2489    4                                       0   
2490    4                                       0   

      signal_strength_cap1_channel_cap1_2417  \
0                                          0   
1                                          0   
2                                          0   
3                                          0   
4                                          0   
...                                      ...   
2486                       

In [66]:
# Dump the one-hot floor targets (one 'floorid_<id>' column per floor present).
print(y)


      floorid_101  floorid_104  floorid_121
0               0            0            1
1               0            0            1
2               0            0            1
3               0            0            1
4               0            0            1
...           ...          ...          ...
2486            0            1            0
2487            0            1            0
2488            0            1            0
2489            0            1            0
2490            0            1            0

[2491 rows x 3 columns]


In [46]:
def split_data(preprocess_data, features=None, targets=None, test_size=0.2, random_state=42):
    """
    Split features and targets into shuffled training and testing sets.

    INPUT:
        preprocess_data: unused. Kept only so existing calls of the form
            split_data(preprocess_data) keep working; the value is never read.
        features: feature frame to split; defaults to the module-level `X`
        targets: target frame to split; defaults to the module-level `y`
        test_size: fraction of samples held out for testing (default 0.2)
        random_state: seed for the shuffle, for reproducible splits (default 42)
    OUTPUT: X_train, X_test, y_train, y_test

    SIDE EFFECT: the four results are also written to the module-level names
    X_train, X_test, y_train and y_test, as the original implementation did.
    """
    # Kept for backward compatibility with code that reads these globals
    # instead of using the return value.
    global X_train, X_test, y_train, y_test

    # Fall back to the notebook-level X / y only when no data is passed in,
    # so the function can also be used on explicitly supplied frames.
    if features is None:
        features = X
    if targets is None:
        targets = y

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=test_size,
                                                        random_state=random_state,
                                                        shuffle=True)

    # Show the results of the split
    print("Training set has {} samples.".format(X_train.shape[0]))
    print("Testing set has {} samples.".format(X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [47]:
#Apply split data
# NOTE(review): the argument passed here is the preprocess_data *function*
# object, not its result; split_data never reads that argument and takes the
# data from the module-level `X` and `y`, so this cell only works after the
# preprocessing cell has run.

X_train, X_test, y_train, y_test = split_data(preprocess_data)



Training set has 12 samples.
Testing set has 4 samples.


In [48]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so the same fitted scaler can later transform
# validation data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# k-nearest-neighbours classifier; k is the number of neighbours consulted.
k = 1  # You can adjust the value of k
knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean', p=2).fit(X_train, y_train)

# Predict the held-out split and report accuracy as a percentage.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 100.00%


  knn.fit(X_train, y_train)


In [29]:
# Assuming unique_channels is a list containing all unique channel values
# NOTE(review): this list is hard-coded rather than derived from the data —
# confirm it really covers every channel.
unique_channels = [2417, 2422, 2412, 2427]

# Pivot the DataFrame to create separate columns for each 'cap'
# Each (mac_address, ssid, timestamp, floorid, latitude, longitude) group
# becomes one row; for every 'source' value the first 'channel' and first
# 'signal_strength' of the group are kept, and missing cells are filled with 0.
# NOTE(review): `ap_data_selected` is not defined in any cell visible here —
# confirm where it comes from.
ap_data_pivot = ap_data_selected.pivot_table(
    index=['mac_address', 'ssid', 'timestamp', 'floorid', 'latitude', 'longitude'],
    columns='source',
    values=['channel', 'signal_strength'],
    aggfunc='first',
    fill_value=0  # Specify the fill value for missing entries
).reset_index()

# Ensure that all unique channel columns are present in the DataFrame
# NOTE(review): the pivoted value columns appear to be keyed by
# (value name, source), e.g. ('channel', 'cap1'), so a key such as
# ('channel', 2417) is never present and this loop always appends an all-zero
# column per listed channel (the saved output shows them as 0) — confirm intent.
for channel in unique_channels:
    if ('channel', channel) not in ap_data_pivot.columns:
        ap_data_pivot[('channel', channel)] = 0  # Add a column with the missing channel and fill with 0

# Sort the columns to have a consistent order
# The two column levels are swapped (source first), the columns are sorted, and
# the final reset_index() adds the old row index as an extra 'index' column.
# NOTE(review): after this step the frame no longer has the 12-column layout
# that the positional rename in the following cell expects — verify cell order.
ap_data_pivot = ap_data_pivot.reorder_levels([1, 0], axis=1).sort_index(axis=1, level=[0, 1]).reset_index()

print(ap_data_pivot)

source index    cap1                    cap2                    cap3  \
             channel signal_strength channel signal_strength channel   
0          0       0               0    2417             -66    2412   
1          1    2412             -57    2417             -63    2412   
2          2    2427             -67    2417             -66    2412   

source                                                                        \
       signal_strength floorid  latitude   longitude        mac_address ssid   
0                  -89     101  7.065735  125.596478  FE:47:AD:D7:13:E2    1   
1                  -89     101  7.065735  125.596478  FE:47:AD:D7:13:E2    1   
2                  -93     101  7.065735  125.596478  FE:47:AD:D7:13:E2    1   

source                      2417    2422    2412    2427  
               timestamp channel channel channel channel  
0       24/11/2023 12:02       0       0       0       0  
1       24/11/2023 12:03       0       0       0       0  
2 

In [15]:


# Give the pivoted columns flat, descriptive names.
# The rename is positional: it expects exactly the six index columns followed
# by the channel columns for cap1..cap3 and then the signal_strength columns
# for cap1..cap3, and raises if the column count differs. (An earlier flatten
# step was removed because this assignment overwrote its result immediately.)
ap_data_pivot.columns = [
    'mac_address', 'ssid', 'timestamp', 'floorid', 'latitude', 'longitude',
    'cap1_channel', 'cap2_channel', 'cap3_channel',
    'cap1_signal_strength', 'cap2_signal_strength', 'cap3_signal_strength'
]

channel_columns = ['cap1_channel', 'cap2_channel', 'cap3_channel']
signal_columns = ['cap1_signal_strength', 'cap2_signal_strength', 'cap3_signal_strength']

# Replace missing signal_strength values with 100 and missing channel values
# with 0, in a single frame-level call. The previous
# `df[col].fillna(..., inplace=True)` form is chained assignment: it is
# deprecated in pandas 2.x and has no effect when Copy-on-Write is enabled.
fill_values = {column: 100 for column in signal_columns}
fill_values.update({column: 0 for column in channel_columns})
ap_data_pivot = ap_data_pivot.fillna(fill_values)

# Remove rows in which two or more of the three cap channels are 0, i.e. keep
# rows with at most one zero channel (same rule as checking every pair of caps).
zero_channel_count = (ap_data_pivot[channel_columns] == 0).sum(axis=1)
ap_data_pivot = ap_data_pivot[zero_channel_count < 2]

# Save the DataFrame to a CSV file
ap_data_pivot.to_csv('ap_data_processed-3.csv', index=False)

# Print the resulting DataFrame
print(ap_data_pivot.head())


         mac_address  ssid         timestamp  floorid  latitude   longitude  \
0  02:9D:2F:8D:49:90     6  10/11/2023 14:44      101  7.065735  125.596478   
1  02:9D:2F:8D:49:90     6  10/11/2023 14:45      101  7.065735  125.596478   
2  02:9D:2F:8D:49:90     6  10/11/2023 14:46      101  7.065735  125.596478   
3  02:9D:2F:8D:49:90     6  10/11/2023 14:47      101  7.065735  125.596478   
4  02:9D:2F:8D:49:90     6  10/11/2023 14:48      101  7.065735  125.596478   

   cap1_channel  cap2_channel  cap3_channel  cap1_signal_strength  \
0        2417.0        2417.0        2412.0                 -57.0   
1        2417.0        2412.0        2417.0                 -69.0   
2        2417.0        2412.0        2412.0                 -65.0   
3        2422.0        2422.0        2417.0                 -67.0   
4        2422.0        2412.0        2412.0                 -63.0   

   cap2_signal_strength  cap3_signal_strength  
0                 -74.0                 -94.0  
1             

In [35]:
# Load the processed (pivoted) dataset from CSV into `ap_data_processed`.
# NOTE(review): absolute, machine-specific path; the file name also differs
# from the 'ap_data_processed-3.csv' written by the pivot-cleaning cell —
# confirm this is the intended file.
ap_data_processed_data = r'C:\Users\pc\Desktop\Thesis\Untitled Folder 1\ap_data_processed.csv'
ap_data_processed = pd.read_csv(ap_data_processed_data)

In [18]:
# Number of rows in the processed dataset.
print(len(ap_data_processed))

1962


In [19]:
# Report the dimensions of the processed frame: column count first, then row count.
num_rows, num_cols = ap_data_processed.shape
for template, count in (
    ('Number of columns: {}', num_cols),
    ('Number of rows: {}', num_rows),
):
    print(template.format(count))

Number of columns: 12
Number of rows: 1962


In [20]:
# Keep the processed frame's column labels as a NumPy array and display them.
# This reuses (and overwrites) the `col_names` name from the raw-data check.
col_names = ap_data_processed.columns.values
col_names

array(['mac_address', 'ssid', 'timestamp', 'floorid', 'latitude',
       'longitude', 'cap1_channel', 'cap2_channel', 'cap3_channel',
       'cap1_signal_strength', 'cap2_signal_strength',
       'cap3_signal_strength'], dtype=object)

In [21]:
# Collect the distinct values of the identifying columns of the processed data,
# in the same order as the names on the left-hand side.
unique_mac_address, unique_ssid, unique_floorid = (
    ap_data_processed[column].unique()
    for column in ("mac_address", "ssid", "floorid")
)
# Per-cap channel summaries are currently switched off:
#unique_channel_cap1 = ap_data_processed["cap1_channel"].unique()
#unique_channel_cap2 = ap_data_processed["cap2_channel"].unique()
#unique_channel_cap3 = ap_data_processed["cap3_channel"].unique()

# Print each set of distinct values with its label.
print('Unique Mac Address : {}'.format(unique_mac_address))
print('Unique SSID : {}'.format(unique_ssid))
#print('Unique Channel_cap1 : {}'.format(unique_channel_cap1))
#print('Unique Channel_cap2 : {}'.format(unique_channel_cap2))
#print('Unique Channel_cap3 : {}'.format(unique_channel_cap3))
print('Unique Floor ID : {}'.format(unique_floorid))


Unique Mac Address : ['02:9D:2F:8D:49:90' '1E:03:B6:E0:9E:3C' '22:95:8E:C1:1D:93'
 '52:39:94:90:76:D2' '56:3A:A2:F8:0C:63' '56:DE:9D:83:4D:C6'
 '6A:BD:12:5B:D6:64' '7A:44:1F:B5:90:E3' '7A:6B:C2:5A:7B:88'
 '8E:B0:7A:54:55:A6' 'A2:89:5E:B6:E7:58' 'B6:6A:AD:C1:CF:19'
 'BE:7E:CC:35:1C:46' 'E6:4C:39:FC:36:8B' 'F6:CE:87:F2:06:21'
 'FE:47:AD:D7:13:E2']
Unique SSID : [ 6  2 10  3  9 11  8  7  4  5  1]
Unique Floor ID : [101 102 103 104 105 106 107 108 109 110 111 112 117 118 119 120 127 128
 129 130 137 138 139 140 147 148 149 150 157 158 159 160 167 168 169 170
 187 188 189   1   5   6   7   9  10  11  12  13  14  15  16  17  18 113
 114 115 116 121   2   3   4   8 122 123 124 125 126 131 132 133 134 135
 136 141 142 143 144 145 146 151 152 153 154 155 156 161 162 163 164 165
 166 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186]
