In [1]:
import os
import pandas as pd
import datetime
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt

In [2]:
# Directory holding the WiFi exports. Set the WIFI_DATA_SOURCE environment variable
# to point somewhere else; when it is unset the original local directory is used.
DATA_SOURCE = os.environ.get("WIFI_DATA_SOURCE", "C:/Users/sbranchett/Data/WiFi_data")
# Full path of the access-point client-count export inside DATA_SOURCE.
ACCESS_POINT = os.path.join(DATA_SOURCE, "WiFiAccessPoint.csv")

In [3]:
def load_wifi_data(path=ACCESS_POINT, bucket_ms=300000):
    """
    Read the WiFi clientCount .csv file and add a building column and a bucketed timestamp column.

    Input:
        path: file path of the .csv file (defaults to ACCESS_POINT).
        bucket_ms: width of a time bucket, in the same unit as "timestamp";
            the default 300000 gives the 5 minute buckets used in this notebook.
    Output: Dataframe with columns "timestamp", "id", "clientCount", "locationHierarchy" directly from file and
    columns "building", generated from "locationHierarchy", and "time_bucket", generated from "timestamp".

    "building" is the second " > "-separated element of "locationHierarchy", or the string "None"
    when there is no second element. "time_bucket" is "timestamp" rounded down to a multiple of bucket_ms.
    """
    interesting_columns = ["timestamp", "id", "clientCount", "locationHierarchy", "building", "time_bucket"]
    all_data = pd.read_csv(path, delimiter=",")
    # Header names may carry surrounding whitespace; strip it so the lookups below work.
    all_data = all_data.rename(columns=lambda x: x.strip())
    # Take element 1 of each split list with .str[1] instead of expand=True followed by [1]:
    # the expanded frame has no column 1 (KeyError) when no row contains the separator,
    # whereas .str[1] simply yields a missing value for rows without a second element.
    all_data["building"] = all_data["locationHierarchy"].str.split(" > ").str[1].fillna("None")
    # Vectorised floor to the start of the bucket; integer floor division avoids the per-row
    # Python call and the float division of the previous apply(lambda ...) version.
    all_data["time_bucket"] = (all_data["timestamp"] // bucket_ms) * bucket_ms
    return all_data[interesting_columns]

# Load the access-point data once; the cells below reuse `all_data`.
all_data = load_wifi_data()
# Plain-text dump of the frame; pandas abbreviates it to the first and last rows.
print(all_data)

             timestamp           id  clientCount  \
0        1620991804452    119942019            0   
1        1620991804452    119942021            0   
2        1620991804453    119942023            0   
3        1620991804453    119942025            0   
4        1620991804453    119942027            0   
...                ...          ...          ...   
5494085  1621608617172  21431122764            0   
5494086  1621608617172  21431122766            0   
5494087  1621608617172  21431122768            0   
5494088  1621608617172  21431122770            0   
5494089  1621608617173  21431122772            0   

                             locationHierarchy   building    time_bucket  
0            TUDelft > 31-TBM > 3e Verdieping      31-TBM  1620991800000  
1            TUDelft > 31-TBM > 3e Verdieping      31-TBM  1620991800000  
2            TUDelft > 31-TBM > 4e Verdieping      31-TBM  1620991800000  
3            TUDelft > 31-TBM > 2e Verdieping      31-TBM  1620991800000  


## Access points can be in different buildings

## find all buildings

In [4]:
# Number of rows recorded for each building. value_counts() returns a Series indexed
# by building name, ordered from the most to the fewest rows.
list_of_buildings = all_data["building"].value_counts()
print(type(list_of_buildings))
print(list_of_buildings)

<class 'pandas.core.series.Series'>
23-CITG                              625320
22-TNW-TN                            592416
34-3ME                               464882
08-BK-City                           442227
58-TNW-Zuid                          327063
36-EWI LB_K t/m 3 & HB_K  t/m 2e     296208
32-OCP-IO                            294151
62-LR                                277675
28- WNI                              222156
21-BTUD                              183073
36-EWI-HB                            174817
31-TBM                               170671
50-TNW-RID                           166617
20-Aula                              160436
26-Bouwcampus                        133705
03-Science Center                    111078
66-Fellowship                        102850
33-Pulse                              94622
37-Sportcentrum                       88451
35-Drebbelweg                         76109
30-IKC_ISD-FMVG                       61710
None                                  57

# Sum the clientCount values per building

In [5]:
# Sum of the clientCount column over every row of all_data
print(all_data["clientCount"].sum())

3597531


In [6]:
# Total number of rows (5494090 in the recorded run): every row has a building value
# (missing ones were filled with "None"), so the per-building counts add up to it.
print(list_of_buildings.values.sum())

5494090


In [7]:
# Consistency check: for every building, the clientCount total over its raw rows must
# equal the total obtained after first summing clientCount per time bucket.
sumall = 0  # running clientCount total taken directly from the rows
sumbucket = 0  # running clientCount total taken from the per-bucket sums
for building in list_of_buildings.index:
    building_all = all_data.loc[all_data["building"] == building]
    # Aggregate only clientCount. Summing the whole frame also "sums" the string columns
    # (locationHierarchy, building) and the timestamp/id columns, whose results were
    # discarded straight away; restricting the columns avoids that wasted work.
    building_bucket_sum = building_all.groupby("time_bucket")[["clientCount"]].sum()
    sumall += building_all["clientCount"].sum()
    sumbucket += building_bucket_sum["clientCount"].sum()
    # The totals are cumulative, so a mismatch is reported at the first building where it appears.
    assert sumall == sumbucket, "bucketed clientCount total differs from raw total at building " + str(building)
    print(building, sumall, sumbucket)
print(sumall)
print(sumbucket)

23-CITG 306197 306197
22-TNW-TN 682563 682563
34-3ME 1145511 1145511
08-BK-City 1497646 1497646
58-TNW-Zuid 1830373 1830373
36-EWI LB_K t/m 3 & HB_K  t/m 2e  1993549 1993549
32-OCP-IO 2206211 2206211
62-LR 2323487 2323487
28- WNI 2377954 2377954
21-BTUD 2602365 2602365
36-EWI-HB 2613628 2613628
31-TBM 2689696 2689696
50-TNW-RID 2798594 2798594
20-Aula 2839668 2839668
26-Bouwcampus 2913487 2913487
03-Science Center 2984048 2984048
66-Fellowship 3046216 3046216
33-Pulse 3153908 3153908
37-Sportcentrum 3200868 3200868
35-Drebbelweg 3240218 3240218
30-IKC_ISD-FMVG 3356127 3356127
None 3378882 3378882
36-ESP-Lab 3395319 3395319
38-Cultureel Centrum 3402366 3402366
64-HSL 3416876 3416876
61-Vliegtuighal 3445531 3445531
30-O S 3454312 3454312
25-GreenVillage 3502853 3502853
46-P E lab 3518828 3518828
32a- Learninglab 3519922 3519922
45-LSL 3541973 3541973
60-LMS 3551628 3551628
05-TNW-BIO 3558138 3558138
63-Simona 3563561 3563561
VLL-LAB(TNO) 3584345 3584345
43-EGM 3586936 3586936
Katalyse La