# Set things up

In [1]:
import os
import pandas as pd
import datetime
from sklearn.model_selection import StratifiedShuffleSplit

# Root directory of the WiFi exports. The default is the original author's local
# folder; set the WIFI_DATA_DIR environment variable to point the notebook at a
# different location without editing this cell.
DATA_SOURCE = os.environ.get("WIFI_DATA_DIR", "C:/Users/sbranchett/Data/WiFi_data")
# CSV file that load_wifi_data reads by default.
ACCESS_POINT = os.path.join(DATA_SOURCE, "test100.csv")

# National Holidays and Collective Free days (all in May 2021).
# Kept as a tuple of datetime.date so membership can be tested with `date in uni_hols`.
uni_hols = (
    datetime.date(year=2021, month=5, day=5),
    datetime.date(year=2021, month=5, day=13),
    datetime.date(year=2021, month=5, day=14),
    datetime.date(year=2021, month=5, day=24),
)

# Academic-calendar windows, checked in this order. Each entry is
# (day before the window, day after the window, category); both bounds are exclusive.
_ROOSTER_WINDOWS = (
    (datetime.date(2021, 5, 16), datetime.date(2021, 5, 22), "Exam_BSc"),
    (datetime.date(2021, 6, 15), datetime.date(2021, 6, 19), "Study_mixed"),
    (datetime.date(2021, 6, 22), datetime.date(2021, 6, 26), "Exam_mixed"),
    (datetime.date(2021, 6, 29), datetime.date(2021, 7, 3), "Exam"),
    (datetime.date(2021, 8, 8), datetime.date(2021, 8, 14), "Exam"),
)
# Any date strictly after this one that is not inside a window above is "Free".
_ROOSTER_FREE_AFTER = datetime.date(2021, 7, 4)


def rooster(date):
    """Categorise a day of the academic year at TU Delft.

    Intended for dates between 1 May 2021 and 29 August 2021.

    Parameters
    ----------
    date : datetime.date
        The calendar day to classify.

    Returns
    -------
    str
        One of "Exam_BSc", "Study_mixed", "Exam_mixed", "Exam", "Free" or "Learn".
        The first matching window wins; dates after 4 July 2021 outside every
        window are "Free", and everything else is "Learn".
    """
    for day_before, day_after, label in _ROOSTER_WINDOWS:
        if date > day_before and date < day_after:
            return label
    if date > _ROOSTER_FREE_AFTER:
        return "Free"
    return "Learn"

# Get the interesting fields from the WiFi data

In [2]:
def load_wifi_data(interesting_columns, path=ACCESS_POINT):
    """Read the WiFi CSV file and return only the requested columns.

    Parameters
    ----------
    interesting_columns : list of str
        Names of the columns to keep, written without surrounding whitespace.
    path : str, optional
        CSV file to read. Defaults to ACCESS_POINT.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing just ``interesting_columns``, in the
        order given.

    Raises
    ------
    KeyError
        If a requested column is not present in the file header (after the
        header names have been stripped of whitespace).
    """
    raw_data = pd.read_csv(path, delimiter=",")
    # Header names are stripped so columns can be selected by their clean names.
    raw_data = raw_data.rename(columns=str.strip)
    # Return a real copy: callers add new columns to the result, and assigning
    # into a bare column selection raises SettingWithCopyWarning in pandas.
    return raw_data[interesting_columns].copy()

# The only CSV fields the rest of the notebook uses.
interesting_columns = [
    "timestamp",
    "clientCount",
    "locationHierarchy",
]
all_data = load_wifi_data(interesting_columns=interesting_columns)

# Extract Building

In [3]:
# locationHierarchy is a " > "-separated path (e.g. "TUDelft > 20-Aula > 2e Verdieping");
# the building is its second component.
location_parts = all_data["locationHierarchy"].str.split(" > ", expand=True)
all_data["building"] = location_parts[1]

# Create hourly time buckets

In [4]:
def _epoch_ms_to_datetime(epoch_ms):
    """Convert a timestamp given as Epoch milliseconds to a datetime (whole seconds only)."""
    # NOTE(review): fromtimestamp() interprets the value in the machine's local
    # timezone, so the result depends on where the notebook runs - confirm intended.
    return datetime.datetime.fromtimestamp(int(epoch_ms / 1000))


def _hour_of(moment):
    """Return the time of day of `moment` truncated to the start of its hour."""
    return datetime.time(moment.hour, 0, 0)


def _date_plus_hour(row):
    """Join a row's "date" and "hour" values into one datetime marking the hour bucket."""
    return datetime.datetime.combine(row["date"], row["hour"])


# timestamp (Epoch ms) -> datetime
all_data["datetime"] = all_data["timestamp"].apply(_epoch_ms_to_datetime)

# split the datetime into its calendar date and its hour of day
all_data["date"] = all_data["datetime"].apply(datetime.datetime.date)
all_data["hour"] = all_data["datetime"].apply(_hour_of)

# one bucket per date and hour
all_data["hour_bucket"] = all_data.apply(_date_plus_hour, axis=1)

# Create Work days and Student days

In [5]:
# True for Monday-Friday: date.weekday() is 0 for Monday through 6 for Sunday.
all_data["weekday"] = all_data["date"].apply(lambda day: day.weekday() < 5)
# True when the calendar date itself is listed in uni_hols. The membership test
# must use the date, not date.weekday(): an int never equals a datetime.date,
# so testing the weekday number would mark every row False.
all_data["uni_hols"] = all_data["date"].apply(lambda day: day in uni_hols)
# Academic-calendar category of each date, as returned by rooster().
all_data["academic_yr"] = all_data["date"].apply(rooster)

# Separate Test Set stratified over buildings

In [6]:
# Hold out 20% of the rows as a test set, stratified on "building";
# random_state is fixed so the split is the same on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
# split() returns positional row indices, so select with .iloc; label-based
# .loc only gives the same rows while all_data has a default 0..n-1 index.
train_index, test_index = next(split.split(all_data, all_data["building"]))
strat_train_set = all_data.iloc[train_index]
strat_test_set = all_data.iloc[test_index]

print(strat_train_set)

        timestamp  clientCount                          locationHierarchy  \
97  1620991804473            1     TUDelft > 26-Bouwcampus > Beganegrond    
74  1620991804468            0         TUDelft > 20-Aula > 2e Verdieping    
13  1620991804455            0            TUDelft > 31-TBM > Beganegrond    
92  1620991804472            2   TUDelft > 26-Bouwcampus > 2e verdieping    
36  1620991804460            0         TUDelft > 20-Aula > 2e Verdieping    
..            ...          ...                                        ...   
64  1620991804466            0         TUDelft > 20-Aula > 2e Verdieping    
55  1620991804464            0         TUDelft > 20-Aula > 2e Verdieping    
78  1620991804469            0         TUDelft > 20-Aula > 2e Verdieping    
81  1620991804469            0         TUDelft > 20-Aula > 2e Verdieping    
94  1620991804472            0   TUDelft > 26-Bouwcampus > 1e verdieping    

         building            datetime        date      hour  \
97  26-Bouwc

# Bucketise clientCounts, total per building and total on campus