In [None]:
import pandas as pd
import datetime
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import pandas as pd
from sklearn.preprocessing import LabelEncoder as le
from collections import defaultdict


In [None]:
# Load the location table from the spreadsheet; the first sheet column becomes
# the row index. (The file name suggests the data was resampled to 10-minute
# steps upstream - TODO confirm.)
df_raw = pd.read_excel(
    io="output/resampled_df_10_min.xlsx",
    index_col=[0],
)

In [None]:
# --- Baseline experiment configuration ---------------------------------------
# Study period; rows are kept when start_date <= time <= end_date (both ends inclusive).
start_date = pd.to_datetime("2023-01-01 00:00:00")
end_date = pd.to_datetime("2023-05-01 23:50:00")

# Length, in days, of the training window and of the test horizon that follows it.
training_window_size = 7
horizon_size = 7

# Lookup keys for the baseline, ordered from most to least specific:
#   day          = day of the month (1-31)
#   weekday      = day of the week (0-6, Monday = 0)
#   hour         = hour of the day (0-23)
#   window_block = 10-minute block within the hour (0-5)
model_features = ["day", "weekday", "hour", "window_block"]

# Nested dict for baseline scores; a missing outer key yields an empty dict.
baseline_performance = defaultdict(dict)

In [None]:
# Keep only the rows whose timestamp lies inside the study period (endpoints
# included). Work on a copy so the column writes below never touch df_raw.
in_study_period = df_raw["time"].between(start_date, end_date)
df = df_raw.loc[in_study_period].copy()

# Replace the raw location labels with the encoder's integer class ids; the
# fitted encoder is kept so ids can be mapped back to labels later.
label_encoder = le()
df["location"] = label_encoder.fit_transform(df["location"])

In [None]:
# Derive the calendar features used as lookup keys from the `time` column.
time_parts = df["time"].dt
df["day"] = time_parts.day
df["weekday"] = time_parts.dayofweek
df["hour"] = time_parts.hour

# Index of the 10-minute (600 s) block inside the hour: seconds elapsed since
# the top of the hour, floor-divided by 600, giving 0..5.
seconds_into_hour = time_parts.minute * 60 + time_parts.second
df["window_block"] = (seconds_into_hour // 600).astype(int)


In [None]:
# Show the first 10 rows of df as a quick visual check of its columns.
df.head(10)

In [None]:
# Build one training window followed directly by one test horizon. Both are
# closed intervals, so each window ends one 10-minute step before the next
# full day begins (e.g. 7 days -> last stamp is day 7 at 23:50).
sample_step = pd.Timedelta(minutes=10)

train_start_date = start_date
train_end_date = train_start_date + pd.Timedelta(days=training_window_size) - sample_step
test_start_date = train_end_date + sample_step
test_end_date = test_start_date + pd.Timedelta(days=horizon_size) - sample_step

# Boolean row selectors for the two periods (endpoints included).
stamps = df["time"]
train_mask = (stamps >= train_start_date) & (stamps <= train_end_date)
test_mask = (stamps >= test_start_date) & (stamps <= test_end_date)

# Features (X) and encoded location target (y) for each period.
train_rows = df.loc[train_mask]
test_rows = df.loc[test_mask]
X_train, y_train = train_rows[model_features], train_rows["location"]
X_test, y_test = test_rows[model_features], test_rows["location"]

print(f"Training: {train_start_date}-{train_end_date}, testing: {test_start_date}-{test_end_date}.")

In [None]:
def _most_frequent(values):
    """Return the value that occurs most often in `values`."""
    return values.value_counts().idxmax()


# Full rows (all columns) of the training and testing periods.
testing_data = df.loc[test_mask]
training_data = df.loc[train_mask]

# Baseline lookup table: for every combination of the model features seen in
# training, the location observed most often for that combination.
most_common_locations = (
    training_data.groupby(model_features)["location"]
    .apply(_most_frequent)
    .reset_index()
)

In [None]:
# Predict each test row with the most common training location for the same
# feature combination. After the merge, `location_x` is the actual location and
# `location_y` the baseline prediction (NaN when the combination was never seen
# in training).
result_df = testing_data.merge(most_common_locations, how="left", on=model_features)

# Back off to progressively coarser keys by dropping the leading feature each
# round. Only rows that are still unmatched are filled, so predictions found
# with a more specific key are never replaced by a coarser one.
features_to_use = model_features[1:]
while result_df["location_y"].isna().any() and features_to_use:
    print('nan > 0, now trying with features: ', features_to_use)
    most_common_locations = (
        training_data[["location"] + features_to_use]
        .groupby(features_to_use)["location"]
        .apply(lambda x: x.value_counts().idxmax())
        .reset_index()
    )
    # The lookup has one row per key, so this left merge yields exactly one row
    # per test row, in the same order and with the same fresh integer index as
    # result_df; fillna therefore aligns row-for-row.
    fallback_df = testing_data.merge(most_common_locations, how="left", on=features_to_use)
    result_df["location_y"] = result_df["location_y"].fillna(fallback_df["location_y"])
    features_to_use = features_to_use[1:]  # Drop the leading feature for the next, coarser round

# Last resort once every feature has been dropped: the single most common
# training location. (Grouping by an empty key list would raise instead.)
still_missing = result_df["location_y"].isna()
if still_missing.any():
    overall_most_common = training_data["location"].value_counts().idxmax()
    result_df.loc[still_missing, "location_y"] = overall_most_common

# NaNs force the merged column to float; restore the label dtype.
result_df["location_y"] = result_df["location_y"].astype(training_data["location"].dtype)

predictions = result_df.location_y.values.tolist()



In [None]:
# Score the baseline separately for each day of the test horizon and print the
# accuracy. Rows are cut into consecutive chunks of 144 (24 hours x 6
# ten-minute blocks), which assumes the test period has no missing rows.
# Note: the accuracy is only printed here; nothing is written to
# `baseline_performance`.
samples_per_day = 144
for day_index in range(horizon_size):
    day_rows = slice(day_index * samples_per_day, (day_index + 1) * samples_per_day)
    acc = accuracy_score(y_test[day_rows], predictions[day_rows])
    print(f"Acc of baseline: {acc}")

In [2]:
import DataLoader as DL
from Cluster import Cluster

# --- Parameters for the DataLoader / Cluster pipeline ------------------------
# NOTE(review): this cell re-binds `end_date`, `training_window_size` and
# `horizon_size`, which the baseline cells earlier in this notebook also define
# (with different values, and `end_date` as a Timestamp rather than a string).

# Data export to read: either 'google_maps' or 'routined'.
data_source = "google_maps"

# Whole number of hours (must be an int) added to every timestamp to correct
# for the timezone. Google Maps timestamps arrive in GMT+0, so 2 hours are
# added to reach GMT+2 (Dutch timezone). Use 0 for routined, 2 for google_maps.
hours_offset = 2

# Date range used to filter the data for the analysis. The end date is inclusive.
begin_date = "2022-01-01"
end_date = "2022-12-30"

# Share of the DataFrame to keep (final df = df * fraction). Lower values work
# around memory issues, but 1 is preferred.
fraction = 1

# Separate date range for the heatmap visualization; it must lie between
# begin_date and end_date. For readability, a span of 2 to 14 days is
# suggested. The end date is inclusive; preferably choose one at least 2 days
# before end_date to avoid errors.
# NOTE(review): both dates below fall in 2023, outside the 2022 range set
# above - confirm which range is intended.
heatmap_begin_date = "2023-01-20"
heatmap_end_date = "2023-05-28"

# Model-performance settings: number of training days, test horizon (days) and
# the step by which the window moves.
training_window_size = 100
horizon_size = 30
window_step_size = 1

# All outputs are written to output/<outputs_folder_name>.
outputs_folder_name = f"remove-{training_window_size}-{horizon_size}-{window_step_size}"

In [4]:
# Step 1. Load the raw location data. The loader returns two values; only the
# first (the DataFrame) is used here.
# NOTE(review): this re-binds `df`, replacing the frame used by the baseline
# cells earlier in the notebook.
df, _ = DL.load_data(
    data_source,
    begin_date,
    end_date,
    fraction,
    hours_offset,
    outputs_folder_name=outputs_folder_name,
    verbose=True,
    perform_eda=True,
)

# Step 2. Configure the clustering. Option descriptions follow the original
# author's notes; the Cluster implementation is not visible from this notebook.
c = Cluster(
    df,  # Input dataset (with latitude, longitude, timestamp columns)
    outputs_folder_name=outputs_folder_name,
    verbose=True,  # Print progress messages
    pre_filter=True,  # Filter the data before clustering (e.g. remove moving points)
    post_filter=True,  # Filter data/clusters after clustering (e.g. delete homogeneous clusters)
    filter_moving=True,  # Drop data points recorded while the subject was moving
    centroid_k=10,  # Nearest neighbours considered for the density calculation (cluster centroids)
    min_unique_days=1,  # With post_filter=True: delete clusters visited on fewer than this many days
)

# Step 3. Cluster, attach the locations to the original frame, and plot. Each
# call is made on the object returned by the previous one, exactly as in a
# method chain (the original notes say these methods return `self`).
clustered = c.run_clustering(
    min_samples=200,  # Samples in a neighbourhood for a point to count as a core point
    eps=0.01,  # Maximum distance between two neighbouring samples; 0.01 = 10m per the original note
    algorithm="dbscan",  # Either 'dbscan' or 'hdbscan'; with 'hdbscan' only min_samples is required
    # min_cluster_size=50,  # HDBSCAN only: minimum size of a final cluster; higher values give bigger clusters
)
with_locations = clustered.add_locations_to_original_dataframe(
    export_xlsx=False,  # Whether to export the dataframe to an Excel file for inspection
    name="test",
)
plotted = with_locations.plot_clusters(
    filter_noise=False,  # Whether to remove the -1 labels (i.e. noise) before plotting
)

# The resulting frame is the input for further modelling/visualization.
df = plotted.df


Message (data loader): Since HOUR_OFFSET > 0, we offset the timestamps with 2 hours.
Message (data loader): Loaded google_maps data from 2022-01-01 to 2022-12-30 with a fraction of 1. Length of data: 150821
Message (data loader): First record in dataset is from 2022-01-01 03:20:48 and last record is from 2022-12-30 23:59:46
Message (data loader): Performing EDA, saving plots at output/remove-100-30-1
Message (filter moving): Marked 39136 data points as moving.
Message (clustering): Clustering 111685 data points with DBSCAN, with eps = 0.01, min_samples = 200. 
Message (clustering): Start clustering...
Message (clustering): Clustering took 23.155152320861816 seconds.
Message (OSM): Adding OSM location data to 18 clusters.
Message (post filter mean std ratio): Deleted 1 clusters (with labels: ['9'])
Message (clustering): Final number of clusters: 17.


In [5]:
# Display the DataFrame held on the Cluster instance.
# NOTE(review): the stored output of this cell shows precise coordinates and
# street addresses - clear the outputs before sharing or committing.
c.df

Unnamed: 0,timestamp,latitude,longitude,source,moving,cluster,location,id
0,2022-01-01 03:32:03.461,51.726189,5.307098,WIFI,False,0,"Maaspoortweg, 305",0
1,2022-01-01 04:08:10.808,51.726189,5.307098,WIFI,False,0,"Maaspoortweg, 305",1
2,2022-01-01 05:31:45.348,51.726189,5.307098,WIFI,False,0,"Maaspoortweg, 305",2
3,2022-01-01 12:49:51.094,51.726043,5.307154,WIFI,False,0,"Maaspoortweg, 305",3
4,2022-01-01 12:59:56.046,51.726043,5.307154,WIFI,False,0,"Maaspoortweg, 305",4
...,...,...,...,...,...,...,...,...
48831,2022-12-30 23:53:16.381,51.726057,5.307102,WIFI,False,0,"Maaspoortweg, 305",48831
48832,2022-12-30 23:55:16.442,51.726057,5.307102,WIFI,False,0,"Maaspoortweg, 305",48832
48833,2022-12-30 23:57:16.490,51.726057,5.307102,WIFI,False,0,"Maaspoortweg, 305",48833
48834,2022-12-30 23:57:46.600,51.726057,5.307102,WIFI,False,0,"Maaspoortweg, 305",48834


In [12]:
# Latitude of the row labelled 0 in the centroid table.
c.df_centroids.loc[0].latitude


51.726097656501345

In [None]:
# Add a placeholder row to the centroid table for noise points (label -1).
# NOTE(review): the value order must match the column order of
# c.df_centroids, which is not visible from this notebook - confirm.
# Re-running this cell adds another copy of the row.
noise_centroid = [0, 0, -1, 10, "black", "noise", 0, 0, 0]

# `.loc[label] = row` appends only when `label` is not yet in the index;
# otherwise it silently overwrites that row. len() is a free label only for a
# gap-free 0..n-1 index, so if it is already taken (e.g. after clusters were
# deleted) use the next label after the current maximum instead.
# Assumes an integer index - TODO confirm.
new_row_label = len(c.df_centroids)
if new_row_label in c.df_centroids.index:
    new_row_label = c.df_centroids.index.max() + 1
c.df_centroids.loc[new_row_label] = noise_centroid

In [10]:
# List the distinct values found in the `source` column.
c.df.source.unique()

array(['WIFI', 'GPS', 'UNKNOWN', 'CELL'], dtype=object)