In [1]:
import pandas as pd

In [2]:
dwell_loc = pd.read_csv("data/loc_code_detection_patterns.csv")
location = pd.read_csv("data/location_lat_long.csv")

In [3]:
dwell_loc.head()

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2


In [4]:
location.head()

Unnamed: 0,loc_code,antenna,latitude,longitude,subloc
0,11,11,48.486005,-123.548573,ds
1,12,12,48.486352,-123.548114,us
2,13,hpr,48.461638,-123.555528,us
3,201,01,48.772758,-123.713041,us
4,201,02,48.772758,-123.713041,us


In [5]:
tag_loc = dwell_loc.join(location[['subloc','loc_code']].set_index('loc_code'), on = 'loc_code').drop_duplicates()
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3,ds
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1,ds
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2,ds
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2,mid
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2,us
...,...,...,...,...,...,...,...
36754,989.001043,44,2025-01-06,2025-01-06,0 days 04:53:32.120000,3,
36755,989.001043,45,2025-01-06,2025-01-07,0 days 14:40:47.490000,3,
36756,989.001043,44,2025-01-07,2025-01-07,0 days 09:10:03.680000,2,
36757,989.001043,45,2025-01-07,2025-01-08,0 days 14:35:53.630000,3,


1. Sequence representation must be maintained (subloc) - use date to keep temporal nature
2. Transform data - subloc(one-hot) and standard scale the rest

In [6]:
tag_loc['dwell_time_sec'] = pd.to_timedelta(tag_loc['dwell_time']).dt.total_seconds()
tag_loc['subloc_encoded'] = tag_loc['subloc'].map({'us': 0, 'ds': 1})
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc,dwell_time_sec,subloc_encoded
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3,ds,125.29,1.0
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1,ds,0.00,1.0
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2,ds,0.00,1.0
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2,mid,229.32,
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2,us,396.94,0.0
...,...,...,...,...,...,...,...,...,...
36754,989.001043,44,2025-01-06,2025-01-06,0 days 04:53:32.120000,3,,17612.12,
36755,989.001043,45,2025-01-06,2025-01-07,0 days 14:40:47.490000,3,,52847.49,
36756,989.001043,44,2025-01-07,2025-01-07,0 days 09:10:03.680000,2,,33003.68,
36757,989.001043,45,2025-01-07,2025-01-08,0 days 14:35:53.630000,3,,52553.63,


In [7]:
# Sort tag_loc to keep the sequence
tag_loc = tag_loc.sort_values(by=["tag_id", "start_date"])
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc,dwell_time_sec,subloc_encoded
18958,989.001006,21,2024-05-30,2024-05-30,0 days 00:00:39.150000,6,ds,39.15,1.0
11029,989.001007,60,2023-09-05,2023-09-05,0 days 00:11:33.050000,42,,693.05,
11035,989.001007,6A,2023-09-05,2023-09-05,0 days 00:00:00,12,ds,0.00,1.0
11036,989.001007,6B,2023-09-05,2023-09-05,0 days 02:25:31.380000,72,us,8731.38,0.0
11050,989.001007,60,2023-09-05,2023-09-08,2 days 22:27:48.420000,50,,253668.42,
...,...,...,...,...,...,...,...,...,...
24864,989.002028,922,2024-10-12,2024-10-12,0 days 01:12:04.850000,24,us,4324.85,0.0
24874,989.002028,921,2024-10-12,2024-10-12,0 days 00:00:00,6,ds,0.00,1.0
25304,989.002028,922,2024-10-14,2024-10-14,0 days 00:00:00,6,us,0.00,0.0
25671,989.002028,9B,2024-10-15,2024-10-15,0 days 00:00:00,6,us,0.00,0.0


In [8]:
# Create sequences of detections
sequence_data = tag_loc.groupby("tag_id").apply(
    lambda x: {
        "subloc_sequence": x["subloc_encoded"].tolist(),
        "detection_counts": x["number_of_detections"].tolist(),
        "dwell_times": x["dwell_time_sec"].tolist(),
    }
).reset_index()

print(sequence_data)

          tag_id                                                  0
0     989.001006  {'subloc_sequence': [1.0], 'detection_counts':...
1     989.001007  {'subloc_sequence': [nan, 1.0, 0.0, nan], 'det...
2     989.001007  {'subloc_sequence': [1.0, 1.0, 0.0], 'detectio...
3     989.001007  {'subloc_sequence': [1.0, 0.0, 0.0, 1.0], 'det...
4     989.001007  {'subloc_sequence': [nan, 1.0, 0.0], 'detectio...
...          ...                                                ...
6781  989.002028  {'subloc_sequence': [nan], 'detection_counts':...
6782  989.002028  {'subloc_sequence': [nan, 0.0, 1.0, 0.0], 'det...
6783  989.002028  {'subloc_sequence': [nan, 1.0, 0.0, 1.0, 0.0],...
6784  989.002028  {'subloc_sequence': [nan], 'detection_counts':...
6785  989.002028  {'subloc_sequence': [nan, 0.0, 1.0, 0.0, 0.0, ...

[6786 rows x 2 columns]


In [None]:
# Convert into a dataframe (for better viewing)
sequence_df = pd.DataFrame(sequence_data)
sequence_df["subloc_sequence"] = sequence_df[0].apply(lambda x: x["subloc_sequence"])
sequence_df["detection_counts"] = sequence_df[0].apply(lambda x: x["detection_counts"])
sequence_df["dwell_times"] = sequence_df[0].apply(lambda x: x["dwell_times"])
sequence_df = sequence_df.drop(columns=[0])

sequence_df

Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times
0,989.001006,[1.0],[6],[39.15]
1,989.001007,"[nan, 1.0, 0.0, nan]","[42, 12, 72, 50]","[693.05, 0.0, 8731.38, 253668.42]"
2,989.001007,"[1.0, 1.0, 0.0]","[40, 4, 4]","[1043247.59, 0.0, 0.0]"
3,989.001007,"[1.0, 0.0, 0.0, 1.0]","[1, 6, 14, 7]","[0.0, 931.48, 790.26, 96064.64]"
4,989.001007,"[nan, 1.0, 0.0]","[30, 12, 36]","[68.92, 0.0, 425.22]"
...,...,...,...,...
6781,989.002028,[nan],[2],[86310.19]
6782,989.002028,"[nan, 0.0, 1.0, 0.0]","[2, 6, 6, 6]","[0.0, 0.0, 0.0, 0.0]"
6783,989.002028,"[nan, 1.0, 0.0, 1.0, 0.0]","[1, 6, 12, 12, 12]","[0.0, 0.0, 157.54, 1996.62, 68.71]"
6784,989.002028,[nan],[1],[0.0]


Unsupervised clustering:

problem is that some sequences have 6k+ detection events + bugs, try a manual approach

In [20]:
# Combine sequences into weighted arrays

import numpy as np

def combine_features(row):
    return np.array(row["subloc_sequence"]) * np.array(row["detection_counts"]) + np.array(row["dwell_times"])

# Define a fixed sequence length
fixed_length = 10

# Truncate or pad each sequence to the fixed length
def truncate_or_pad(seq, length):
    return seq[:length] if len(seq) > length else seq + [0] * (length - len(seq))

sequence_df["combined_features"] = sequence_df["combined_features"].apply(lambda x: truncate_or_pad(x, fixed_length))

# Convert to NumPy array
time_series = np.array(sequence_df["combined_features"].tolist())
scaler = TimeSeriesScalerMeanVariance()
scaled_series = scaler.fit_transform(time_series)

# Apply K-Means with DTW
model = TimeSeriesKMeans(n_clusters=2, metric="dtw", random_state=42)
sequence_df["cluster"] = model.fit_predict(scaled_series)

# Display results
print(sequence_df[["tag_id", "cluster"]])

ValueError: operands could not be broadcast together with shapes (4,) (6,) 