In [2]:
import pandas as pd

In [3]:
# loc_code is the key the two tables are later joined on, and it is
# alphanumeric (values such as "7A" appear next to purely numeric ones such as
# "21"). Reading it explicitly as text in both files guarantees that the keys
# have the same type on both sides of the join instead of relying on inference.
loc_code_dtype = {"loc_code": str}
dwell_loc = pd.read_csv("data/loc_code_detection_patterns.csv", dtype=loc_code_dtype)
location = pd.read_csv("data/location_lat_long.csv", dtype=loc_code_dtype)

In [4]:
# Preview the first five dwell records.
dwell_loc.head(n=5)

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2


In [5]:
# Preview the first five rows of the location lookup table.
location.head(n=5)

Unnamed: 0,loc_code,antenna,latitude,longitude,subloc
0,11,11,48.486005,-123.548573,ds
1,12,12,48.486352,-123.548114,us
2,13,hpr,48.461638,-123.555528,us
3,201,01,48.772758,-123.713041,us
4,201,02,48.772758,-123.713041,us


In [6]:
# Attach the subloc label of each location code to every dwell record.
#
# `location` has one row per antenna, so the same loc_code can occur several
# times (e.g. 201). The lookup is de-duplicated *before* joining so the join
# does not fan out. Calling drop_duplicates() on the joined result instead
# would also throw away genuine dwell records that happen to share identical
# values (same tag, location, dates, dwell time and detection count).
subloc_lookup = (
    location[['loc_code', 'subloc']]
    .drop_duplicates()
    .set_index('loc_code')
)
tag_loc = dwell_loc.join(subloc_lookup, on='loc_code')
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3,ds
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1,ds
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2,ds
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2,mid
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2,us
...,...,...,...,...,...,...,...
36754,989.001043,44,2025-01-06,2025-01-06,0 days 04:53:32.120000,3,
36755,989.001043,45,2025-01-06,2025-01-07,0 days 14:40:47.490000,3,
36756,989.001043,44,2025-01-07,2025-01-07,0 days 09:10:03.680000,2,
36757,989.001043,45,2025-01-07,2025-01-08,0 days 14:35:53.630000,3,


1. The sequence representation (subloc) must be maintained - use the dates to preserve the temporal order
2. Transform the data - one-hot encode subloc and standard-scale the remaining features

In [7]:
# Express the dwell time (a timedelta string such as "0 days 00:02:05.290000")
# as a plain number of seconds.
dwell_as_timedelta = pd.to_timedelta(tag_loc['dwell_time'])
tag_loc['dwell_time_sec'] = dwell_as_timedelta.dt.total_seconds()

# Numeric code for the subloc label: 'us' -> 0, 'ds' -> 1.
# NOTE(review): any label missing from this mapping (e.g. 'mid', or a missing
# subloc) becomes NaN - confirm that this is intended.
subloc_codes = {'us': 0, 'ds': 1}
tag_loc['subloc_encoded'] = tag_loc['subloc'].map(subloc_codes)
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc,dwell_time_sec,subloc_encoded
0,989.001007,7A,2020-09-26,2020-09-26,0 days 00:02:05.290000,3,ds,125.29,1.0
1,989.001033,21,2020-09-27,2020-09-27,0 days 00:00:00,1,ds,0.00,1.0
2,989.001007,22,2020-10-03,2020-10-03,0 days 00:00:00,2,ds,0.00,1.0
3,989.001007,232,2020-10-23,2020-10-23,0 days 00:03:49.320000,2,mid,229.32,
4,989.001007,231,2020-10-23,2020-10-23,0 days 00:06:36.940000,2,us,396.94,0.0
...,...,...,...,...,...,...,...,...,...
36754,989.001043,44,2025-01-06,2025-01-06,0 days 04:53:32.120000,3,,17612.12,
36755,989.001043,45,2025-01-06,2025-01-07,0 days 14:40:47.490000,3,,52847.49,
36756,989.001043,44,2025-01-07,2025-01-07,0 days 09:10:03.680000,2,,33003.68,
36757,989.001043,45,2025-01-07,2025-01-08,0 days 14:35:53.630000,3,,52553.63,


In [8]:
# Order the records per tag and, within a tag, by start date so that each
# tag's rows form a chronological sequence.
sequence_order = ["tag_id", "start_date"]
tag_loc = tag_loc.sort_values(by=sequence_order)
tag_loc

Unnamed: 0,tag_id,loc_code,start_date,end_date,dwell_time,number_of_detections,subloc,dwell_time_sec,subloc_encoded
18958,989.001006,21,2024-05-30,2024-05-30,0 days 00:00:39.150000,6,ds,39.15,1.0
11029,989.001007,60,2023-09-05,2023-09-05,0 days 00:11:33.050000,42,,693.05,
11035,989.001007,6A,2023-09-05,2023-09-05,0 days 00:00:00,12,ds,0.00,1.0
11036,989.001007,6B,2023-09-05,2023-09-05,0 days 02:25:31.380000,72,us,8731.38,0.0
11050,989.001007,60,2023-09-05,2023-09-08,2 days 22:27:48.420000,50,,253668.42,
...,...,...,...,...,...,...,...,...,...
24864,989.002028,922,2024-10-12,2024-10-12,0 days 01:12:04.850000,24,us,4324.85,0.0
24874,989.002028,921,2024-10-12,2024-10-12,0 days 00:00:00,6,ds,0.00,1.0
25304,989.002028,922,2024-10-14,2024-10-14,0 days 00:00:00,6,us,0.00,0.0
25671,989.002028,9B,2024-10-15,2024-10-15,0 days 00:00:00,6,us,0.00,0.0


In [9]:
# Create sequences of detections
def _collect_tag_sequences(tag_rows):
    """Bundle the rows of one tag into a dict of three parallel lists."""
    return {
        "subloc_sequence": tag_rows["subloc_encoded"].tolist(),
        "detection_counts": tag_rows["number_of_detections"].tolist(),
        "dwell_times": tag_rows["dwell_time_sec"].tolist(),
    }


# One entry per tag_id; after reset_index() the dicts sit in a column labelled 0.
sequences_by_tag = tag_loc.groupby("tag_id").apply(_collect_tag_sequences)
sequence_data = sequences_by_tag.reset_index()

print(sequence_data)

          tag_id                                                  0
0     989.001006  {'subloc_sequence': [1.0], 'detection_counts':...
1     989.001007  {'subloc_sequence': [nan, 1.0, 0.0, nan], 'det...
2     989.001007  {'subloc_sequence': [1.0, 1.0, 0.0], 'detectio...
3     989.001007  {'subloc_sequence': [1.0, 0.0, 0.0, 1.0], 'det...
4     989.001007  {'subloc_sequence': [nan, 1.0, 0.0], 'detectio...
...          ...                                                ...
6781  989.002028  {'subloc_sequence': [nan], 'detection_counts':...
6782  989.002028  {'subloc_sequence': [nan, 0.0, 1.0, 0.0], 'det...
6783  989.002028  {'subloc_sequence': [nan, 1.0, 0.0, 1.0, 0.0],...
6784  989.002028  {'subloc_sequence': [nan], 'detection_counts':...
6785  989.002028  {'subloc_sequence': [nan, 0.0, 1.0, 0.0, 0.0, ...

[6786 rows x 2 columns]


In [10]:
# Convert into a dataframe (for better viewing): unpack each key of the dict
# stored in column 0 into a column of its own, then drop the dict column.
sequence_df = pd.DataFrame(sequence_data)
for field in ("subloc_sequence", "detection_counts", "dwell_times"):
    sequence_df[field] = sequence_df[0].apply(lambda record, field=field: record[field])
sequence_df = sequence_df.drop(columns=[0])

sequence_df

Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times
0,989.001006,[1.0],[6],[39.15]
1,989.001007,"[nan, 1.0, 0.0, nan]","[42, 12, 72, 50]","[693.05, 0.0, 8731.38, 253668.42]"
2,989.001007,"[1.0, 1.0, 0.0]","[40, 4, 4]","[1043247.59, 0.0, 0.0]"
3,989.001007,"[1.0, 0.0, 0.0, 1.0]","[1, 6, 14, 7]","[0.0, 931.48, 790.26, 96064.64]"
4,989.001007,"[nan, 1.0, 0.0]","[30, 12, 36]","[68.92, 0.0, 425.22]"
...,...,...,...,...
6781,989.002028,[nan],[2],[86310.19]
6782,989.002028,"[nan, 0.0, 1.0, 0.0]","[2, 6, 6, 6]","[0.0, 0.0, 0.0, 0.0]"
6783,989.002028,"[nan, 1.0, 0.0, 1.0, 0.0]","[1, 6, 12, 12, 12]","[0.0, 0.0, 157.54, 1996.62, 68.71]"
6784,989.002028,[nan],[1],[0.0]


Save the dataset for manual inspection and analysis

In [11]:
# Write the per-tag sequences to disk without the row index; floats are
# formatted with up to 15 significant digits.
sequence_df.to_csv(
    "data/sequence_df.csv",
    float_format='%.15g',
    index=False,
)

In [12]:
# Removing all NaN (null) values
def contains_nan(cell):
    """Tell whether a cell is missing or, for a list cell, holds a missing value.

    Parameters
    ----------
    cell : list or scalar
        A single DataFrame cell.

    Returns
    -------
    bool
        For a list: True if at least one element is NaN/None.
        For anything else: the result of ``pd.isna(cell)``.
    """
    if not isinstance(cell, list):
        return pd.isna(cell)
    # Stop at the first missing element.
    for item in cell:
        if pd.isna(item):
            return True
    return False

# Keep only the rows in which none of the three list columns holds a missing
# value. The check is applied column by column with Series.map because
# DataFrame.applymap is deprecated and emits a FutureWarning.
sequence_columns = ['subloc_sequence', 'detection_counts', 'dwell_times']
has_missing = sequence_df[sequence_columns].apply(lambda column: column.map(contains_nan))
clean_sequence = sequence_df[~has_missing.any(axis=1)]
clean_sequence

  clean_sequence = sequence_df[~sequence_df[['subloc_sequence', 'detection_counts', 'dwell_times']].applymap(contains_nan).any(axis=1)]


Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times
0,989.001006,[1.0],[6],[39.15]
2,989.001007,"[1.0, 1.0, 0.0]","[40, 4, 4]","[1043247.59, 0.0, 0.0]"
3,989.001007,"[1.0, 0.0, 0.0, 1.0]","[1, 6, 14, 7]","[0.0, 931.48, 790.26, 96064.64]"
5,989.001007,"[1.0, 0.0]","[2, 2]","[0.0, 0.0]"
6,989.001007,"[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[30, 4, 2, 4, 20, 6, 6, 6, 6, 18, 4, 10, 6, 6,...","[4459266.45, 1520.36, 0.0, 18800.02, 18782.29,..."
...,...,...,...,...
6757,989.002028,"[1.0, 0.0, 0.0, 1.0]","[3, 9, 3, 3]","[0.0, 4173.13, 0.0, 0.0]"
6758,989.002028,"[1.0, 0.0, 0.0, 1.0]","[14, 7, 7, 7]","[0.16, 0.0, 0.0, 0.0]"
6761,989.002028,"[0.0, 1.0, 0.0]","[5, 5, 5]","[0.0, 0.0, 0.0]"
6767,989.002028,"[0.0, 1.0]","[3, 3]","[0.0, 0.0]"


The goal is to create a similarity score of sorts. The magnitude of the score should tell us about the relative movement of the tag. Computing it requires multiplication. Our dataset contains 0s that carry meaning (in subloc, 0 means upstream; in dwell_times, it means that the tag did not spend much time there), and multiplying by 0 would erase those factors. So we replace 0 with a near-zero value, such as a small negative power of 10 (0.01, for example), which keeps the essence of the value without losing the factor.

In [13]:
def replace_zeros(cell, *, replacement=0.01):
    """Swap every zero inside a list cell for a small non-zero stand-in.

    Zeros would wipe out a term when the values are multiplied (or break a
    division), so they are replaced by a near-zero value.

    Parameters
    ----------
    cell : list or any
        A single DataFrame cell. Only lists are processed.
    replacement : float, keyword-only, default 0.01
        Value substituted for each element equal to 0.

    Returns
    -------
    list or any
        A new list with zeros replaced; non-list cells are returned unchanged.
    """
    if not isinstance(cell, list):
        return cell
    return [replacement if value == 0 else value for value in cell]

columns_to_replace = ['subloc_sequence', 'detection_counts', 'dwell_times']

# Work on a copy so clean_sequence keeps the original zeros.
sequence_updated = clean_sequence.copy()

# Apply replace_zeros cell by cell. Series.map per column is used because
# DataFrame.applymap is deprecated and emits a FutureWarning.
sequence_updated[columns_to_replace] = sequence_updated[columns_to_replace].apply(
    lambda column: column.map(replace_zeros)
)
sequence_updated

  sequence_updated[columns_to_replace] = sequence_updated[columns_to_replace].applymap(replace_zeros)


Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times
0,989.001006,[1.0],[6],[39.15]
2,989.001007,"[1.0, 1.0, 0.01]","[40, 4, 4]","[1043247.59, 0.01, 0.01]"
3,989.001007,"[1.0, 0.01, 0.01, 1.0]","[1, 6, 14, 7]","[0.01, 931.48, 790.26, 96064.64]"
5,989.001007,"[1.0, 0.01]","[2, 2]","[0.01, 0.01]"
6,989.001007,"[0.01, 1.0, 0.01, 1.0, 0.01, 1.0, 0.01, 1.0, 0...","[30, 4, 2, 4, 20, 6, 6, 6, 6, 18, 4, 10, 6, 6,...","[4459266.45, 1520.36, 0.01, 18800.02, 18782.29..."
...,...,...,...,...
6757,989.002028,"[1.0, 0.01, 0.01, 1.0]","[3, 9, 3, 3]","[0.01, 4173.13, 0.01, 0.01]"
6758,989.002028,"[1.0, 0.01, 0.01, 1.0]","[14, 7, 7, 7]","[0.16, 0.01, 0.01, 0.01]"
6761,989.002028,"[0.01, 1.0, 0.01]","[5, 5, 5]","[0.01, 0.01, 0.01]"
6767,989.002028,"[0.01, 1.0]","[3, 3]","[0.01, 0.01]"


Similarity score:
1. Dividing the dwell time by the detection count gives the time per detection, in seconds.
2. Multiply that by the subloc value to inflate or deflate the similarity score.

In [14]:
# Calculation of similarity score, worked through for the row labelled 2:
# each step contributes subloc * (dwell time / detection count).
example_sublocs = sequence_updated['subloc_sequence'][2]
example_counts = sequence_updated['detection_counts'][2]
example_dwells = sequence_updated['dwell_times'][2]
result = [
    subloc * (dwell / count)
    for subloc, count, dwell in zip(example_sublocs, example_counts, example_dwells)
]

sum(result)


26081.192274999998

In [15]:
def compute_similarity(row):
    """Sum ``subloc * (dwell_time / detection_count)`` over a row's sequences.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'subloc_sequence', 'detection_counts' and
        'dwell_times', each holding a sequence; they are walked in parallel.

    Returns
    -------
    number
        The sum of the per-position terms (0 for empty sequences).
    """
    sublocs = row['subloc_sequence']
    counts = row['detection_counts']
    dwells = row['dwell_times']
    weighted_terms = [
        subloc * (dwell / count)
        for subloc, count, dwell in zip(sublocs, counts, dwells)
    ]
    return sum(weighted_terms)
    
# Score every row, then store the scores in a new column.
row_scores = sequence_updated.apply(compute_similarity, axis=1)
sequence_updated['similarity_score'] = row_scores
sequence_updated

Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times,similarity_score
0,989.001006,[1.0],[6],[39.15],6.525000
2,989.001007,"[1.0, 1.0, 0.01]","[40, 4, 4]","[1043247.59, 0.01, 0.01]",26081.192275
3,989.001007,"[1.0, 0.01, 0.01, 1.0]","[1, 6, 14, 7]","[0.01, 931.48, 790.26, 96064.64]",13725.646938
5,989.001007,"[1.0, 0.01]","[2, 2]","[0.01, 0.01]",0.005050
6,989.001007,"[0.01, 1.0, 0.01, 1.0, 0.01, 1.0, 0.01, 1.0, 0...","[30, 4, 2, 4, 20, 6, 6, 6, 6, 18, 4, 10, 6, 6,...","[4459266.45, 1520.36, 0.01, 18800.02, 18782.29...",132197.322424
...,...,...,...,...,...
6757,989.002028,"[1.0, 0.01, 0.01, 1.0]","[3, 9, 3, 3]","[0.01, 4173.13, 0.01, 0.01]",4.643511
6758,989.002028,"[1.0, 0.01, 0.01, 1.0]","[14, 7, 7, 7]","[0.16, 0.01, 0.01, 0.01]",0.012886
6761,989.002028,"[0.01, 1.0, 0.01]","[5, 5, 5]","[0.01, 0.01, 0.01]",0.002040
6767,989.002028,"[0.01, 1.0]","[3, 3]","[0.01, 0.01]",0.003367


In [16]:
# Rank the sequences from the highest to the lowest similarity score.
sorted_sequence = sequence_updated.sort_values('similarity_score', ascending=False)
sorted_sequence

Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times,similarity_score
5369,989.001042,"[1.0, 0.01]","[3, 4]","[74414242.09, 153.51]",2.480475e+07
1592,989.001039,"[1.0, 0.01]","[3, 3]","[49457490.48, 0.01]",1.648583e+07
1612,989.001039,"[0.01, 1.0, 0.01]","[3, 3, 6]","[0.01, 46623226.53, 1482.08]",1.554108e+07
1609,989.001039,"[0.01, 1.0, 0.01]","[3, 3, 39]","[0.01, 45061095.24, 2811238.57]",1.502109e+07
3200,989.001041,"[1.0, 0.01, 1.0, 0.01]","[4, 3, 2, 2]","[48331116.81, 0.01, 0.01, 0.01]",1.208278e+07
...,...,...,...,...,...
1445,989.001039,[0.01],[12],[0.01],8.333333e-06
1378,989.001039,[0.01],[12],[0.01],8.333333e-06
5899,989.001043,[0.01],[13],[0.01],7.692308e-06
177,989.001034,[0.01],[13],[0.01],7.692308e-06


The scores are huge for long dwell times, so we divide the score by 86400 (60 x 60 x 24) to convert the time from seconds to days.

In [17]:
# Rescale the score from seconds to days.
seconds_per_day = 60 * 60 * 24  # 86400
sorted_sequence['similarity(in_days)'] = sorted_sequence['similarity_score'].div(seconds_per_day)
sorted_sequence

Unnamed: 0,tag_id,subloc_sequence,detection_counts,dwell_times,similarity_score,similarity(in_days)
5369,989.001042,"[1.0, 0.01]","[3, 4]","[74414242.09, 153.51]",2.480475e+07,2.870920e+02
1592,989.001039,"[1.0, 0.01]","[3, 3]","[49457490.48, 0.01]",1.648583e+07,1.908082e+02
1612,989.001039,"[0.01, 1.0, 0.01]","[3, 3, 6]","[0.01, 46623226.53, 1482.08]",1.554108e+07,1.798736e+02
1609,989.001039,"[0.01, 1.0, 0.01]","[3, 3, 39]","[0.01, 45061095.24, 2811238.57]",1.502109e+07,1.738552e+02
3200,989.001041,"[1.0, 0.01, 1.0, 0.01]","[4, 3, 2, 2]","[48331116.81, 0.01, 0.01, 0.01]",1.208278e+07,1.398470e+02
...,...,...,...,...,...,...
1445,989.001039,[0.01],[12],[0.01],8.333333e-06,9.645062e-11
1378,989.001039,[0.01],[12],[0.01],8.333333e-06,9.645062e-11
5899,989.001043,[0.01],[13],[0.01],7.692308e-06,8.903134e-11
177,989.001034,[0.01],[13],[0.01],7.692308e-06,8.903134e-11


In [18]:
# Write the ranked similarity table to disk without the row index; floats are
# formatted with up to 15 significant digits.
sorted_sequence.to_csv(
    "data/sequence_similarity.csv",
    float_format='%.15g',
    index=False,
)