In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

import os
import json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Read every raw trip file (one JSON object per line) into a flat list of records.
trips_dir = '../data/json/trips/'
data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore which duplicate survives drop_duplicates) reproducible.
for filename in sorted(os.listdir(trips_dir)):
    with open(os.path.join(trips_dir, filename)) as f:
        # Blank lines would make json.loads raise, so they are skipped.
        data.extend(json.loads(line) for line in f if line.strip())

# Keep only the columns used downstream and drop exact duplicate trips.
# Working on the result of drop_duplicates() (a new frame) instead of mutating
# a column slice in place avoids pandas' chained-assignment warnings.
df = pd.DataFrame(data)[['playerId', 'startTime', 'TerritoryId', 'modeType', 'distance']]
df = df.drop_duplicates()
df['TerritoryId'] = df['TerritoryId'].replace({'L': 'Lecco'})  # 'L' is shorthand for Lecco
df['startTime'] = pd.to_datetime(df['startTime'])
df['distance'] = df['distance'].astype(int)

# NOTE(review): `time_id` (the position of each startTime among the sorted
# unique timestamps) is not computed here any more, although the next cell
# sorts features.parquet by that column. If it is needed again, a vectorised
# equivalent of the old per-row np.where lookup (for non-null timestamps) is:
#   df['time_id'] = df['startTime'].rank(method='dense').astype('int64') - 1

df.to_parquet('../data/parquet/trips.parquet', index=False, compression='gzip')
df.sample(10)

  0%|          | 0/67592 [00:00<?, ?it/s]

Unnamed: 0,playerId,startTime,TerritoryId,modeType,distance,time_id
5390,u_ea2c27c00c9b4a42b437c011b33c88aa,2023-09-13 09:15:50.891,Ferrara,train,23781,65242
16035,u_83191d0787c94b2180ddacfff8f1a724,2023-07-19 03:52:42.343,Ferrara,walk,1874,54597
24928,u_caafbc4e-b461-4880-ba8d-5f939deb963b,2023-06-07 10:35:57.825,Ferrara,bike,2520,45704
59560,u_8f20f2024b9e491cb01509df6147dd5f,2023-03-11 11:01:23.035,Lecco,walk,537,8610
18220,u_6e48c3e404b74b87802d70ced9ad086e,2023-07-07 14:07:28.030,Ferrara,walk,370,52412
27130,u_6c33255c-7892-4e97-b2a2-202457e71f33,2023-05-29 04:12:20.014,Ferrara,bike,6536,43338
57056,u_fe75e3dcba864c8e87f45e145bc82434,2023-03-16 06:28:03.518,Lecco,bus,3518,11114
60196,u_d124fa5ebf4347c29898d2d75bce5438,2023-03-10 11:57:34.394,Lecco,walk,603,7974
3047,u_b227c40c79c4452da98952e6ffb08d71,2023-09-24 19:56:41.274,Ferrara,bike,721,67585
4172,u_caafbc4e-b461-4880-ba8d-5f939deb963b,2023-09-19 12:34:11.935,Ferrara,bike,2523,66460


In [3]:
# Load the per-trip feature table and order it chronologically within each
# player (time_id is used as the ordering key).
features_path = '../data/parquet/features.parquet'
raw_data = pd.read_parquet(features_path)
raw_data['startTime'] = pd.to_datetime(raw_data['startTime'])
raw_data = raw_data.sort_values(by=['playerId', 'time_id'])

print('Dataset shape:', raw_data.shape)

Dataset shape: (67592, 6)


In [4]:
SEQUENCE_LENGTH = 30
def gen_sequence(x, sequence_length=None):
    """Split one ordered sequence of values into training sub-sequences.

    Args:
        x: Sliceable sequence (e.g. a list) of values in chronological order.
        sequence_length: Window size. When omitted, the module-level
            ``SEQUENCE_LENGTH`` is used (looked up at call time).

    Returns:
        A list of slices of ``x``:

        * if ``x`` has fewer than ``sequence_length`` items, every non-empty
          prefix ``x[:1], x[:2], ..., x[:len(x)]``;
        * otherwise every contiguous window of exactly ``sequence_length``
          items, advancing one item at a time.

        An empty ``x`` yields an empty list.

    Raises:
        ValueError: If the window size is smaller than 1.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        # A zero/negative window would silently produce empty sequences.
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    size = len(x)
    if size < sequence_length:
        # Too short for a full window: emit growing prefixes instead.
        return [x[:end] for end in range(1, size + 1)]
    return [x[start:start + sequence_length] for start in range(size - sequence_length + 1)]

# Collapse the trip table to one row per player; every other column becomes
# the list of that player's values, in the order of raw_data.
data_by_player = (
    raw_data
    .groupby('playerId')
    .agg(list)
    .reset_index()
)
# Keep a single territory per player: the first value in the list.
data_by_player['TerritoryId'] = data_by_player['TerritoryId'].apply(lambda territories: territories[0])
print('Grouped dataset shape:', data_by_player.shape)

# Replace each per-player list with the list of sub-sequences built by gen_sequence.
columns = ['startTime', 'modeType', 'distance', 'time_id']
for col in columns:
    data_by_player[col] = data_by_player[col].map(gen_sequence)
display(data_by_player.head())

Grouped dataset shape: (914, 6)


Unnamed: 0,playerId,startTime,TerritoryId,modeType,distance,time_id
0,u_00144002f1614ee9a45f7822760e3746,[[2023-03-04 08:43:33.003000]],Lecco,[[walk]],[[1206]],[[4220]]
1,u_00567a7bce8c4d09bea7db9bae375af4,"[[2023-02-27 11:59:34.139000, 2023-02-28 12:32...",Lecco,"[[walk, walk, bus, walk, walk, walk, walk, wal...","[[1048, 448, 6484, 487, 1020, 492, 1807, 1019,...","[[233, 1055, 1868, 1938, 5314, 7387, 8072, 871..."
2,u_013177350075415aa939d81131f8d0a0,"[[2023-02-27 09:33:29.735000, 2023-02-27 12:54...",Lecco,"[[walk, walk, walk, walk, walk, walk, walk, wa...","[[1263, 1192, 891, 1251, 602, 842, 1242, 231, ...","[[141, 402, 838, 2844, 3060, 3419, 5144, 5380,..."
3,u_0160059b315d4a9087e16cc31f7c7695,"[[2023-07-06 13:24:49.604000], [2023-07-06 13:...",Ferrara,"[[bike], [bike, bike], [bike, bike, bike], [bi...","[[7023], [7023, 5729], [7023, 5729, 6982], [70...","[[52134], [52134, 52784], [52134, 52784, 63231..."
4,u_016cbcbd4f8f4fc18aa4b322b77ed603,"[[2023-03-01 07:28:15.209000, 2023-03-01 08:12...",Lecco,"[[walk, walk, walk, walk, walk, walk, walk, wa...","[[2943, 2863, 544, 767, 782, 1060, 536, 1315, ...","[[1665, 1699, 1824, 2213, 2452, 2583, 3024, 33..."


In [5]:
# Explode the per-player lists of sequences so that each row holds exactly one
# sequence, tag every row with a running sequenceId, and drop the player id.
df = (
    data_by_player
    .set_index(['playerId', 'TerritoryId'])
    .apply(pd.Series.explode)
    .reset_index()
    .assign(sequenceId=lambda frame: frame.index)
    .drop(columns=['playerId'])
)
display(df.head(20))
print('Oversampled dataset shape:', df.shape)

Unnamed: 0,TerritoryId,startTime,modeType,distance,time_id,sequenceId
0,Lecco,[2023-03-04 08:43:33.003000],[walk],[1206],[4220],0
1,Lecco,"[2023-02-27 11:59:34.139000, 2023-02-28 12:32:...","[walk, walk, bus, walk, walk, walk, walk, walk...","[1048, 448, 6484, 487, 1020, 492, 1807, 1019, ...","[233, 1055, 1868, 1938, 5314, 7387, 8072, 8713...",1
2,Lecco,"[2023-02-27 09:33:29.735000, 2023-02-27 12:54:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[1263, 1192, 891, 1251, 602, 842, 1242, 231, 8...","[141, 402, 838, 2844, 3060, 3419, 5144, 5380, ...",2
3,Lecco,"[2023-02-27 12:54:19.084000, 2023-02-28 09:42:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[1192, 891, 1251, 602, 842, 1242, 231, 848, 11...","[402, 838, 2844, 3060, 3419, 5144, 5380, 5914,...",3
4,Lecco,"[2023-02-28 09:42:31.033000, 2023-03-02 12:54:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[891, 1251, 602, 842, 1242, 231, 848, 1101, 11...","[838, 2844, 3060, 3419, 5144, 5380, 5914, 7230...",4
5,Lecco,"[2023-03-02 12:54:52.477000, 2023-03-02 18:53:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[1251, 602, 842, 1242, 231, 848, 1101, 1191, 3...","[2844, 3060, 3419, 5144, 5380, 5914, 7230, 745...",5
6,Lecco,"[2023-03-02 18:53:54.868000, 2023-03-03 09:13:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[602, 842, 1242, 231, 848, 1101, 1191, 309, 25...","[3060, 3419, 5144, 5380, 5914, 7230, 7453, 780...",6
7,Lecco,"[2023-03-03 09:13:29.826000, 2023-03-06 09:17:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[842, 1242, 231, 848, 1101, 1191, 309, 2527, 1...","[3419, 5144, 5380, 5914, 7230, 7453, 7808, 813...",7
8,Lecco,"[2023-03-06 09:17:14.039000, 2023-03-06 12:47:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[1242, 231, 848, 1101, 1191, 309, 2527, 1293, ...","[5144, 5380, 5914, 7230, 7453, 7808, 8132, 849...",8
9,Lecco,"[2023-03-06 12:47:31.389000, 2023-03-07 09:37:...","[walk, walk, walk, walk, walk, walk, walk, wal...","[231, 848, 1101, 1191, 309, 2527, 1293, 1018, ...","[5380, 5914, 7230, 7453, 7808, 8132, 8498, 890...",9


Oversampled dataset shape: (54948, 6)


In [6]:
# Explode once more so that every row is a single trip of a sequence.
# set_index returns a new frame, so `df` itself is left untouched.
df1 = (
    df
    .set_index(['TerritoryId', 'sequenceId'])
    .apply(pd.Series.explode)
    .reset_index()
)
print(df1.shape)

(1547602, 6)


In [7]:
from src.serializer_visitor import SerializerVisitor
from src.feature import SimpleFeature, CompositeFeature

class PromptT5Serializer(SerializerVisitor):
    """Visitor that renders a player and their trips as a text prompt."""

    def serializeCompositeFeature(self, feature: CompositeFeature, *args, **kwargs):
        """Serialize a composite feature.

        A feature exposing an ``is_player`` attribute is rendered as a player
        (city plus trip list); any other composite is rendered as one trip.
        """
        is_player = hasattr(feature, 'is_player')
        if not is_player:
            return self.__serializeTrip(feature)
        return self.__serializePlayer(feature)

    def __serializePlayer(self, player: CompositeFeature):
        """Render the player's city followed by the comma-separated trips."""
        city = player.location.serialize(self)  # type: ignore
        locality = f"City: {city}"
        rendered_trips = [trip.serialize(self) for trip in player.trips.children]  # type: ignore
        trips = ', '.join(rendered_trips)

        # An empty joined string means there was nothing to render.
        if not trips:
            return f"{locality}. No trips."
        return f"{locality}. Trips: {trips}."

    def __serializeTrip(self, trip: CompositeFeature):
        """Render one trip as ``(Time: ... Vehicle: ... Distance: ...)``."""
        time_id = trip.time_id.serialize(self)
        vehicle = trip.vehicle.serialize(self)
        distance = trip.distance.serialize(self)
        return f"(Time: {time_id}. Vehicle: {vehicle}. Distance: {distance})"

class TargetPromptSerializer(SerializerVisitor):
    """Visitor that renders only the vehicle of a trip (the prediction target)."""

    def serializeCompositeFeature(self, feature, *args, **kwargs):
        """Return the serialized ``vehicle`` child of ``feature`` as text."""
        vehicle = feature.vehicle.serialize(self)
        return f"{vehicle}"

In [8]:
def serializeFeature(df: pd.Series):
    """Turn one sequence row into a ``(text, target)`` pair.

    Despite the parameter name, ``df`` is a single row: the function is used
    with ``DataFrame.apply(..., axis=1)``, so each call receives a Series.
    Its 'modeType', 'distance', 'startTime' and 'time_id' entries are parallel
    sequences (zipped together, one element per trip) and its 'TerritoryId'
    entry is used as the player's location.

    The last trip of the sequence is held out as the target; all earlier
    trips become the player's history in the prompt.

    Returns:
        A tuple ``(text, target)``: the player serialized with
        ``PromptT5Serializer`` and the held-out trip serialized with
        ``TargetPromptSerializer``.

    Raises:
        IndexError: If the row contains no trips, so there is no target.
    """
    trips = [
        CompositeFeature(
            vehicle=SimpleFeature(vehicle),
            distance=SimpleFeature(distance),
            startTime=SimpleFeature(start_time),
            time_id=SimpleFeature(time_id),
        )
        for vehicle, distance, start_time, time_id in zip(
            df['modeType'], df['distance'], df['startTime'], df['time_id']
        )
    ]
    if not trips:
        # Same exception type as the bare trips[-1] lookup, with a usable message.
        raise IndexError("cannot serialize a sequence with no trips: no target available")

    # Hold out the most recent trip as the label; the rest is the context.
    target = trips[-1]
    history = trips[:-1]

    player = CompositeFeature(
        is_player=SimpleFeature(True),
        location=SimpleFeature(df['TerritoryId']),
        trips=CompositeFeature(history),
    )
    return player.serialize(PromptT5Serializer()), target.serialize(TargetPromptSerializer())

# Serialize every sequence row into a (text, target) pair, showing a tqdm
# progress bar, then collect the pairs into a two-column frame.
pairs = df.progress_apply(serializeFeature, axis=1)
dataset = pd.DataFrame(pairs.tolist(), columns=['text', 'target'])

# Persist the prompt/target pairs for later use.
dataset.to_parquet('../data/parquet/dataset.parquet', index=False, compression='gzip')

dataset.head()

  0%|          | 0/54948 [00:00<?, ?it/s]

Unnamed: 0,text,target
0,City: Lecco. No trips.,walk
1,City: Lecco. Trips: (Time: 233. Vehicle: walk....,walk
2,City: Lecco. Trips: (Time: 141. Vehicle: walk....,walk
3,City: Lecco. Trips: (Time: 402. Vehicle: walk....,walk
4,City: Lecco. Trips: (Time: 838. Vehicle: walk....,walk
