In [7]:
# Standard library
import sys

# Third-party
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Add the directory two levels up to the module search path
# (presumably so project-local packages can be imported -- TODO confirm).
sys.path.append('../../')

# Register tqdm's progress-bar variants of pandas methods (e.g. `progress_apply`).
tqdm.pandas()

In [8]:
# Load the raw trip log and normalise the dtypes of the columns used as keys.
# The sort is kept because it fixes the row order of `df_cities` below.
trips_raw = (
    pd.read_parquet('../../out/parquet/trips.parquet')
    .assign(
        playerId=lambda frame: frame['playerId'].astype(str),
        TerritoryId=lambda frame: frame['TerritoryId'].astype(str),
        startTime=lambda frame: pd.to_datetime(frame['startTime']),
    )
    .sort_values(by=['playerId', 'startTime'], ignore_index=True)
)

# Distinct (playerId, TerritoryId) pairs, in order of first appearance in the sorted log.
df_cities = trips_raw[['playerId', 'TerritoryId']].drop_duplicates().reset_index(drop=True)

# One row per (playerId, week) with one column per transport mode.
# - `freq='W'` bins by calendar week; pandas anchors these bins on Sunday and
#   labels each bin with its closing date.
# - The per-(player, mode, week) total is floor-divided by 1000
#   (presumably metres -> whole kilometres; TODO confirm the unit of `distance`).
# - `unstack` pivots the modes into columns by *name*, so no positional column
#   slicing is needed; (player, week, mode) combinations without trips become 0.
df_trips = (
    trips_raw
    .groupby(['playerId', 'modeType', pd.Grouper(key='startTime', freq='W')])['distance']
    .sum()
    .floordiv(1000)
    .unstack('modeType', fill_value=0)
    .sort_index()
    .rename_axis(columns=None)
    .reset_index()
)

# Move every weekly label one week forward.
# NOTE(review): presumably so that a row dated D describes the week *before* a
# challenge starting on D -- confirm against the challenge schedule.
df_trips['startTime'] = df_trips['startTime'] + pd.Timedelta(weeks=1)

df_trips

Unnamed: 0,playerId,startTime,bike,bus,car,train,walk
0,u_00144002f1614ee9a45f7822760e3746,2023-03-12,0,0,0,0,1
1,u_00567a7bce8c4d09bea7db9bae375af4,2023-03-12,0,6,0,0,1
2,u_00567a7bce8c4d09bea7db9bae375af4,2023-03-19,0,0,0,0,8
3,u_00567a7bce8c4d09bea7db9bae375af4,2023-03-26,0,0,0,45,1
4,u_00567a7bce8c4d09bea7db9bae375af4,2023-04-02,0,0,0,0,1
...,...,...,...,...,...,...,...
6561,u_ffe87d71a9ee4521bddf686053b7f8b7,2023-05-07,0,0,0,0,6
6562,u_ffe87d71a9ee4521bddf686053b7f8b7,2023-05-14,0,0,0,0,7
6563,u_ffe87d71a9ee4521bddf686053b7f8b7,2023-05-21,0,0,0,0,2
6564,u_ffe87d71a9ee4521bddf686053b7f8b7,2023-05-28,0,0,0,0,2


In [9]:
# Preview the distinct (playerId, TerritoryId) pairs derived from the trips data.
df_cities

Unnamed: 0,playerId,TerritoryId
0,u_00144002f1614ee9a45f7822760e3746,L
1,u_00567a7bce8c4d09bea7db9bae375af4,L
2,u_013177350075415aa939d81131f8d0a0,L
3,u_0160059b315d4a9087e16cc31f7c7695,Ferrara
4,u_016cbcbd4f8f4fc18aa4b322b77ed603,L
...,...,...
909,u_fe75e3dcba864c8e87f45e145bc82434,L
910,u_ff1b54675a9f4b72ba89e417c0300c0d,Ferrara
911,u_ff28a7ac5f9042e8ae8c941069db8ed0,L
912,u_ffd715ba4f4b4fb8b792a0cb44a2579e,L


In [10]:
# Read the individual-challenge table, casting the two columns later used as
# join keys: `playerId` to str and `startTime` to datetime.
df_challenges = pd.read_parquet('../../out/parquet/individual-challenge.parquet').assign(
    playerId=lambda frame: frame['playerId'].astype(str),
    startTime=lambda frame: pd.to_datetime(frame['startTime']),
)

# Disabled experiment, kept for reference: collapse challenges into weekly lists per player.
#df_challenges = df_challenges.groupby(['playerId', pd.Grouper(key='startTime', freq='W')]).agg(list).reset_index()

print(df_challenges.shape)
df_challenges

(6016, 6)


Unnamed: 0,playerId,startTime,counterName,target,periodTarget,state
0,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,Walk_Km,1,0,COMPLETED
1,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,green_leaves,30,2,COMPLETED
2,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,Walk_Km,1,0,COMPLETED
3,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,green_leaves,30,2,COMPLETED
4,u_2fe7aac8-07da-4d38-8b0a-978be1986ebf,2023-04-23,Walk_Km,1,0,COMPLETED
...,...,...,...,...,...,...
6243,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,Bike_Km,1,0,COMPLETED
6244,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,green_leaves,3,5,COMPLETED
6248,u_f6a7cd70958e448f829591bbf6a90ec8,2023-09-17,green_leaves,30,2,COMPLETED
6250,u_f9994c4795f34970addeb5d3ca8ed1ab,2023-09-17,green_leaves,45,2,COMPLETED


In [11]:
# Attach the weekly per-mode distances to every challenge row.
# `how='right'` keeps every challenge, including those with no matching trip row.
# `validate='one_to_many'` makes pandas raise if (playerId, startTime) is not
# unique in df_trips, which would otherwise silently duplicate challenge rows.
df = pd.merge(
    df_trips,
    df_challenges,
    on=['playerId', 'startTime'],
    how='right',
    validate='one_to_many',
)

# Every df_trips column other than the join keys is a per-mode distance column.
# Deriving the list from the frame avoids a KeyError when a mode is absent from
# the data and avoids leaving NaN in modes missing from a hand-written list.
vehicle_columns = [col for col in df_trips.columns if col not in ('playerId', 'startTime')]

# Challenges without a matching trip row get 0 for every mode; cast back to int
# because the NaNs introduced by the merge turn these columns into floats.
df[vehicle_columns] = df[vehicle_columns].fillna(0).astype(int)

df

Unnamed: 0,playerId,startTime,bike,bus,car,train,walk,counterName,target,periodTarget,state
0,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
1,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,0,0,0,0,0,green_leaves,30,2,COMPLETED
2,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
3,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,0,0,0,0,0,green_leaves,30,2,COMPLETED
4,u_2fe7aac8-07da-4d38-8b0a-978be1986ebf,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
...,...,...,...,...,...,...,...,...,...,...,...
6011,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,106,0,0,0,6,Bike_Km,1,0,COMPLETED
6012,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,106,0,0,0,6,green_leaves,3,5,COMPLETED
6013,u_f6a7cd70958e448f829591bbf6a90ec8,2023-09-17,22,0,0,0,0,green_leaves,30,2,COMPLETED
6014,u_f9994c4795f34970addeb5d3ca8ed1ab,2023-09-17,41,0,0,0,0,green_leaves,45,2,COMPLETED


In [12]:
# Write the merged frame (challenges joined with weekly trip distances) to a parquet file.
df.to_parquet('../../out/parquet/raw.parquet')