In [6]:
from tqdm.notebook import tqdm
import pandas as pd

# Register tqdm with pandas so that `progress_apply` (used when counting
# tokens further down) is available on pandas objects.
tqdm.pandas()

In [7]:
# Load the raw challenge records from the project's parquet output folder.
df = pd.read_parquet('../../out/parquet/raw.parquet')
# Bare last expression: shows the frame as the cell output.
df

Unnamed: 0,playerId,startTime,bike,bus,car,train,walk,counterName,target,periodTarget,state
0,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
1,u_0bea6988-bd00-4aa6-a456-4285744356ee,2023-04-23,0,0,0,0,0,green_leaves,30,2,COMPLETED
2,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
3,u_1636dfdc-fbcc-4068-8fcd-3293369c3a82,2023-04-23,0,0,0,0,0,green_leaves,30,2,COMPLETED
4,u_2fe7aac8-07da-4d38-8b0a-978be1986ebf,2023-04-23,0,0,0,0,0,Walk_Km,1,0,COMPLETED
...,...,...,...,...,...,...,...,...,...,...,...
6011,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,106,0,0,0,6,Bike_Km,1,0,COMPLETED
6012,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,2023-09-17,106,0,0,0,6,green_leaves,3,5,COMPLETED
6013,u_f6a7cd70958e448f829591bbf6a90ec8,2023-09-17,22,0,0,0,0,green_leaves,30,2,COMPLETED
6014,u_f9994c4795f34970addeb5d3ca8ed1ab,2023-09-17,41,0,0,0,0,green_leaves,45,2,COMPLETED


In [8]:
def challenge_to_text(counterName, target, periodTarget):
    """Describe a challenge as a short English phrase.

    Parameters
    ----------
    counterName
        One of ``"Walk_Km"``, ``"Bike_Km"`` or ``"green_leaves"``; any other
        value raises ``KeyError``.
    target
        Amount interpolated into the phrase (Km or points).
    periodTarget
        When greater than 1 the phrase ends with
        " per day for <periodTarget> days a week"; otherwise it ends with
        " during the week".

    Returns
    -------
    str
        The phrase for the counter followed by the period suffix.
    """
    descriptions = dict(
        Walk_Km=f"Walk at least {target} Km",
        Bike_Km=f"Bike at least {target} Km",
        green_leaves=f"Collect at least {target} points",
    )

    if periodTarget > 1:
        suffix = f" per day for {periodTarget} days a week"
    else:
        suffix = " during the week"

    return descriptions[counterName] + suffix

def row_to_text(row, y='<M>'):
    """Render one challenge row as a single prompt sentence.

    Parameters
    ----------
    row
        Mapping-like record providing ``startTime`` (must support
        ``strftime``), ``counterName``, ``target`` and ``periodTarget``.
    y
        Text placed after "is" at the end of the sentence. Defaults to the
        ``<M>`` placeholder.

    Returns
    -------
    str
        ``On <Month DD YYYY> the value of "<challenge phrase>" is <y>.``
    """
    date_text = row['startTime'].strftime('%B %d %Y')
    challenge = challenge_to_text(row['counterName'], row['target'], row['periodTarget'])
    return f'On {date_text} the value of "{challenge}" is {y}.'

def prompts(x: pd.DataFrame, last_week: pd.Timestamp):
    """Build one prompt per challenge that starts at ``last_week``.

    Each prompt is the rendered history (every row of ``x`` whose
    ``startTime`` is earlier than ``last_week``, one sentence per line with
    the recorded ``state`` filled in) followed by the sentence for the row to
    predict, rendered with the default placeholder of ``row_to_text``.

    Parameters
    ----------
    x : pd.DataFrame
        Rows providing at least ``startTime``, ``counterName``, ``target``,
        ``periodTarget`` and ``state``.
    last_week : pd.Timestamp
        Start date of the rows to predict; earlier rows form the history.

    Returns
    -------
    pd.DataFrame
        Columns ``X`` (prompt text), ``y`` (``state`` of the predicted row)
        and ``cut`` (``last_week``), indexed like the predicted rows. An
        empty frame is returned when no row of ``x`` starts at ``last_week``.
    """
    to_predict = x[x['startTime'] == last_week]
    if to_predict.empty:
        return pd.DataFrame()

    # The default sort algorithm is not stable; a stable sort keeps rows that
    # share a start date in their original order, so the history text is
    # reproducible from run to run.
    to_sequence = x[x['startTime'] < last_week].sort_values('startTime', kind='stable')

    # One sentence per line, each newline-terminated; empty string when there
    # is no history.
    history = ''.join(
        row_to_text(row, row['state']) + '\n'
        for _, row in to_sequence.iterrows()
    )

    # Named differently from the function to avoid shadowing it.
    prompt_texts = [
        f"{history}{row_to_text(row)}"
        for _, row in to_predict.iterrows()
    ]

    return pd.DataFrame({'X': prompt_texts, 'y': to_predict['state'], 'cut': last_week})

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

# Token budget that prompts are checked against below.
MAX_TOKENS = 512

# History window lengths (number of distinct start dates per window).
lens = [2, 3, 4, 5]

# Loop-invariant: the sorted distinct start dates, computed once.
start_dates = pd.Series(df['startTime'].sort_values().unique())

for l in lens:
    # Iterating a Rolling object yields one window per start date; the
    # leading windows hold fewer than `l` dates, so their prompts carry a
    # shorter (or empty) history.
    windows = list(start_dates.rolling(l))

    # For every window and every player, predict the window's latest date
    # from the player's earlier rows inside that window.
    prompts_df = pd.concat([
        df[df['startTime'].isin(w)].groupby('playerId').apply(prompts, include_groups=False, last_week=max(w))
        for w in tqdm(windows)
    ]).reset_index(drop=True)

    # Count with `encode` rather than `tokenize`: `encode` includes the
    # end-of-sequence token appended to the model input, which `tokenize`
    # leaves out, so the old count was one short of the real input length.
    prompts_df['token'] = prompts_df['X'].progress_apply(lambda x: len(tokenizer.encode(x)))

    if prompts_df['token'].max() > MAX_TOKENS:
        print(f"There are prompts with more than {MAX_TOKENS} tokens for l={l}!")

    # The token count is only used for the check above; it is not exported.
    prompts_df[['X', 'y', 'cut']].to_parquet(f'../../out/parquet/prompts_{l}.parquet')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/6016 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/6016 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/6016 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/6016 [00:00<?, ?it/s]

In [10]:
# Spot-check a few prompts. `prompts_df` here is the frame left over from the
# last iteration of the loop above (the largest window length). A fixed
# `random_state` keeps the displayed sample identical across re-runs.
prompts_df['X'].sample(5, random_state=42).values

array(['On May 21 2023 the value of "Walk at least 1 Km during the week" is COMPLETED.\nOn May 21 2023 the value of "Collect at least 14 points per day for 3 days a week" is COMPLETED.\nOn May 28 2023 the value of "Walk at least 1 Km during the week" is COMPLETED.\nOn May 28 2023 the value of "Collect at least 13 points per day for 3 days a week" is COMPLETED.\nOn June 04 2023 the value of "Walk at least 1 Km during the week" is FAILED.\nOn June 04 2023 the value of "Collect at least 8 points per day for 3 days a week" is COMPLETED.\nOn June 11 2023 the value of "Walk at least 1 Km during the week" is <M>.',
       'On June 18 2023 the value of "Walk at least 1 Km during the week" is COMPLETED.\nOn June 18 2023 the value of "Collect at least 2 points per day for 6 days a week" is FAILED.\nOn June 25 2023 the value of "Walk at least 1 Km during the week" is COMPLETED.\nOn June 25 2023 the value of "Collect at least 45 points per day for 2 days a week" is COMPLETED.\nOn July 02 2023 the 