In [32]:
import pandas as pd
import numpy as np

## Load Data


In [78]:
# Load the raw training set from a path relative to the notebook's working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index that was
# written into the CSV — confirm whether it is needed downstream.
df = pd.read_csv('my_data/data/humob/training_set.csv')

# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,uid,d,t,x,y
0,16,0,7,99,100
1,16,0,9,99,104
2,16,0,10,104,118
3,16,0,11,166,135
4,16,0,12,162,172


In [79]:
# Duplicate `uid` under the name `categorical_id`; this adds the column to `df` in place.
df['categorical_id'] = df['uid']

## Timestamp Building

In [80]:
# Reference date: day d = 0, slot t = 0 maps to midnight of this date
reference_date = pd.Timestamp("2023-01-01")

# Convert the day index "d" and the slot index "t" into a timestamp.
# t / 2 is interpreted in hours, so each step of t advances the clock by 30 minutes.
df["timestamp"] = (
    reference_date
    + pd.to_timedelta(df["d"], unit='D')
    + pd.to_timedelta(df["t"] / 2, unit='h')
)

df.head()

Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp
0,16,0,7,99,100,16,2023-01-01 03:30:00
1,16,0,9,99,104,16,2023-01-01 04:30:00
2,16,0,10,104,118,16,2023-01-01 05:00:00
3,16,0,11,166,135,16,2023-01-01 05:30:00
4,16,0,12,162,172,16,2023-01-01 06:00:00


## LocationID

In [81]:
# Encode every (x, y) pair as one integer, starting at 1:
#   (x - min_x) * number_of_y_values + (y - min_y) + 1
# The minima and the y extent are taken from `df` itself, so the resulting ids
# depend on the coordinate range present in this particular frame.
df['location_id'] = (
    df['x'].sub(df['x'].min())
    .mul(df['y'].max() - df['y'].min() + 1)
    .add(df['y'].sub(df['y'].min()))
    .add(1)
)

df.head()

Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp,location_id
0,16,0,7,99,100,16,2023-01-01 03:30:00,15890
1,16,0,9,99,104,16,2023-01-01 04:30:00,15894
2,16,0,10,104,118,16,2023-01-01 05:00:00,16883
3,16,0,11,166,135,16,2023-01-01 05:30:00,28990
4,16,0,12,162,172,16,2023-01-01 06:00:00,28247


In [82]:
# Build the complete (uid, d, t) grid: every user x 75 days x 48 slots per day
unique_uids = df['uid'].unique()
days = range(75)
times = range(48)
index = pd.MultiIndex.from_product([unique_uids, days, times], names=['uid', 'd', 't'])

# Left-join the observations onto the grid; grid slots without a record keep NaN
# in every column that comes from `df`
df_filled = index.to_frame(index=False).merge(df, on=['uid', 'd', 't'], how='left')

# Recompute timestamp and categorical_id for all rows, because the merged-in
# values are missing on the slots that had no observation
df_filled['timestamp'] = (
    pd.to_datetime('2023-01-01')
    + pd.to_timedelta(df_filled['d'], unit='D')
    + pd.to_timedelta(df_filled['t'] * 30, unit='m')
)
df_filled['categorical_id'] = df_filled['uid']

df_filled.head(10)

Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp,location_id
0,16,0,0,,,16,2023-01-01 00:00:00,
1,16,0,1,,,16,2023-01-01 00:30:00,
2,16,0,2,,,16,2023-01-01 01:00:00,
3,16,0,3,,,16,2023-01-01 01:30:00,
4,16,0,4,,,16,2023-01-01 02:00:00,
5,16,0,5,,,16,2023-01-01 02:30:00,
6,16,0,6,,,16,2023-01-01 03:00:00,
7,16,0,7,99.0,100.0,16,2023-01-01 03:30:00,15890.0
8,16,0,8,,,16,2023-01-01 04:00:00,
9,16,0,9,99.0,104.0,16,2023-01-01 04:30:00,15894.0


## Filling Empty Time Points

In [85]:
# Work on a copy so that `df_filled` itself is left untouched
df_sample_reset = df_filled.copy()

# Order rows by user, day and slot, and renumber the index from 0
df_sorted = df_sample_reset.sort_values(by=['uid', 'd', 't'], ignore_index=True)

# Define a function to fill the NA values for each uid and d combination
def fill_values_for_group(group):
    """Fill missing 'x', 'y' and 'location_id' values within one group.

    Gaps are filled with the previous non-missing value (forward fill); any
    gaps still left at the start of the group are then filled with the next
    non-missing value (backward fill). A group that has no value at all in a
    column keeps NaN in that column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, already in the order in which values should
        propagate. Must contain the columns 'x', 'y' and 'location_id'.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group``; the input frame is not modified, because
        mutating the object passed in by ``groupby.apply`` is not supported
        by pandas.
    """
    fill_cols = ['x', 'y', 'location_id']
    filled = group.copy()

    # Scenario 1: gaps after an observed value take that value (forward fill).
    # Scenario 2 and 3: gaps before the first observed value take the next one (backward fill).
    filled[fill_cols] = filled[fill_cols].ffill().bfill()

    return filled

# Fill every (uid, d) group independently so values never leak across users or days.
# Only the columns being filled are handed to `apply`: passing the grouping columns
# into the applied function is deprecated since pandas 2.2, and once they are no longer
# passed, the old whole-frame call would lose 'uid' and 'd' after reset_index(drop=True).
fill_cols = ['x', 'y', 'location_id']
filled_values = (
    df_sorted
    .groupby(['uid', 'd'], group_keys=False)[fill_cols]
    .apply(fill_values_for_group)
)

# Write the filled columns back; assignment aligns on the row index of `df_sorted`
df_filled_correctly = df_sorted.copy()
df_filled_correctly[fill_cols] = filled_values
df_filled_correctly = df_filled_correctly.reset_index(drop=True)

df_filled_correctly.head(10)


Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp,location_id
0,16,0,0,99.0,100.0,16,2023-01-01 00:00:00,15890.0
1,16,0,1,99.0,100.0,16,2023-01-01 00:30:00,15890.0
2,16,0,2,99.0,100.0,16,2023-01-01 01:00:00,15890.0
3,16,0,3,99.0,100.0,16,2023-01-01 01:30:00,15890.0
4,16,0,4,99.0,100.0,16,2023-01-01 02:00:00,15890.0
5,16,0,5,99.0,100.0,16,2023-01-01 02:30:00,15890.0
6,16,0,6,99.0,100.0,16,2023-01-01 03:00:00,15890.0
7,16,0,7,99.0,100.0,16,2023-01-01 03:30:00,15890.0
8,16,0,8,99.0,100.0,16,2023-01-01 04:00:00,15890.0
9,16,0,9,99.0,104.0,16,2023-01-01 04:30:00,15894.0


### Result Checking

In [86]:
# Count the rows of every (uid, d) pair in the filled frame
grouped = df_filled_correctly.groupby(['uid', 'd']).size().reset_index(name='count')

# Lift the display limits only while printing, so the setting does not leak
# into the output of every later cell
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(grouped)


     uid   d  count
0     16   0     48
1     16   1     48
2     16   2     48
3     16   3     48
4     16   4     48
5     16   5     48
6     16   6     48
7     16   7     48
8     16   8     48
9     16   9     48
10    16  10     48
11    16  11     48
12    16  12     48
13    16  13     48
14    16  14     48
15    16  15     48
16    16  16     48
17    16  17     48
18    16  18     48
19    16  19     48
20    16  20     48
21    16  21     48
22    16  22     48
23    16  23     48
24    16  24     48
25    16  25     48
26    16  26     48
27    16  27     48
28    16  28     48
29    16  29     48
30    16  30     48
31    16  31     48
32    16  32     48
33    16  33     48
34    16  34     48
35    16  35     48
36    16  36     48
37    16  37     48
38    16  38     48
39    16  39     48
40    16  40     48
41    16  41     48
42    16  42     48
43    16  43     48
44    16  44     48
45    16  45     48
46    16  46     48
47    16  47     48
48    16  48     48


In [88]:
# Number of distinct 't' values found in each (uid, d) group
grouped_counts = (
    df_filled_correctly
    .groupby(['uid', 'd'])['t']
    .nunique()
    .reset_index(name='count_t')
)

# Keep only the groups whose distinct-'t' count is not 48; an empty result
# means every group has exactly 48 distinct slots
incomplete_groups = grouped_counts.loc[grouped_counts['count_t'].ne(48)]

incomplete_groups


Unnamed: 0,uid,d,count_t


In [91]:
# Write the filled frame to CSV without the row index (an existing file at this path is overwritten).
df_filled_correctly.to_csv('my_data/data/humob/humob_data.csv', index=False)

## Checking train/test/valid datasets

In [92]:
# Load the transformed test split from the notebook's working directory.
# NOTE(review): this file is not written anywhere in this notebook; presumably it comes
# from a separate transformation step — confirm its origin.
test = pd.read_csv('test_transformed.csv')

# Preview the first rows.
test.head()

Unnamed: 0,d,t,x,y,categorical_id,timestamp,location_id
0,1.062419,0,129.0,70.0,0,2023-03-02 00:00:00,1267
1,1.062419,1,129.0,70.0,0,2023-03-02 00:30:00,1267
2,1.062419,2,129.0,70.0,0,2023-03-02 01:00:00,1267
3,1.062419,3,129.0,70.0,0,2023-03-02 01:30:00,1267
4,1.062419,4,129.0,70.0,0,2023-03-02 02:00:00,1267


In [96]:
# Load the transformed training split from the notebook's working directory.
# NOTE(review): this file is not written anywhere in this notebook; presumably it comes
# from a separate transformation step — confirm its origin.
train = pd.read_csv('train_transformed.csv')

# Preview the first rows.
train.head()

Unnamed: 0,d,t,x,y,categorical_id,timestamp,location_id
0,-1.709109,0,99.0,100.0,0,2023-01-01 00:00:00,584
1,-1.709109,1,99.0,100.0,0,2023-01-01 00:30:00,584
2,-1.709109,2,99.0,100.0,0,2023-01-01 01:00:00,584
3,-1.709109,3,99.0,100.0,0,2023-01-01 01:30:00,584
4,-1.709109,4,99.0,100.0,0,2023-01-01 02:00:00,584


In [99]:
# Load the transformed validation split from the notebook's working directory.
# NOTE(review): this file is not written anywhere in this notebook; presumably it comes
# from a separate transformation step — confirm its origin.
valid = pd.read_csv('valid_transformed.csv')

# Preview the first rows.
valid.head()

Unnamed: 0,d,t,x,y,categorical_id,timestamp,location_id
0,0.739074,0,79.0,183.0,0,2023-02-23 00:00:00,154
1,0.739074,1,79.0,183.0,0,2023-02-23 00:30:00,154
2,0.739074,2,79.0,183.0,0,2023-02-23 01:00:00,154
3,0.739074,3,79.0,183.0,0,2023-02-23 01:30:00,154
4,0.739074,4,79.0,183.0,0,2023-02-23 02:00:00,154


In [100]:
# Count the rows of every (categorical_id, d) pair in the validation split
grouped = valid.groupby(['categorical_id', 'd']).size().reset_index(name='count')

# Lift the display limits only while printing, so the setting does not leak
# into the output of every later cell
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(grouped)

    categorical_id         d  count
0                0  0.739074     48
1                0  0.785266     48
2                0  0.831458     48
3                0  0.877650     48
4                0  0.923843     48
5                0  0.970035     48
6                0  1.016227     48
7                1  0.739074     48
8                1  0.785266     48
9                1  0.831458     48
10               1  0.877650     48
11               1  0.923843     48
12               1  0.970035     48
13               1  1.016227     48
14               2  0.739074     48
15               2  0.785266     48
16               2  0.831458     48
17               2  0.877650     48
18               2  0.923843     48
19               2  0.970035     48
20               2  1.016227     48
21               3  0.739074     48
22               3  0.785266     48
23               3  0.831458     48
24               3  0.877650     48
25               3  0.923843     48
26               3  0.970035

In [101]:
# Number of distinct 't' values found in each (categorical_id, d) group of the validation split
grouped_counts = (
    valid
    .groupby(['categorical_id', 'd'])['t']
    .nunique()
    .reset_index(name='count_t')
)

# Keep only the groups whose distinct-'t' count is not 48; an empty result
# means every group has exactly 48 distinct slots
incomplete_groups = grouped_counts.loc[grouped_counts['count_t'].ne(48)]

incomplete_groups

Unnamed: 0,categorical_id,d,count_t
