In [23]:
import pandas as pd
import numpy as np

## Load Data


In [24]:
df = pd.read_csv('my_data/data/humo_data/training_set.csv')

df.head()

Unnamed: 0,uid,d,t,x,y
0,16,0,7,99,100
1,16,0,9,99,104
2,16,0,10,104,118
3,16,0,11,166,135
4,16,0,12,162,172


In [25]:
df['categorical_id'] = df['uid']

## Timestamp Building

In [26]:
# 使用参考日期
reference_date = pd.Timestamp("2023-01-01")

# 将“d”和“t”转化为时间戳
df["timestamp"] = reference_date + pd.to_timedelta(df["d"], unit='D') + pd.to_timedelta(df["t"]/2, unit='h')

df.head()

Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp
0,16,0,7,99,100,16,2023-01-01 03:30:00
1,16,0,9,99,104,16,2023-01-01 04:30:00
2,16,0,10,104,118,16,2023-01-01 05:00:00
3,16,0,11,166,135,16,2023-01-01 05:30:00
4,16,0,12,162,172,16,2023-01-01 06:00:00


## LocationID

In [27]:
# Calculate the unique identifier for each pair of (x, y)
df['location_id'] = (
    df['x'] - df['x'].min()) * (
    df['y'].max() - df['y'].min() + 1) + (
    df['y'] - df['y'].min()
    )+ 1

df.head()

Unnamed: 0,uid,d,t,x,y,categorical_id,timestamp,location_id
0,16,0,7,99,100,16,2023-01-01 03:30:00,15890
1,16,0,9,99,104,16,2023-01-01 04:30:00,15894
2,16,0,10,104,118,16,2023-01-01 05:00:00,16883
3,16,0,11,166,135,16,2023-01-01 05:30:00,28990
4,16,0,12,162,172,16,2023-01-01 06:00:00,28247


## Filling Empty Time Points

In [28]:
# 创建一个时间范围，从最早到最晚，每半小时一个时间戳
date_range = pd.date_range(start=df['timestamp'].min().floor('D'), 
                          end=df['timestamp'].max().ceil('D'),
                          freq='30T')

uids = df['uid'].unique()

all_data = []

for uid in uids:
    # 为每个用户创建一个新的DataFrame，包括所有的时间戳
    temp_df = pd.DataFrame({
        'timestamp': date_range,
        'uid': uid
    })
    
    # 使用原始数据来填充这个DataFrame
    merged = pd.merge(temp_df, df[df['uid'] == uid], on=['timestamp', 'uid'], how='left')

    # 为 't' 列生成 0-47 的范围
    merged['t'] = merged['timestamp'].dt.hour * 2 + merged['timestamp'].dt.minute // 30
    
    # 前向填充
    merged = merged.fillna(method='ffill')
    
    # 后向填充
    merged = merged.fillna(method='bfill')
    
    all_data.append(merged)

# 将所有用户的数据合并到一个大的DataFrame中
result = pd.concat(all_data)

result.head()

  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')
  merged = merged.fillna(method='ffill')
  merged = merged.fillna(method='bfill')


Unnamed: 0,timestamp,uid,d,t,x,y,categorical_id,location_id
0,2023-01-01 00:00:00,16,0.0,0,99.0,100.0,16.0,15890.0
1,2023-01-01 00:30:00,16,0.0,1,99.0,100.0,16.0,15890.0
2,2023-01-01 01:00:00,16,0.0,2,99.0,100.0,16.0,15890.0
3,2023-01-01 01:30:00,16,0.0,3,99.0,100.0,16.0,15890.0
4,2023-01-01 02:00:00,16,0.0,4,99.0,100.0,16.0,15890.0


In [29]:
print(result)

               timestamp  uid     d   t      x      y  categorical_id  \
0    2023-01-01 00:00:00   16   0.0   0   99.0  100.0            16.0   
1    2023-01-01 00:30:00   16   0.0   1   99.0  100.0            16.0   
2    2023-01-01 01:00:00   16   0.0   2   99.0  100.0            16.0   
3    2023-01-01 01:30:00   16   0.0   3   99.0  100.0            16.0   
4    2023-01-01 02:00:00   16   0.0   4   99.0  100.0            16.0   
...                  ...  ...   ...  ..    ...    ...             ...   
3596 2023-03-16 22:00:00   25  74.0  44  139.0   87.0            25.0   
3597 2023-03-16 22:30:00   25  74.0  45  139.0   87.0            25.0   
3598 2023-03-16 23:00:00   25  74.0  46  139.0   87.0            25.0   
3599 2023-03-16 23:30:00   25  74.0  47  139.0   87.0            25.0   
3600 2023-03-17 00:00:00   25  74.0   0  139.0   87.0            25.0   

      location_id  
0         15890.0  
1         15890.0  
2         15890.0  
3         15890.0  
4         15890.0  
...

### Result Checking

In [30]:

# 计算预期的时间点数
expected_count = len(date_range)

# 对每个uid进行分组并计算条数
grouped_counts = result.groupby('uid').size()

# 查找不匹配预期的uid
incomplete_uids = grouped_counts[grouped_counts != expected_count].index.tolist()

if not incomplete_uids:
    print("所有uid都有完整的时间点记录。")
else:
    print(f"以下uid的记录不完整：{incomplete_uids}")


所有uid都有完整的时间点记录。


In [31]:
result.to_csv('my_data/data/humo_data/processed_train_set.csv', index=False)