# データ整形

学習に必要なデータセット整形と、必要な前処理を検討する

EDAからわかったことから特徴量を選択。（ageはCustomerでNoneになるので除外）

| **カテゴリ** | **特徴量名** | **型** | **説明** | **注意点** |
| --- | --- | --- | --- | --- |
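|  時間特徴 | start_hour | int | 利用開始時刻の「時」（0〜23） | 通勤／観光など時間傾向を表す |
| | time_category | category(int) | start_hour を時間帯にカテゴリ化（朝 6〜11時=1、昼 12〜17時=2、夜 18〜5時=3） | コード上は Enum の auto() により 1 始まり |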
| | weekday | category(int) | 曜日（月=0〜日=6） | 週末・平日差の傾向を表す |
|  行動特徴 | tripduration_min | float | 利用時間（分）。360分以上は外れ値として除外 | 行動強度のproxy |
|  地理特徴 | station_usage_count | int | 出発駅の利用頻度（人気度） | 329駅→1数値へ圧縮 |
|  自転車特徴 | bike_usage_count | int | 各bike_idの利用回数 | 稼働率のproxy |
|  属性特徴 | gender | category | 0=不明, 1=男性, 2=女性 | Customerの多くは0。不均衡でもOK |
|  目的変数 | is_member | int | Subscriber=1, Customer=0 | ターゲットラベル |



## 前処理関数検討


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from enum import Enum, auto

# Display settings: never truncate the column list when printing frames,
# and use the seaborn-flavoured matplotlib style sheet for plots.
pd.options.display.max_columns = None
plt.style.use("seaborn-v0_8")

# Directory for processed data; it is created here (together with any
# missing parent directories) and an already existing directory is accepted.
PROCESSED_DIR = Path("/app/data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
import sys
# Extend the import search path two directories up so that the `src`
# package imported below can be resolved (presumably the repository
# root relative to this notebook — confirm).
sys.path.append('../..')

from src.utils.io import load_month_data

In [3]:
# Load the raw trip data; the arguments look like (year, month) judging by
# the file path reported when the cell runs — confirm against load_month_data.
df_org = load_month_data(2014, 1)

# Quick sanity check: overall size and the first few rows.
print("Shape: ", df_org.shape)
df_org.head()

Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]
Shape:  (300400, 15)


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,471,2014-01-01 00:00:06,2014-01-01 00:07:57,2009,Catherine St & Monroe St,40.711174,-73.996826,263,Elizabeth St & Hester St,40.71729,-73.996375,16379,Subscriber,1986,1
1,1494,2014-01-01 00:00:38,2014-01-01 00:25:32,536,1 Ave & E 30 St,40.741444,-73.975361,259,South St & Whitehall St,40.701221,-74.012342,15611,Subscriber,1963,1
2,464,2014-01-01 00:03:59,2014-01-01 00:11:43,228,E 48 St & 3 Ave,40.754601,-73.971879,2022,E 59 St & Sutton Pl,40.758491,-73.959206,16613,Subscriber,1991,1
3,373,2014-01-01 00:05:15,2014-01-01 00:11:28,519,Pershing Square N,40.751884,-73.977702,526,E 33 St & 5 Ave,40.747659,-73.984907,15938,Subscriber,1989,1
4,660,2014-01-01 00:05:18,2014-01-01 00:16:18,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,436,Hancock St & Bedford Ave,40.682166,-73.95399,19830,Subscriber,1990,1


In [15]:
# Trips lasting this many minutes or longer are treated as outliers and dropped.
MAX_TRIP_MINUTES = 360

# Work on a copy so the raw frame stays untouched.
df = df_org.copy()

# Parse the timestamp columns into datetimes.
df["starttime"] = pd.to_datetime(df["starttime"])
df["stoptime"] = pd.to_datetime(df["stoptime"])

# Trip duration in minutes (the raw column is divided by 60).
df["tripduration_min"] = df["tripduration"] / 60

# Outlier removal. The explicit .copy() makes the filtered frame an
# independent object, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df = df[df["tripduration_min"] < MAX_TRIP_MINUTES].copy()

# Hour of day at which the trip started.
df["start_hour"] = df["starttime"].dt.hour
# Day of week, starting with Monday = 0.
df["weekday"] = df["starttime"].dt.weekday

df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,tripduration_min,start_hour,weekday
0,471,2014-01-01 00:00:06,2014-01-01 00:07:57,2009,Catherine St & Monroe St,40.711174,-73.996826,263,Elizabeth St & Hester St,40.71729,-73.996375,16379,Subscriber,1986,1,7.85,0,2
1,1494,2014-01-01 00:00:38,2014-01-01 00:25:32,536,1 Ave & E 30 St,40.741444,-73.975361,259,South St & Whitehall St,40.701221,-74.012342,15611,Subscriber,1963,1,24.9,0,2
2,464,2014-01-01 00:03:59,2014-01-01 00:11:43,228,E 48 St & 3 Ave,40.754601,-73.971879,2022,E 59 St & Sutton Pl,40.758491,-73.959206,16613,Subscriber,1991,1,7.733333,0,2
3,373,2014-01-01 00:05:15,2014-01-01 00:11:28,519,Pershing Square N,40.751884,-73.977702,526,E 33 St & 5 Ave,40.747659,-73.984907,15938,Subscriber,1989,1,6.216667,0,2
4,660,2014-01-01 00:05:18,2014-01-01 00:16:18,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,436,Hancock St & Bedford Ave,40.682166,-73.95399,19830,Subscriber,1990,1,11.0,0,2


In [17]:
class TimeOfDay(Enum):
    """Coarse part-of-day buckets; the integer values serve as category codes."""

    MORNING = 1
    AFTERNOON = 2
    NIGHT = 3


def categorize_time(hour):
    """Return the ``TimeOfDay`` code for an hour of the day.

    Args:
        hour: Hour value to classify.

    Returns:
        ``TimeOfDay.MORNING.value`` for 6 <= hour < 12,
        ``TimeOfDay.AFTERNOON.value`` for 12 <= hour < 18, and
        ``TimeOfDay.NIGHT.value`` for every other value.
    """
    if 6 <= hour < 12:
        bucket = TimeOfDay.MORNING
    elif 12 <= hour < 18:
        bucket = TimeOfDay.AFTERNOON
    else:
        # Everything outside the two daytime windows counts as night.
        bucket = TimeOfDay.NIGHT
    return bucket.value

# Derive the time-of-day code for every trip by applying categorize_time
# element-wise to the start hour.
df["time_category"] = df["start_hour"].apply(categorize_time) # type: ignore

df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,tripduration_min,start_hour,weekday,time_category
0,471,2014-01-01 00:00:06,2014-01-01 00:07:57,2009,Catherine St & Monroe St,40.711174,-73.996826,263,Elizabeth St & Hester St,40.71729,-73.996375,16379,Subscriber,1986,1,7.85,0,2,3
1,1494,2014-01-01 00:00:38,2014-01-01 00:25:32,536,1 Ave & E 30 St,40.741444,-73.975361,259,South St & Whitehall St,40.701221,-74.012342,15611,Subscriber,1963,1,24.9,0,2,3
2,464,2014-01-01 00:03:59,2014-01-01 00:11:43,228,E 48 St & 3 Ave,40.754601,-73.971879,2022,E 59 St & Sutton Pl,40.758491,-73.959206,16613,Subscriber,1991,1,7.733333,0,2,3
3,373,2014-01-01 00:05:15,2014-01-01 00:11:28,519,Pershing Square N,40.751884,-73.977702,526,E 33 St & 5 Ave,40.747659,-73.984907,15938,Subscriber,1989,1,6.216667,0,2,3
4,660,2014-01-01 00:05:18,2014-01-01 00:16:18,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,436,Hancock St & Bedford Ave,40.682166,-73.95399,19830,Subscriber,1990,1,11.0,0,2,3


In [18]:
# Station popularity: number of rows per start station name, attached to
# every trip through a left join on that name.
station_usage = (
    df.groupby("start station name")
    .size()
    .reset_index(name="station_usage_count")
)
df = df.merge(station_usage, how="left", on="start station name")

# Bike usage: number of rows per bikeid, attached the same way.
bike_usage = (
    df.groupby("bikeid")
    .size()
    .reset_index(name="bike_usage_count")
)
df = df.merge(bike_usage, how="left", on="bikeid")

In [23]:
# Target variable: 1 where usertype equals "Subscriber", 0 otherwise.
df["is_member"] = (df["usertype"] == "Subscriber").astype(int)

In [24]:
# Model input columns, in the order they appear in the final frame.
features = [
    "start_hour", "weekday", "time_category",
    "tripduration_min",
    "station_usage_count", "bike_usage_count",
    "gender",
]

# Keep only the selected features followed by the target label.
df_final = df[[*features, "is_member"]]
print("Final shape:", df_final.shape)
display(df_final.head())

Final shape: (300097, 8)


Unnamed: 0,start_hour,weekday,time_category,tripduration_min,station_usage_count,bike_usage_count,gender,is_member
0,0,2,3,7.85,460,45,1,1
1,0,2,3,24.9,1193,36,1,1
2,0,2,3,7.733333,951,34,1,1
3,0,2,3,6.216667,3633,61,1,1
4,0,2,3,11.0,444,38,1,1


In [25]:
# Inspect column dtypes and non-null counts of the final frame.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300097 entries, 0 to 300096
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   start_hour           300097 non-null  int32  
 1   weekday              300097 non-null  int32  
 2   time_category        300097 non-null  int64  
 3   tripduration_min     300097 non-null  float64
 4   station_usage_count  300097 non-null  int64  
 5   bike_usage_count     300097 non-null  int64  
 6   gender               300097 non-null  int64  
 7   is_member            300097 non-null  int64  
dtypes: float64(1), int32(2), int64(5)
memory usage: 16.0 MB


※ **データの妥当性について簡単にチェックする関数をつけておくとよい**

In [4]:
from src.utils.preprocess import preprocess_pipeline

In [5]:
# Load the raw trip data again as input for the packaged pipeline; the
# arguments look like (year, month) — confirm against load_month_data.
df_org = load_month_data(2014, 1)

# Quick sanity check: overall size and the first few rows.
print("Shape: ", df_org.shape)
df_org.head()

Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]
Shape:  (300400, 15)


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,471,2014-01-01 00:00:06,2014-01-01 00:07:57,2009,Catherine St & Monroe St,40.711174,-73.996826,263,Elizabeth St & Hester St,40.71729,-73.996375,16379,Subscriber,1986,1
1,1494,2014-01-01 00:00:38,2014-01-01 00:25:32,536,1 Ave & E 30 St,40.741444,-73.975361,259,South St & Whitehall St,40.701221,-74.012342,15611,Subscriber,1963,1
2,464,2014-01-01 00:03:59,2014-01-01 00:11:43,228,E 48 St & 3 Ave,40.754601,-73.971879,2022,E 59 St & Sutton Pl,40.758491,-73.959206,16613,Subscriber,1991,1
3,373,2014-01-01 00:05:15,2014-01-01 00:11:28,519,Pershing Square N,40.751884,-73.977702,526,E 33 St & 5 Ave,40.747659,-73.984907,15938,Subscriber,1989,1
4,660,2014-01-01 00:05:18,2014-01-01 00:16:18,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,436,Hancock St & Bedford Ave,40.682166,-73.95399,19830,Subscriber,1990,1


In [6]:
# Run the packaged preprocessing pipeline on the raw frame and inspect
# the resulting column dtypes and non-null counts.
df_test = preprocess_pipeline(df_org)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300097 entries, 0 to 300096
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   start_hour           300097 non-null  int32  
 1   weekday              300097 non-null  int32  
 2   time_category        300097 non-null  int64  
 3   tripduration_min     300097 non-null  float64
 4   station_usage_count  300097 non-null  int64  
 5   bike_usage_count     300097 non-null  int64  
 6   gender               300097 non-null  int64  
 7   is_member            300097 non-null  int64  
dtypes: float64(1), int32(2), int64(5)
memory usage: 16.0 MB


In [7]:
# Preview the first rows of the pipeline output.
df_test.head()

Unnamed: 0,start_hour,weekday,time_category,tripduration_min,station_usage_count,bike_usage_count,gender,is_member
0,0,2,3,7.85,460,45,1,1
1,0,2,3,24.9,1193,36,1,1
2,0,2,3,7.733333,951,34,1,1
3,0,2,3,6.216667,3633,61,1,1
4,0,2,3,11.0,444,38,1,1
