# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
***
- 使用計程車費率預測競賽練習時間欄位處理

In [45]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Read the sample file, then separate the prediction target from the features.
data_path = 'data/'
raw = pd.read_csv(data_path + 'taxi_data1.csv')

# train_Y is the target column; df keeps every other column of the file.
train_Y = raw['fare_amount']
df = raw.drop(columns=['fare_amount'])
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [46]:
# Decompose the pickup timestamp into separate calendar / clock columns.
# The column is parsed a single time and every component is read from that
# one DatetimeIndex.
pickup_idx = pd.DatetimeIndex(df['pickup_datetime'])
df['pickup_datetime'] = pickup_idx
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # e.g. part == 'year' fills df['pickup_year'] from pickup_idx.year
    df[f'pickup_{part}'] = getattr(pickup_idx, part)
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [47]:
# Baseline: evaluate linear regression and gradient boosting on the
# decomposed time features, using the mean of 5-fold cross-validation scores.
# The raw timestamp column is dropped before scaling; the pickup_* columns
# derived from it remain in the table.
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
# A fixed seed makes the boosted-tree score repeatable, so that differences
# between feature sets are not confounded with run-to-run randomness.
GDBT = GradientBoostingRegressor(random_state=0)
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475640864
Gradient Boosting Reg Score : 0.7100155617436028


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [48]:
# Homework 1: add day-of-week and week-of-year features.
df_HW1 = df.drop(['pickup_datetime'] , axis=1)
pickup_idx = pd.DatetimeIndex(df['pickup_datetime'])
df_HW1['dow'] = pickup_idx.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
# isocalendar().week gives the same ISO week number; it comes back as a
# UInt32 Series indexed by the timestamps, so cast to int64 and strip the
# index to assign it positionally like the other index-derived columns.
df_HW1['woy'] = pickup_idx.isocalendar().week.astype('int64').to_numpy()
df_HW1.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,dow,woy
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24


In [49]:
# Evaluate both models on the table that now includes 'dow' and 'woy',
# using the mean of 5-fold cross-validation scores.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_HW1)
Linear = LinearRegression()
print(f'+DOW&WOY --> Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
# A fixed seed makes the boosted-tree score repeatable, so that differences
# between feature sets are not confounded with run-to-run randomness.
GDBT = GradientBoostingRegressor(random_state=0)
print(f'+DOW&WOY --> Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


+DOW&WOY --> Linear Reg Score : 0.026481023863710318
+DOW&WOY --> Gradient Boosting Reg Score : 0.7087621205729754


In [50]:
# Add a "daily cycle" feature (cyclical encoding of the time of day).
import math


def _sin_half_turns(half_turns):
    """Return sin(half_turns * pi); 2 half-turns make one full period."""
    return math.sin(half_turns * math.pi)


# Time of day measured in units of 12 hours, so a whole day spans [0, 2)
# and maps onto exactly one sine period.
time_of_day = (
    df_HW1['pickup_hour']/12
    + df_HW1['pickup_minute']/720
    + df_HW1['pickup_second']/43200
)
df_HW1['day_cycle'] = time_of_day.map(_sin_half_turns)
df_HW1.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,dow,woy,day_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,0.333601
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,-0.888817
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,0.782427


In [51]:
# Re-score both models on df_HW1 now that it carries 'day_cycle'.
# scaler, Linear and GDBT are the objects created in earlier cells.
train_X = scaler.fit_transform(df_HW1)
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
gbdt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbdt_score}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026080352293789556
Gradient Boosting Reg Score : 0.7168186393290502


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

In [96]:
# Homework 2: add "weekly cycle" and "yearly cycle" features
# (yearly cycle derived from the day of the year).
import math
df_HW2 = df.drop(['pickup_datetime'] , axis=1)
pickup_idx = pd.DatetimeIndex(df['pickup_datetime'])
df_HW2['dow'] = pickup_idx.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
# isocalendar().week gives the same ISO week number; cast to int64 and strip
# the timestamp index so the values are assigned positionally.
df_HW2['woy'] = pickup_idx.isocalendar().week.astype('int64').to_numpy()
df_HW2['doy'] = pickup_idx.dayofyear
# Position within the week in half-turns (a full week spans [0, 2)),
# built from the day of week plus the hour; sin(x*pi) then covers one period.
df_HW2['week_cycle'] = df_HW2['dow']/7*2 + df_HW2['pickup_hour']/(7*24)*2
df_HW2['week_cycle'] = df_HW2['week_cycle'].map(lambda x:math.sin(x*math.pi))
# Position within the year in half-turns, using a fixed 365-day year
# (day 366 of a leap year therefore runs slightly past one full period).
df_HW2['year_cycle'] = (df_HW2['doy'])/365*2
df_HW2['year_cycle'] = df_HW2['year_cycle'].map(lambda x:math.cos(x*math.pi))
df_HW2.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,dow,woy,doy,week_cycle,year_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,294,-0.965926,0.341571
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,34,0.955573,0.833556
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,75,-0.222521,0.276097
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,164,-0.930874,-0.949718
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,163,0.330279,-0.944188


In [97]:
# Score both models on df_HW2 (weekly + day-of-year based yearly cycle).
# scaler, Linear and GDBT are the objects created in earlier cells.
train_X = scaler.fit_transform(df_HW2)
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
gbdt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbdt_score}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.02617536482720284
Gradient Boosting Reg Score : 0.7122924541953292


In [169]:
# Homework 2 (variant): add "weekly cycle" and "yearly cycle" features,
# with the yearly cycle derived from the month instead of the day of year.
import math
df_HW2 = df.drop(['pickup_datetime'] , axis=1)
pickup_idx = pd.DatetimeIndex(df['pickup_datetime'])
df_HW2['dow'] = pickup_idx.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
# isocalendar().week gives the same ISO week number; cast to int64 and strip
# the timestamp index so the values are assigned positionally.
df_HW2['woy'] = pickup_idx.isocalendar().week.astype('int64').to_numpy()
df_HW2['doy'] = pickup_idx.dayofyear
# Position within the week in half-turns (a full week spans [0, 2)),
# built from the day of week plus the hour; sin(x*pi) then covers one period.
df_HW2['week_cycle'] = df_HW2['dow']/7*2 + df_HW2['pickup_hour']/(7*24)*2
df_HW2['week_cycle'] = df_HW2['week_cycle'].map(lambda x:math.sin(x*math.pi))
# Month offset by 5 and expressed in half-turns of a 12-month period, so the
# cosine below peaks at month 5. NOTE: 't1' is stored as a column and thus
# stays in df_HW2 alongside 'year_cycle'.
df_HW2['t1'] = (df_HW2['pickup_month']-5)/12*2
df_HW2['year_cycle'] = df_HW2['t1'].map(lambda x:math.cos(x*math.pi))
df_HW2.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,dow,woy,doy,week_cycle,t1,year_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,294,-0.965926,0.833333,-0.8660254
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,34,0.955573,-0.5,6.123234000000001e-17
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,75,-0.222521,-0.333333,0.5
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,164,-0.930874,0.166667,0.8660254
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,163,0.330279,0.166667,0.8660254


In [107]:
# Score both models on df_HW2 (weekly + month-based yearly cycle).
# scaler, Linear and GDBT are the objects created in earlier cells.
train_X = scaler.fit_transform(df_HW2)
linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')
gbdt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gbdt_score}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.025856900529990588
Gradient Boosting Reg Score : 0.71990010144205
