<a href="https://colab.research.google.com/github/semishen/ML100Days/blob/master/Day_028_HW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 使用並觀察特徵組合, 在計程車費率預測競賽的影響

# [作業重點]
- 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[6], Out[6])
- 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[7], Out[7])

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Read the taxi-fare sample, then separate the regression target from the feature columns.
taxi_raw = pd.read_csv('taxi_data1.csv')
train_Y = taxi_raw['fare_amount']
df = taxi_raw.drop(columns=['fare_amount'])
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [2]:
# Time feature decomposition: parse the timestamp column once with a vectorized call,
# then read each calendar/clock component from the .dt accessor instead of formatting
# every row back to a string and re-parsing it as an integer.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
pickup_dt = df['pickup_datetime'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # Pin to int64 so the column dtype matches what the string-based version produced,
    # whatever integer width the accessor returns.
    df['pickup_' + part] = getattr(pickup_dt, part).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [3]:
# Baseline: score linear regression and gradient boosting on the features built so far.
# The raw timestamp column is dropped; its components already exist as separate integer columns.
df1 = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df1)
Linear = LinearRegression()
# Seed the booster: without a fixed random_state its fitted trees (and therefore the
# cross-validation score) are not guaranteed to be identical between runs, which makes
# the feature-by-feature comparisons in the later cells harder to trust.
GDBT = GradientBoostingRegressor(random_state=0)

# Mean 5-fold cross-validation score, using each estimator's default scorer.
linear_reg_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
GDBT_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()

print('=== baseline ===')
print('Linear Reg Score: ', linear_reg_score)
print('Gradient Boosting Reg Score: ', GDBT_score)

=== baseline ===
Linear Reg Score:  0.026876871475639042
Gradient Boosting Reg Score:  0.7099614156492018


In [4]:
# Add three features: longitude difference, latitude difference, and the
# straight-line distance between pickup and dropoff in raw coordinate units.
lon_delta = df['dropoff_longitude'].sub(df['pickup_longitude'])
lat_delta = df['dropoff_latitude'].sub(df['pickup_latitude'])
df['longitude_diff'] = lon_delta
df['latitude_diff'] = lat_delta
df['distance_2D'] = (lon_delta**2 + lat_delta**2)**0.5

# Show the new columns next to the coordinates they were derived from.
preview_columns = [
    'distance_2D', 'longitude_diff', 'latitude_diff',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
df[preview_columns].head()

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [7]:
# Result: the score goes up once the coordinate-difference features are included.
df2 = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df2)

# Evaluate both estimators the same way: mean of the 5-fold cross-validation scores.
linear_reg_score, GDBT_score = (
    cross_val_score(estimator, train_X, train_Y, cv=5).mean()
    for estimator in (Linear, GDBT)
)

print('=== coordinate diff ===')
print('Linear Reg Score: ', linear_reg_score)
print('Gradient Boosting Reg Score: ', GDBT_score)

=== coordinate diff ===
Linear Reg Score:  0.027388172930510456
Gradient Boosting Reg Score:  0.8053065406310761


# 作業1
### Q1: 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?
### A1: 增加了 longitude_diff_modified 此特徵並沒有提升 model 的解釋力。因為它只是 longitude_diff 乘上一個常數，經過 MinMaxScaler 後兩者的數值幾乎完全相同，並沒有帶來新的資訊。

In [8]:
import math
# Rescale the longitude difference by the lon : lat length ratio (0.75756 : 1).
# NOTE(review): 0.75756 looks like cos(latitude) for roughly 40.75 degrees north — confirm
# against the course material before reusing it for another city.
# The new column is a constant multiple of longitude_diff, so min-max scaling maps both
# columns to essentially the same values; on its own it gives the models nothing new.
df['longitude_diff_modified'] = df['longitude_diff'].mul(0.75756)

# Observe the result with the extra column included.
df3 = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df3)

linear_reg_score, GDBT_score = (
    cross_val_score(estimator, train_X, train_Y, cv=5).mean()
    for estimator in (Linear, GDBT)
)

print('=== longitude_diff_modified ===')
print('Linear Reg Score: ', linear_reg_score)
print('Gradient Boosting Reg Score: ', GDBT_score)

=== longitude_diff_modified ===
Linear Reg Score:  0.027387362127909376
Gradient Boosting Reg Score:  0.804827134427688


# 作業2
### Q2: 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?
### A2: 雖然單以 real distance 來建立 model 的效果比作業一差，不過此特徵有強大的解釋效果。

In [10]:
# Distance built from the ratio-corrected longitude difference and the latitude difference.
lon_sq = df['longitude_diff_modified']**2
lat_sq = df['latitude_diff']**2
df['distance_real'] = (lon_sq + lat_sq)**0.5

df4 = df.drop(columns=['pickup_datetime'])
# Train on the single new feature only, ignoring every original column.
train_X = scaler.fit_transform(df4[['distance_real']])

linear_reg_score, GDBT_score = (
    cross_val_score(estimator, train_X, train_Y, cv=5).mean()
    for estimator in (Linear, GDBT)
)

print('=== real distance ===')
print('Linear Reg Score: ', linear_reg_score)
print('Gradient Boosting Reg Score: ', GDBT_score)

=== real distance ===
Linear Reg Score:  0.0014467562845988046
Gradient Boosting Reg Score:  0.7221194640622727


In [16]:
# Score all features with the raw longitude difference and the uncorrected 2D distance
# swapped out for their ratio-corrected counterparts (longitude_diff_modified, distance_real).
# The previous leading `df.head()` was removed: it was not the cell's last expression,
# so its result was discarded and nothing was displayed.
# Note: the banner below is the same one printed by the distance_real-only run.
drop_list = ['longitude_diff', 'distance_2D', 'pickup_datetime']
df5 = df.drop(columns=drop_list)
train_X = scaler.fit_transform(df5)

linear_reg_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
GDBT_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()

print('=== real distance ===')
print('Linear Reg Score: ', linear_reg_score)
print('Gradient Boosting Reg Score: ', GDBT_score)

=== real distance ===
Linear Reg Score:  0.027523613273612257
Gradient Boosting Reg Score:  0.8052883662348984


In [17]:
# Preview the first rows of df5, the feature frame passed to the scaler for the last scoring run.
df5.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,latitude_diff,longitude_diff_modified,distance_real
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,-0.002437,0.00716,0.007564
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,0.018265,-0.000942,0.018289
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,-0.007222,0.002845,0.007762
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,-0.008437,0.014615,0.016875
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.03217,0.005449,0.032628
