# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 使用並觀察特徵組合, 在計程車費率預測競賽的影響

# [作業重點]
- 仿造範例並參考今日課程內容, 使用經緯度一圈的長度比的概念造出新特徵, 觀察有什麼影響 (In[6], Out[6])
- 只使用上面所造的這個新特徵, 觀察有什麼影響 (In[7], Out[7])

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [2]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [3]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475641616
Gradient Boosting Reg Score : 0.7107998357660751


In [4]:
# 增加緯度差, 經度差, 座標距離等三個特徵
df['longitude_diff'] = df['dropoff_longitude'] - df['pickup_longitude']
df['latitude_diff'] = df['dropoff_latitude'] - df['pickup_latitude']
df['distance_2D'] = (df['longitude_diff']**2 + df['latitude_diff']**2)**0.5
df[['distance_2D', 'longitude_diff', 'latitude_diff', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].head()

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [5]:
# 結果 : 準確度上升
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.027479693774541868
Gradient Boosting Reg Score : 0.8046908809458259


# 作業1
* 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?

In [6]:
df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,5000.0,5000.0,5000.0,5000.0
mean,-72.635174,39.981148,-72.619979,39.972626
std,10.01306,5.98976,10.06542,6.016378
min,-82.325968,-73.999735,-82.299888,-74.002015
25%,-73.992029,40.735483,-73.991756,40.733386
50%,-73.981477,40.752824,-73.979938,40.752687
75%,-73.966317,40.767127,-73.963018,40.767461
max,40.763382,41.366138,40.755657,41.366138


In [7]:
def np_getDistance(A , B , C , D):# 先緯度後經度
    ra = 6378140  # radius of equator: meter
    rb = 6356755  # radius of polar: meter
    flatten = 0.003353 # Partial rate of the earth
    # change angle to radians
    
    radLatA = np.radians(A)
    radLonA = np.radians(B)
    radLatB = np.radians(C)
    radLonB = np.radians(D)
 
    pA = np.arctan(rb / ra * np.tan(radLatA))
    pB = np.arctan(rb / ra * np.tan(radLatB))
    
    x = np.arccos( np.multiply(np.sin(pA),np.sin(pB)) + np.multiply(np.multiply(np.cos(pA),np.cos(pB)),np.cos(radLonA - radLonB)))
    c1 = np.multiply((np.sin(x) - x) , np.power((np.sin(pA) + np.sin(pB)),2)) / np.power(np.cos(x / 2),2)
    c2 = np.multiply((np.sin(x) + x) , np.power((np.sin(pA) - np.sin(pB)),2)) / np.power(np.sin(x / 2),2)
    dr = flatten / 8 * (c1 - c2)
    distance = 0.001 * ra * (x + dr)
    return distance

In [8]:
df['distance_real'] = np_getDistance(df['pickup_latitude'],df['pickup_longitude'],df['dropoff_latitude'],df['dropoff_longitude'])
df.head()

  from ipykernel import kernelapp as app


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,longitude_diff,latitude_diff,distance_2D,distance_real
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,0.009452,-0.002437,0.009761,0.842759
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,-0.001244,0.018265,0.018307,2.030996
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0.003756,-0.007222,0.00814,0.862509
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,0.019292,-0.008437,0.021056,1.878741
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.007193,0.03217,0.032964,3.623728


In [9]:
# 檢查是否有缺失值 : 是
np.isnan(df).any()

pickup_longitude     False
pickup_latitude      False
dropoff_longitude    False
dropoff_latitude     False
passenger_count      False
pickup_year          False
pickup_month         False
pickup_day           False
pickup_hour          False
pickup_minute        False
pickup_second        False
longitude_diff       False
latitude_diff        False
distance_2D          False
distance_real         True
dtype: bool

In [10]:
print(df[df['distance_real'].isnull()].shape)
df[df['distance_real'].isnull()]

(133, 15)


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,longitude_diff,latitude_diff,distance_2D,distance_real
30,-73.956138,40.778872,-73.956138,40.778872,1,2012,2,16,20,5,0,0.0,0.0,0.0,
98,0.000000,0.000000,0.000000,0.000000,1,2009,9,21,7,45,0,0.0,0.0,0.0,
100,-73.975015,40.762672,-73.975015,40.762672,2,2014,2,17,18,38,0,0.0,0.0,0.0,
122,-73.978307,40.764860,-73.978307,40.764860,2,2009,7,28,10,58,0,0.0,0.0,0.0,
131,0.000000,0.000000,0.000000,0.000000,1,2011,6,30,6,52,36,0.0,0.0,0.0,
165,0.000000,0.000000,0.000000,0.000000,1,2009,12,11,12,0,40,0.0,0.0,0.0,
171,0.000000,0.000000,0.000000,0.000000,1,2015,1,31,0,8,25,0.0,0.0,0.0,
209,-74.003662,40.740162,-74.003662,40.740162,1,2010,1,24,18,59,0,0.0,0.0,0.0,
224,-74.015805,40.718032,-74.015805,40.718032,5,2010,5,20,16,14,0,0.0,0.0,0.0,
301,-74.006362,40.706027,-74.006362,40.706027,1,2014,2,19,21,41,19,0.0,0.0,0.0,


In [11]:
df['distance_real'] = np.where( ((df['distance_real'].isnull()) & (df['distance_2D'] == 0)), 0, df['distance_real']) 
print(df[df['distance_real'].isnull()].shape)
df[df['distance_real'].isnull()]

(0, 15)


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,longitude_diff,latitude_diff,distance_2D,distance_real


In [12]:
# 檢查是否有缺失值 : 否
np.isnan(df).any()

pickup_longitude     False
pickup_latitude      False
dropoff_longitude    False
dropoff_latitude     False
passenger_count      False
pickup_year          False
pickup_month         False
pickup_day           False
pickup_hour          False
pickup_minute        False
pickup_second        False
longitude_diff       False
latitude_diff        False
distance_2D          False
distance_real        False
dtype: bool

In [13]:
import math
"""
Your Code Here, set new character at df['distance_real']
"""
# 觀察結果 
train_X = scaler.fit_transform(df)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.3681083346622744
Gradient Boosting Reg Score : 0.8023268644244086


# 作業2
* 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?

In [14]:
train_X = scaler.fit_transform(df[['distance_real']])
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')
# 結果還較差

Linear Reg Score : 0.001156236904454966
Gradient Boosting Reg Score : 0.718897093621444
