In [None]:
import numpy as np 
import pandas as pd 
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Random seed reused wherever this notebook passes a `random_state`.
seed = 42

In [None]:
# Only the first 8,000,000 rows are loaded; 15000000 is kept here as an
# alternative row count (presumably for a larger run -- confirm before using).
train = pd.read_csv(
    '../input/train.csv',
    parse_dates=['pickup_datetime'],
    nrows=8000000,
)

In [None]:
# Discard every row that contains at least one missing value
train = train.dropna(axis=0, how='any')

In [None]:
# Got help from:
# https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
# for location min/max
def select_within_boundingbox(df, BB):
    """Return a boolean mask of the trips that lie entirely inside a box.

    Parameters
    ----------
    df : DataFrame with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.
    BB : sequence indexed as (min_lon, max_lon, min_lat, max_lat).

    Returns
    -------
    Boolean Series, True where both the pickup and the dropoff point are
    within the box (the bounds themselves count as inside).
    """
    lon_lo, lon_hi = BB[0], BB[1]
    lat_lo, lat_hi = BB[2], BB[3]

    # Series.between is inclusive on both ends, matching >= / <=.
    pickup_inside = (df.pickup_longitude.between(lon_lo, lon_hi)
                     & df.pickup_latitude.between(lat_lo, lat_hi))
    dropoff_inside = (df.dropoff_longitude.between(lon_lo, lon_hi)
                      & df.dropoff_latitude.between(lat_lo, lat_hi))
    return pickup_inside & dropoff_inside
            
# Bounding box for the NYC area, laid out as (min_lon, max_lon, min_lat, max_lat)
# to match the indexing used by select_within_boundingbox.
BB = (-74.5, -72.8, 40.5, 41.8)

In [None]:
# Keep only the trips whose pickup and dropoff both fall inside BB
train = train.loc[select_within_boundingbox(train, BB)]

In [None]:
# Remove outliers 
# Code taken from Will Koehrsen
# Do check out his other amazing works!
# https://www.kaggle.com/willkoehrsen/a-walkthrough-and-a-challenge/notebook

# Keep fares within [2.50, 100] (inclusive) and passenger counts below 10,
# applied as a single combined row mask.
train = train.loc[
    train['fare_amount'].between(2.50, 100) & (train['passenger_count'] < 10)
]

In [None]:
def _iso_week(timestamps):
    """Return the ISO week number of a datetime Series with a NumPy dtype."""
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        # `.dt.weekofyear` was removed in pandas 2.0; isocalendar() is its
        # replacement but yields a nullable UInt32 column, so cast it back.
        week = accessor.isocalendar().week
        # Missing timestamps cannot be stored as int64; fall back to float/NaN.
        return week.astype('float64' if week.isna().any() else 'int64')
    # Older pandas without isocalendar(): the legacy accessor still exists.
    return accessor.weekofyear


def transform_time(df):
    """Add calendar and fractional-time features derived from ``pickup_datetime``.

    Parameters
    ----------
    df : DataFrame with a datetime-typed ``pickup_datetime`` column.

    Returns
    -------
    The same DataFrame object; the new columns are added in place.

    Notes
    -----
    ``weekday`` and ``dayofweek`` hold the same values; both are kept so the
    produced column set is unchanged.
    """
    stamp = df['pickup_datetime'].dt

    df['year'] = stamp.year
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    df['second'] = stamp.second
    df['weekday'] = stamp.weekday
    # 1 from noon onwards, 0 before (vectorised cast instead of apply(int)).
    df['PM'] = (df['hour'] >= 12).astype(int)
    df['dayofyear'] = stamp.dayofyear
    df['daysinmonth'] = stamp.daysinmonth
    df['is_leap_year'] = stamp.is_leap_year.astype(int)
    df['weekofyear'] = _iso_week(df['pickup_datetime'])
    df['dayofweek'] = stamp.dayofweek
    # 366 for leap years, 365 otherwise.
    df['daysinyear'] = 365 + df['is_leap_year']

    # Fraction of the day elapsed at pickup, in [0, 1).
    df['frac_day'] = (df['hour'] + (df['minute'] / 60) + (df['second'] / 3600)) / 24
    df['frac_week'] = (df['dayofweek'] + df['frac_day']) / 7
    # day and dayofyear are 1-based, so dividing by (length + 1) keeps the
    # fraction strictly below 1.
    df['frac_month'] = (df['day'] + df['frac_day']) / (df['daysinmonth'] + 1)
    df['frac_year'] = (df['dayofyear'] + df['frac_day']) / (df['daysinyear'] + 1)
    return df

In [None]:
def chebyshev(pickup_long, dropoff_long, pickup_lat, dropoff_lat):
    """Chebyshev distance: the larger of the absolute longitude and latitude gaps.

    Works element-wise on scalars, arrays or Series; the result is expressed
    in the same units as the inputs.
    """
    lon_gap = np.absolute(pickup_long - dropoff_long)
    lat_gap = np.absolute(pickup_lat - dropoff_lat)
    return np.maximum(lon_gap, lat_gap)

def haversine(pickup_long, dropoff_long, pickup_lat, dropoff_lat):
    """Great-circle distance in kilometres between pickup and dropoff points.

    Parameters
    ----------
    pickup_long, dropoff_long, pickup_lat, dropoff_lat :
        Coordinates in decimal degrees (scalars, arrays or Series; mixed
        scalar/vector arguments broadcast).

    Returns
    -------
    Distance along a sphere of radius 6371 km, same shape as the broadcast
    inputs.
    """
    earth_radius_km = 6371.0

    # Every trigonometric term needs radians, including the two latitudes
    # inside the cosine product.
    lat1 = np.radians(dropoff_lat)
    lat2 = np.radians(pickup_lat)
    dlon = np.radians(pickup_long - dropoff_long)
    dlat = np.radians(pickup_lat - dropoff_lat)

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally above 1 for near-antipodal points,
    # which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    return 2.0 * earth_radius_km * np.arcsin(np.sqrt(a))

def rotate_long(long, lat, degree=36.1):
    """Return the first coordinate of (long, lat) rotated counter-clockwise.

    Parameters
    ----------
    long, lat : coordinates (scalars, arrays or Series).
    degree : rotation angle in degrees; defaults to 36.1, the value
        previously hard-coded here.

    Returns
    -------
    ``long * cos(angle) - lat * sin(angle)``.
    """
    radian = np.radians(degree)
    return long * np.cos(radian) - lat * np.sin(radian)

def rotate_lat(long, lat, degree=36.1):
    """Return the second coordinate of (long, lat) rotated counter-clockwise.

    Parameters
    ----------
    long, lat : coordinates (scalars, arrays or Series).
    degree : rotation angle in degrees; defaults to 36.1, the value
        previously hard-coded here.

    Returns
    -------
    ``long * sin(angle) + lat * cos(angle)``.
    """
    radian = np.radians(degree)
    # Second row of a rotation matrix is (sin, +cos). With a minus sign the
    # transform is not a rotation: a zero-degree angle would return -lat.
    return long * np.sin(radian) + lat * np.cos(radian)

In [None]:
def transform_distance(df):
    """Add distance and rotated-coordinate features to ``df`` and return it.

    New columns, in order: absolute latitude/longitude differences, the
    pickup-to-dropoff Chebyshev and Haversine distances, the Chebyshev and
    Haversine distances from four fixed reference points to the dropoff
    location, and the rotated pickup/dropoff coordinates. The frame is
    modified in place.
    """
    p_lon, p_lat = df['pickup_longitude'], df['pickup_latitude']
    d_lon, d_lat = df['dropoff_longitude'], df['dropoff_latitude']

    df['abs_lat_diff'] = (d_lat - p_lat).abs()
    df['abs_lon_diff'] = (d_lon - p_lon).abs()
    df['Chebyshev'] = chebyshev(p_lon, d_lon, p_lat, d_lat)
    df['Haversine'] = haversine(p_lon, d_lon, p_lat, d_lat)

    # Reference points as (longitude, latitude). The tags look like a city
    # centre plus three airport codes -- confirm the intended locations.
    landmarks = (
        ('nyc', (-74.0063889, 40.7141667)),
        ('jfk', (-73.7822222222, 40.6441666667)),
        ('ewr', (-74.175, 40.69)),
        ('lgr', (-73.87, 40.77)),
    )

    # All cheb_* columns are written before any hav_* column so the column
    # order of the frame stays the same.
    for prefix, metric in (('cheb', chebyshev), ('hav', haversine)):
        for tag, (ref_lon, ref_lat) in landmarks:
            df[prefix + '_' + tag] = metric(ref_lon, d_lon, ref_lat, d_lat)

    df['rotate_pickup_longitude'] = rotate_long(p_lon, p_lat)
    df['rotate_pickup_latitude'] = rotate_lat(p_lon, p_lat)
    df['rotate_dropoff_longitude'] = rotate_long(d_lon, d_lat)
    df['rotate_dropoff_latitude'] = rotate_lat(d_lon, d_lat)
    return df

In [None]:
# Engineer the time and distance features, then drop the raw timestamp and
# the row identifier so that only model inputs remain.
train = transform_distance(transform_time(train))
train = train.drop(columns=['pickup_datetime', 'key'])

# Split the label off the feature frame.
target = train.pop('fare_amount')

In [None]:
# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=seed,
)

In [None]:
# Report the dimensions of each split
for label, part in (('X_train', X_train), ('X_val', X_val),
                    ('y_train', y_train), ('y_val', y_val)):
    print(label + ' shape:', part.shape)

In [None]:
# Hyper-parameters collected in one place, then handed to the regressor.
xgb_params = {
    'eval_metric': 'rmse',
    'max_depth': 7,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 1,
    'colsample_bytree': 0.9,
    'random_state': seed,
    'n_jobs': -1,
}
reg = xgb.XGBRegressor(**xgb_params)

In [None]:
# Fit on the training split only; X_val / y_val are not passed to the model here.
reg.fit(X_train, y_train)

In [None]:
# RMSE on the rows the model was fitted on ...
train_pred = reg.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
print('Training RMSE:', rmse_train)

# ... and on the held-out validation rows.
val_pred = reg.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_pred))
print('Validation RMSE:', rmse_val)

In [None]:
# Load the test file and run it through the same feature pipeline as train.
test = pd.read_csv('../input/test.csv', parse_dates=['pickup_datetime'])
test = transform_distance(transform_time(test))
test = test.drop(columns=['pickup_datetime'])

# Keep the row keys aside for the submission; they are not a model input.
test_index = test.pop('key')

In [None]:
# Predicted values for every row of the test feature frame.
result = reg.predict(test)

In [None]:
# Two-column frame (key, fare_amount) written without the index.
submission = test_index.to_frame(name='key').assign(fare_amount=result.flatten())
submission.to_csv('solution.csv', index=False)

In [None]:
# CV RMSE score (kept disabled; uncomment to run 5-fold cross-validation).
# The estimator is `reg` -- no `clf` is defined in this notebook.
# cv = cross_val_score(reg, train, target, cv = 5, scoring = 'neg_mean_squared_error')
# cv = np.sqrt(-1 * cv)
# print(cv)
# print(cv.mean())

In [None]:
print('Plotting Feature Importance')
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(reg, max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
ax.set_title("XGBoost - Feature Importance", fontsize=15)
# Save through the figure handle *before* show(): once show() returns, the
# current figure is gone and plt.savefig would write an empty image.
fig.savefig('feature_importance.png')
plt.show()