In [None]:
import pandas as pd 
import numpy as np
import datetime as dt
import os
from sklearn.model_selection import train_test_split

import lightgbm as lgb
import gc

from tqdm import tqdm

In [None]:
# Input file locations (relative to the notebook's working directory).
TRAIN_PATH = '../input/new-york-city-taxi-fare-prediction/train.csv'
TEST_PATH = '../input/new-york-city-taxi-fare-prediction/test.csv'
SAMPLE_SUBMISSION_PATH = '../input/new-york-city-taxi-fare-prediction/sample_submission.csv'
# Columns read from the training CSV.
cols = ['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
# Rows per chunk when streaming the training CSV. pandas documents `chunksize`
# as an int, so use an integer literal rather than the float 5e6.
chunksize = 5000000

# Reduce memory usage -> load the train data in chunks

In [None]:
df_list = []  # collects one parsed DataFrame per CSV chunk

read_options = dict(usecols=cols, chunksize=chunksize, dtype={'passenger_count': np.int8})
for df_chunk in tqdm(pd.read_csv(TRAIN_PATH, **read_options)):
    # Keep only the first 16 characters ('YYYY-MM-DD HH:MM') so the explicit
    # format below matches, then parse the column into UTC timestamps.
    df_chunk['pickup_datetime'] = pd.to_datetime(
        df_chunk['pickup_datetime'].str.slice(0, 16),
        utc=True,
        format='%Y-%m-%d %H:%M',
    )
    df_list.append(df_chunk)
    


In [None]:
# Stitch the per-chunk frames back together into a single training frame.
train_df = pd.concat(df_list)
print(train_df.shape)
# Keep only the first 20,000,000 rows (positional slice).
train_df = train_df.iloc[:20000000]
# The chunk list is no longer needed; drop the name so its memory can be released.
del df_list

In [None]:
def calculate_haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance between pickup and dropoff coordinates.

    All four arguments are angles in degrees. They may be scalars or array-likes
    (e.g. pandas Series); the computation is element-wise, so the result has the
    broadcast shape of the inputs. The distance is expressed in kilometres
    (earth radius taken as 6371 km).
    """
    #Define earth radius (km)
    R_earth = 6371

    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon

    #Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at 1. NaN inputs still
    # propagate as NaN.
    a = np.minimum(a, 1.0)

    return 2 * R_earth * np.arcsin(np.sqrt(a))

In [None]:
def preprocessing(df, bbox, is_train=True):
    """
    Clean a ride frame and derive model features; used for both train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide pickup_longitude, pickup_latitude, dropoff_longitude,
        dropoff_latitude, passenger_count and a datetime-typed pickup_datetime
        column. The frame passed in is left untouched.
    bbox : sequence of four numbers
        (min longitude, max longitude, min latitude, max latitude). Bounds are
        inclusive and are only applied when ``is_train`` is true.
    is_train : bool, default True
        When true, rows whose pickup or dropoff lies outside ``bbox`` and rows
        whose passenger_count is not in 1..6 are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The previous index is kept as a
        leading column (named 'index' for an unnamed index), pickup_datetime is
        replaced by hour/day/month/weekday/year columns, and a float16
        'haversine' column holds the pickup-to-dropoff distance.
    """
    # Rows containing any missing value are always discarded.
    keep = df.notna().all(axis=1)

    if is_train:
        lon_min, lon_max, lat_min, lat_max = bbox[0], bbox[1], bbox[2], bbox[3]

        # 2. longitude & latitude: both ends of the ride must lie inside the bbox.
        keep &= (df.pickup_longitude.between(lon_min, lon_max) &
                 df.pickup_latitude.between(lat_min, lat_max) &
                 df.dropoff_longitude.between(lon_min, lon_max) &
                 df.dropoff_latitude.between(lat_min, lat_max))

        # 3. passenger
        # NYC Taxi : 1 ~ 4, YelloRide : 1 ~ 7, Uber : 1 ~ 7
        # In test set, maximum value of passenger_count == 6 -> passenger_count 7 : 13 rows in train set-> drop
        keep &= (df.passenger_count > 0) & (df.passenger_count < 7)

    # A single boolean selection followed by a non-inplace reset_index yields an
    # independent frame: the caller's data is never mutated and the column
    # assignments below do not hit a slice of another frame.
    df = df[keep].reset_index()

    # 4. datetime: replace the timestamp with compact integer calendar features.
    pickup = df.pop('pickup_datetime')
    df['hour'] = pickup.dt.hour.astype(np.int8)
    df['day'] = pickup.dt.day.astype(np.int8)
    df['month'] = pickup.dt.month.astype(np.int8)
    df['weekday'] = pickup.dt.weekday.astype(np.int8)
    df['year'] = pickup.dt.year.astype(np.int16)
    del pickup

    gc.collect()

    # 5. Haversine distance between pickup and dropoff, stored as float16 to save memory.
    df['haversine'] = calculate_haversine(df.pickup_latitude, df.pickup_longitude,
                                          df.dropoff_latitude, df.dropoff_longitude).astype(np.float16)

    return df
    

# Data cleaning, make haversine distance column

In [None]:
#tqdm.pandas()

# Load the test set with the same columns as train, minus the target.
cols_test = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
test = pd.read_csv(TEST_PATH, usecols=cols_test,dtype={'passenger_count':np.int8})

# Same timestamp handling as for the train chunks: truncate to minutes, parse as UTC.
test['pickup_datetime'] = test['pickup_datetime'].str.slice(0, 16)
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')

# Bounding box spanned by every pickup/dropoff coordinate in the test set:
# (min longitude, max longitude, min latitude, max latitude).
box = (min(test.pickup_longitude.min(), test.dropoff_longitude.min()),
        max(test.pickup_longitude.max(), test.dropoff_longitude.max()),
        min(test.pickup_latitude.min(), test.dropoff_latitude.min()),
        max(test.pickup_latitude.max(), test.dropoff_latitude.max())
        )

# is_train=False: derive features only, no row filtering beyond NaN removal.
test = preprocessing(test, box, False)
test.drop(columns=['index'], inplace=True)


# 1. fare -> initial charge of NYC taxi : 2.5
train_df = train_df[(train_df.fare_amount >= 2.5) & (train_df.fare_amount < 100)]
# Train rows are additionally restricted to the test set's bounding box.
train = preprocessing(train_df, box)
len_train = train.shape[0]

# Release the raw frame first, then collect, so the collection pass runs
# after the reference is gone rather than before it.
del train_df
gc.collect()


# Add Distance From Popular Landmarks

In [None]:
# Landmark positions, each written as a (latitude, longitude) pair.

# JFK airport
jfk_coord = (40.639722, -73.778889)
# Newark Liberty International Airport
ewr_coord = (40.6925, -74.168611)
# LaGuardia airport
lga_coord = (40.77725, -73.872611)
# MET museum
met_coord = (40.7794, -73.9632)
# World Trade Center
wtc_coord = (40.7126, -74.0099)

In [None]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    """
    Add a '<landmark_name>_drop_distance' column to `df` in place.

    The column holds the haversine distance from the landmark to each row's
    dropoff point, stored as float16. Despite its name, `landmark_lonlat` is
    unpacked as (latitude, longitude). Nothing is returned.
    """
    landmark_lat, landmark_lon = landmark_lonlat
    column = landmark_name + '_drop_distance'
    distance = calculate_haversine(landmark_lat, landmark_lon,
                                   df.dropoff_latitude, df.dropoff_longitude)
    df[column] = distance.astype(np.float16)

In [None]:
# Add one dropoff-distance feature per landmark to both frames.
# The coordinate constants are named *_coord; the previous references to
# `met_lonlat` / `wtc_lonlat` were undefined names and raised NameError.
for a_df in [train, test]:
    for name, lonlat in [('jfk', jfk_coord), ('lga', lga_coord), ('ewr', ewr_coord), ('met', met_coord), ('wtc', wtc_coord)]:
        add_landmark_dropoff_distance(a_df, name, lonlat)

# LightGBM

In [None]:
# Lightgbm
# # len_train = 54073228
# # val_pred = np.zeros(len_train)
# # test = pd.read_csv(AFTER_TEST_PATH)
# test_pred = np.zeros(test.shape[0])
# verbose = 1000
# random_seed = 42
# # chunksize = 13500000
# test_size = 0.1 # -> 0.01 ?
# # k = len_train // chunksize + 1
# # k = len_train // chunksize
# idx = 0

In [None]:
# Lightgbm
# verbose = 100
# y = train['fare_amount']
# train.drop(columns=['fare_amount', 'index'], inplace=True)
# x_train, x_test, y_train, y_test = train_test_split(train, y, random_state=random_seed, test_size=test_size)
    
# # Hyperparameters Tuning in LightGBM docs

# 
# params = {
#         'objective': 'regression',
#         "boosting" : "gbdt",
#         'metric': 'rmse',
#         'max_depth' : 7,
#         #'min_data_in_leaf' : 20,
#         'num_leaves':63,
#         'learning_rate': 0.05,
#         #'subsample': 0.8,
#         #"bagging_fraction" : 0.7, 
#         #"bagging_seed" : 3,
#         #"bagging_freq" : 5, 
#         #"feature_fraction" : 0.5, 
#         "num_threads" : 4,
#         # 'max_bin' : 50,      # default 255, smaller -> deal with overfitting
# #         'min_split_gain': 0.5,
# #         'min_child_weight': 1,
# #         'min_child_samples': 10,
# #         'scale_pos_weight':1,
# #         'zero_as_missing': True,
#         'seed':random_seed,
#         # 'num_rounds':50000,
#         'device': 'gpu',
#         'gpu_platform_id': 0,
#         'gpu_device_id': 0,
#         #'random_state':random_seed
        
# }
# train_set = lgb.Dataset(x_train, label=y_train)
# del x_train, y_train
# valid_set = lgb.Dataset(x_test, label=y_test)
# gc.collect()
    
# evals_result = {}
# model = lgb.train(params
#                         , train_set                     
#                         , num_boost_round=3500
#                         , valid_sets=[valid_set]
#                         , verbose_eval=verbose                        
#                         , early_stopping_rounds=125
#                         , callbacks=[lgb.record_evaluation(evals_result), lgb.early_stopping(stopping_rounds=10)]
#                         )

# validation = model.predict(x_test, num_iteration=model.best_iteration)
# del x_test
# test_prediction = model.predict(test, num_iteration=model.best_iteration)
# print("validation rmse", "{0:.5f}".format(np.sqrt(np.mean((validation - y_test)**2))))
# del y_test
# # test_pred += test_prediction / k

In [None]:
# lightgbm
# submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# submission['fare_amount'] = test_prediction
# submission['fare_amount'] = submission['fare_amount'].apply(lambda x:2.5 if x < 2.5 else x)
# submission.to_csv("submission.csv", index=False)

# XGBoost

In [None]:
import xgboost as xgb 
def XGBmodel(x_train,x_test,y_train,y_test,params):
    """
    Train an XGBoost booster on (x_train, y_train) and return it.

    (x_test, y_test) is passed as the single evaluation set, labelled 'test'.
    Training runs for at most 5000 boosting rounds with early stopping after
    10 rounds.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_test, label=y_test)
    watchlist = [(dvalid, 'test')]
    return xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        early_stopping_rounds=10,
        evals=watchlist,
    )

# XGBoost settings. The first four values are the result of tuning with CV.
params = {
    'max_depth': 8,
    'eta': 0.05,
    'subsample': 1,
    'colsample_bytree': 0.8,
    # Other parameters
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
}
verbose = 1000
random_seed = 42
test_size = 0.05

# Split the target off the feature frame and discard the leftover 'index' column.
# Both steps modify `train` in place.
y = train.pop('fare_amount')
train.drop(columns=['index'], inplace=True)

# Hold out `test_size` of the rows as the evaluation set used for early stopping.
x_train, x_test, y_train, y_test = train_test_split(
    train, y, random_state=random_seed, test_size=test_size
)

model = XGBmodel(x_train, x_test, y_train, y_test, params)

In [None]:
# Start from the sample submission and overwrite its fare column with predictions.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# Predict with the trees up to the model's recorded best limit.
# NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated in newer XGBoost
# releases -- confirm against the installed version.
test_pred = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit)
submission['fare_amount'] = test_pred
# Floor predictions at 2.5 with a vectorized clip instead of a per-row Python lambda.
submission['fare_amount'] = submission['fare_amount'].clip(lower=2.5)
submission.to_csv("submission.csv", index=False)