**Background:**

This is my third forecasting exercise. My previous exercises focused on getting the best result by stacking machine-learning models and using as many features as possible; in this competition the main challenge is the sheer volume of data, since it is impossible to train on all 55 million rows with only 16 GB of memory.

I ended up with a score of 2.995 using less than half of the training data and under an hour of training time. Kindly upvote if you find it useful :)

**References:**

https://www.kaggle.com/code/marinovik/xgboost-new-york-taxi-fares-prediction

https://www.kaggle.com/code/jiridobes/visualization-weather-lgbm-2-90609/notebook

# Import Data

In [None]:
# For processing the data
import numpy as np
import pandas as pd
import datetime as dt

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("white") # set style for seaborn plots

# Machine learning
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Ignore warnings
import warnings 
warnings.filterwarnings('ignore')

# Time-related functions
import time

In [None]:
#Pickle file type use less memory than csv
# train = pd.read_csv("../input/new-york-city-taxi-fare-prediction/train.csv", nrows = 20000000,usecols=[1,2,3,4,5,6,7])
# pd.to_pickle(train, "./dummy.pkl") 

In [None]:
train = pd.read_pickle("../input/train-pickle/dummy.pkl")  
print(train.shape)
train.head()

In [None]:
test =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv',usecols=[1,2,3,4,5,6])
print(test.shape)
test.head()

# Data Preprocessing

In [None]:
def datetime_info(df):
    """Expand ``pickup_datetime`` into calendar columns, modifying ``df`` in place.

    The ``pickup_datetime`` column is parsed with the fixed layout
    ``"%Y-%m-%d %H:%M:%S UTC"``. The columns ``hour``, ``day``, ``month``,
    ``weekday`` and ``year`` are then added in that order, and the
    ``pickup_datetime`` column itself is removed.

    Args:
        df: DataFrame with a ``pickup_datetime`` column in the layout above.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Parse the timestamps with an explicit format string
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")

    stamp = df['pickup_datetime'].dt
    # Add the calendar parts in a fixed order
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        df[part] = getattr(stamp, part)
    # df['week'] = df.pickup_datetime.dt.week

    # The raw timestamp column is removed once the parts are extracted
    del df['pickup_datetime']

    return df

## Test Data

Reviewing the test dataset helps to identify rows in the training set that fall outside its boundaries and should be removed.

In [None]:
#From the chart, confirm which longitudes and latitudes in the training set lie beyond the test-set boundary and should be removed
# f, ax = plt.subplots(1, 2, figsize=(16, 5))
# sns.scatterplot(x="pickup_longitude", y="pickup_latitude", data=test, 
#                 color="#fdb813", ax=ax[0])
# sns.scatterplot(x="dropoff_longitude", y="dropoff_latitude", data=test, 
#                 color="#fdb813", ax=ax[1])
# ax[0].set_title("Pickup Coordinates")
# ax[1].set_title("Dropoff Coordinates")
# plt.show()

In [None]:
test['passenger_count'].value_counts()

## Train data

In [None]:
datetime_info(train)

In [None]:
train.isnull().sum()

In [None]:
# Drop rows with a missing dropoff coordinate; both the longitude and the latitude are checked
train.dropna(axis=0, subset=['dropoff_longitude', 'dropoff_latitude'], inplace=True)

In [None]:
train.isnull().sum().sum()

In [None]:
#Drop coordinates outside the bounding box (longitude -74.4 to -72.8, latitude 40.4 to 41.8), one column pair at a time; the next cell applies the same limits in a single filter
# train.drop(train[(train['dropoff_longitude'] < -74.4) |(train['dropoff_longitude'] > -72.8)].index, axis=0, inplace = True)
# train.drop(train[(train['dropoff_latitude'] < 40.4) |(train['dropoff_latitude'] > 41.8)].index, axis=0, inplace = True)
# train.drop(train[(train['pickup_longitude'] < -74.4) |(train['pickup_longitude'] > -72.8)].index, axis=0, inplace = True)
# train.drop(train[(train['pickup_latitude'] < 40.4) |(train['pickup_latitude'] > 41.8)].index, axis=0, inplace = True)

In [None]:
# Any row whose pickup or dropoff lies outside the test-set boundary
# (longitude -74.4 to -72.8, latitude 40.4 to 41.8) is dropped
outside_box = (
    (train['dropoff_longitude'] < -74.4) | (train['dropoff_longitude'] > -72.8)
    | (train['dropoff_latitude'] < 40.4) | (train['dropoff_latitude'] > 41.8)
    | (train['pickup_longitude'] < -74.4) | (train['pickup_longitude'] > -72.8)
    | (train['pickup_latitude'] < 40.4) | (train['pickup_latitude'] > 41.8)
)
train.drop(index=train.index[outside_box.to_numpy()], inplace=True)
# The mask is only needed for the drop above
del outside_box

In [None]:
# A fare below 2.5 is considered unusual because taxi fares always include a starting fee;
# fares above 500 are dropped as well
bad_fare = (train['fare_amount'] < 2.5) | (train['fare_amount'] > 500)
train.drop(index=train.index[bad_fare.to_numpy()], inplace=True)
# The mask is only needed for the drop above
del bad_fare

In [None]:
# Align passenger counts with the test set: rows with 0 passengers or more than 6 are dropped
odd_count = (train['passenger_count'] == 0) | (train['passenger_count'] > 6)
train.drop(index=train.index[odd_count.to_numpy()], inplace=True)
# The mask is only needed for the drop above
del odd_count

In [None]:
train.dtypes

In [None]:
import gc

gc.collect()

In [None]:
# from geopy.geocoders import Nominatim

# geolocator = Nominatim(user_agent="geoapiExercises")

In [None]:
# initialize Nominatim API
# geolocator = Nominatim(user_agent="geoapiExercises")

# Latitude = "40.721319"
# Longitude = "-73.844311"
 
# location = geolocator.reverse(Latitude+","+Longitude)
 
# # Display
# print(location)
# print(location.raw['address'])

In [None]:
# def find_location(col1,col2):
#     location = geolocator.reverse(col1+","+col2)
#     return location.raw['address']

In [None]:
import seaborn as sns
# sns.boxplot('week','fare_amount',data=train[(train['year']==2014) & (train['fare_amount']<100)])
# plt.show()

In [None]:
#Weekday does not seem to impact the fare, except that 0 & 6 have longer tails
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally
sns.boxplot(x='weekday', y='fare_amount', data=train[(train['year']==2014) & (train['fare_amount']<100)])
plt.show()

In [None]:
#Based on the chart, hours are grouped into 6-20 (normal rate) and 21-5 (higher rate)
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally
sns.boxplot(x='hour', y='fare_amount', data=train[(train['year']==2014) & (train['fare_amount']<100)])
plt.show()

In [None]:
def is_weekend(x):
    """Return 1 when the weekday code ``x`` is 0 or 6, otherwise 0.

    NOTE(review): the codes come from ``Series.dt.weekday``, where pandas
    numbers Monday as 0 and Sunday as 6, so this flags Monday and Sunday
    rather than Saturday and Sunday - confirm that this is intended.
    """
    return 1 if x in (0, 6) else 0

In [None]:
train['weekday'] = train['weekday'].apply(is_weekend)

In [None]:
def is_night(x):
    """Return 0 for hours 6 through 20 inclusive and 1 for any other hour."""
    return 0 if 6 <= x <= 20 else 1

In [None]:
train['hour'] = train['hour'].apply(is_night)

In [None]:
def sphere_dist(pick_lat, pick_lon, drop_lat, drop_lon):
    """Haversine distance in kilometres between a pickup and a dropoff point.

    All four arguments are coordinates in degrees. They are passed straight
    to NumPy functions, so scalars and array-likes can be mixed and are
    combined element-wise.

    Args:
        pick_lat: Latitude of the first point, in degrees.
        pick_lon: Longitude of the first point, in degrees.
        drop_lat: Latitude of the second point, in degrees.
        drop_lon: Longitude of the second point, in degrees.

    Returns:
        The distance on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371

    # Work in radians from here on
    lat1 = np.radians(pick_lat)
    lon1 = np.radians(pick_lon)
    lat2 = np.radians(drop_lat)
    lon2 = np.radians(drop_lon)

    # Differences along each dimension
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1

    # Haversine term, then the angle it corresponds to scaled by the radius
    hav = np.sin(delta_lat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2.0)**2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [None]:
train['distance'] = sphere_dist(train['pickup_latitude'], train['pickup_longitude'],train['dropoff_latitude'], train['dropoff_longitude'])

In [None]:
train['year'] = 2015 - train['year'] 

In [None]:
train['geo_pick'] = train['pickup_longitude'] / train['pickup_latitude']
train['geo_drop'] = train['dropoff_longitude'] / train['dropoff_latitude']
# train.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1,inplace=True)

In [None]:
# Distance from the dropoff point to tourist spots: JFK airport, LaGuardia airport, Newark airport,
# Times Square, Central Park, the Statue of Liberty, Grand Central, the MET museum and the World Trade Center.
# Not all of these features can be used because of the memory limit, so some entries stay disabled.
landmark_coords = (
    ('jfk_dist', 40.6413, -73.7781),
    ('lga_dist', 40.7769, -73.8740),
    ('ewr_dist', 40.6895, -74.1745),
    ('tsq_dist', 40.7580, -73.9855),
    # ('cpk_dist', 40.7812, -73.9665),
    # ('lib_dist', 40.6892, -74.0445),
    # ('gct_dist', 40.7527, -73.9772),
    # ('met_dist', 40.7794, -73.9632),
    # ('wtc_dist', 40.7126, -74.0099),
)
for feature, lm_lat, lm_lon in landmark_coords:
    train[feature] = sphere_dist(train['dropoff_latitude'], train['dropoff_longitude'], lm_lat, lm_lon)

In [None]:
train.dtypes

In [None]:
train.head()

In [None]:
#The minimum taxi fare should be 2.50, and there are many records with 0 distance and high fare

# year = [2009, 2010, 2011, 2012, 2013, 2014, 2015]
# for y in year:
#     print(y)
#     display(train[(train['year']==y) & (train['distance']==0)]['fare_amount'].describe())

In [None]:
#Splitting into x & y can exhaust memory once more features are added
y_train = train['fare_amount']
x_train = train.drop(columns=['fare_amount'])
print(x_train.shape, y_train.shape)
x_train.head()

In [None]:
del train
gc.collect()

## Test Data

In [None]:
datetime_info(test)

In [None]:
test['weekday'] = test['weekday'].apply(is_weekend)
test['hour'] = test['hour'].apply(is_night)

In [None]:
test['year'] = 2015 - test['year']

In [None]:
test['distance'] = sphere_dist(test['pickup_latitude'], test['pickup_longitude'],test['dropoff_latitude'], test['dropoff_longitude'])

In [None]:
#There are 85 rows with 0 distance, meaning the passenger was picked up and dropped off at the same place
# test[test['distance']==0]

In [None]:
test['distance'].describe()

In [None]:
test['geo_pick'] = test['pickup_longitude'] / test['pickup_latitude']
test['geo_drop'] = test['dropoff_longitude'] / test['dropoff_latitude']
# test.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis=1,inplace=True)

In [None]:
# Distance from each dropoff point to the four landmark coordinates below
landmark_coords = (
    ('jfk_dist', 40.6413, -73.7781),
    ('lga_dist', 40.7769, -73.8740),
    ('ewr_dist', 40.6895, -74.1745),
    ('tsq_dist', 40.7580, -73.9855),
)
for feature, lm_lat, lm_lon in landmark_coords:
    test[feature] = sphere_dist(test['dropoff_latitude'], test['dropoff_longitude'], lm_lat, lm_lon)

In [None]:
print(test.shape)
test.head()

# Model

In [None]:
import lightgbm as lgb

#validation best round 1993, 1766 with geo feature, 2000 rounds with extra 4 features score 2.995
# Training parameters handed to lgb.train further down. "num_iterations" is fixed at
# 2000, the round count the note above associates with the 2.995 score.
params = {"objective" : "regression",
          "boosting" : "gbdt", 
          "metric" : "rmse",
          "num_iterations" : 2000,
          "top_k" : 30, 
          "max_depth" : 8, 
          "num_leaves" : 250, 
          "min_data_in_leaf" : 20, 
          "learning_rate" : 0.05,
          "bagging_fraction" : 0.7, 
          "bagging_seed" : 3,
          "bagging_freq" : 5, 
          "feature_fraction" : 0.5, 
          "num_threads" : 4
         }

# Binning parameters handed to lgb.Dataset when the training data is built
dataset_params = {"max_bin" : 200, 
                  "min_data_in_bin" : 3 
                 }

In [None]:
#Test run on a single validation split to determine a fixed number of boosting rounds to use for all years
from sklearn.model_selection import train_test_split

# x_train1, x_val, y_train1, y_val = train_test_split(train.drop(['fare_amount'],axis=1), train['fare_amount'] , random_state=42, test_size=0.2)

# del train
# gc.collect()

In [None]:
# lgb_train = lgb.Dataset(x_train1, y_train1, params=dataset_params,free_raw_data=True)
# lgb_val = lgb.Dataset(x_val, y_val, params=dataset_params, free_raw_data=True)

# lgb_train.save_binary('train_data.bin')
# lgb_train = lgb.Dataset('train_data.bin')

# #When dump validation data to bin will caused bin mapping issue and lgb will stop training
# # lgb_val.save_binary('val_data.bin')
# # lgb_val = lgb.Dataset('val_data.bin')

# #Free up memory
# del x_train1
# del x_val
# gc.collect()

In [None]:
# Build the LightGBM dataset, save it to a binary file and reload it from that file.
# Saving the dataset to a binary file prevents LightGBM from converting the data to
# float64 during training, which causes a huge spike in memory usage.
lgb_train = lgb.Dataset(x_train, y_train, params=dataset_params, free_raw_data=True)
lgb_train.save_binary('train_data.bin')
lgb_train = lgb.Dataset('train_data.bin')
# The in-memory feature frame is released now that the dataset is loaded from the file
del x_train
gc.collect()

In [None]:
# model = lgb.train(params, lgb_train, keep_training_booster=True, valid_sets=[lgb_val], callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=50)])

# Train for the fixed number of rounds set in params. Logging every 50 rounds is requested
# through the log_evaluation callback, because lgb.train no longer accepts the
# verbose_eval argument in LightGBM 4.x.
model = lgb.train(params, lgb_train, keep_training_booster=True, callbacks=[lgb.log_evaluation(period=50)])

In [None]:
pred = model.predict(test)

# Submission

In [None]:
sub =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv',usecols=[0])
sub['fare_amount'] = pred

In [None]:
#Minimum fare should be 2.5, so any lower prediction is raised to that value
sub['fare_amount'] = sub['fare_amount'].clip(lower=2.5)
sub['fare_amount'].describe()

In [None]:
sub.to_csv('taxi_fare_submission.csv',index=False)
sub.head()