In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [4]:
sample_df = pd.read_csv("train.csv")

In [5]:
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [19]:
#Convert character variables to numeric 
f = lambda x: 0 if x == 'N' else 1

sample_df["store_and_fwd_flag"] = sample_df["store_and_fwd_flag"].apply(lambda x: f(x))   

In [20]:
#First, convert datetime strings into datetime
sample_df["dropoff_datetime"] = pd.to_datetime(sample_df["dropoff_datetime"], format='%Y-%m-%d %H:%M:%S')
sample_df["pickup_datetime"] = pd.to_datetime(sample_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')

In [21]:
#Now construct other variables, like month, date, etc.
sample_df["pickup_month"] = sample_df["pickup_datetime"].dt.month
sample_df["pickup_day"] = sample_df["pickup_datetime"].dt.day
sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday #sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday_name
sample_df["pickup_hour"] = sample_df["pickup_datetime"].dt.hour
sample_df["pickup_minute"] = sample_df["pickup_datetime"].dt.minute

In [22]:
#Get latitude and longitude differences 
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

In [23]:
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60)) 

In [24]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [25]:
sample_df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,0,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,0,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,1,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,0,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,0,3,26,5,13,30,-0.010689,0.00013,0.747485


In [35]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("cabaki/knycmetars2016")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\DELL\.cache\kagglehub\datasets\cabaki\knycmetars2016\versions\1


In [36]:

import os
# Check files in the dataset folder
print("Files in dataset:", os.listdir(path))

# Pick the CSV file (replace with the actual filename you see)
csv_file = [f for f in os.listdir(path) if f.endswith(".csv")][0]
csv_path = os.path.join(path, csv_file)

# Load the CSV
weather_df = pd.read_csv(csv_path)

# Show first 5 rows
print(weather_df.head())

# Optional: see basic info about dataset
print(weather_df.info())


Files in dataset: ['KNYC_Metars.csv']
                  Time  Temp.  Windchill  Heat Index  Humidity  Pressure  \
0  2015-12-31 02:00:00    7.8        7.1         NaN      0.89    1017.0   
1  2015-12-31 03:00:00    7.2        5.9         NaN      0.90    1016.5   
2  2015-12-31 04:00:00    7.2        NaN         NaN      0.90    1016.7   
3  2015-12-31 05:00:00    7.2        5.9         NaN      0.86    1015.9   
4  2015-12-31 06:00:00    7.2        6.4         NaN      0.90    1016.2   

   Dew Point  Visibility  Wind Dir  Wind Speed  Gust Speed  Precip Events  \
0        6.1         8.0       NNE         5.6         0.0     0.8    NaN   
1        5.6        12.9  Variable         7.4         0.0     0.3    NaN   
2        5.6        12.9      Calm         0.0         0.0     0.0    NaN   
3        5.0        14.5        NW         7.4         0.0     0.0    NaN   
4        5.6        11.3      West         5.6         0.0     0.0    NaN   

  Conditions  
0   Overcast  
1   Overcast

In [37]:
weather_df["Time"] = pd.to_datetime(weather_df["Time"])
weather_df["pickup_year"] = weather_df["Time"].dt.year
weather_df["pickup_month"] = weather_df["Time"].dt.month
weather_df["pickup_day"] = weather_df["Time"].dt.day
weather_df["pickup_hour"] = weather_df["Time"].dt.hour

In [38]:
weather_df.head(2)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions,pickup_year,pickup_month,pickup_day,pickup_hour
0,2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast,2015,12,31,2
1,2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast,2015,12,31,3


In [39]:
#Since I am looking at year 2016, I want to keep everything in this year, do this check just in case.
weather_df = weather_df[weather_df["pickup_year"] == 2016]
weather_df.head(2)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions,pickup_year,pickup_month,pickup_day,pickup_hour
22,2016-01-01 00:00:00,5.6,3.2,,0.58,1018.8,-2.2,16.1,WNW,11.1,0.0,0.0,,Overcast,2016,1,1,0
23,2016-01-01 01:00:00,5.6,4.0,,0.53,1018.5,-3.3,16.1,Variable,7.4,0.0,0.0,,Overcast,2016,1,1,1


In [40]:
#Merge weather data with my dataframe
sample_df = pd.merge(sample_df, weather_df[["Temp.", "pickup_month", "pickup_day", "pickup_hour", "Windchill", 
                                            "Humidity", "Pressure", "Dew Point", "Visibility", "Wind Dir", 
                                            "Wind Speed", "Gust Speed", "Precip", "Conditions"]], 
                                             how = "left", on = ["pickup_month", "pickup_day", "pickup_hour"])


In [41]:
sample_df.head(3)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance,Temp.,Windchill,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Conditions
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,0,3,14,0,17,24,-0.002335,0.017525,1.372146,4.4,-0.5,0.86,1017.5,2.2,8.0,ENE,27.8,57.4,0.3,Overcast
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,0,6,12,6,0,43,-0.007412,-0.019066,1.82944,28.9,,0.53,1006.6,18.3,16.1,West,7.4,0.0,0.0,Unknown
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,1,1,19,1,11,35,-0.053852,-0.026306,5.538397,-6.7,-14.3,0.46,1016.3,-16.7,16.1,West,24.1,46.3,0.0,Clear


In [44]:
#Look at weather conditions
sample_df["Conditions"].unique()

array(['Overcast', 'Unknown', 'Clear', 'Heavy Rain', 'Haze',
       'Partly Cloudy', 'Mostly Cloudy', 'Light Rain', 'Light Snow',
       'Scattered Clouds', 'Snow', 'Rain', 'Heavy Snow',
       'Light Freezing Rain', 'Light Freezing Fog'], dtype=object)

In [45]:
#Codify weather conditions into buckets
sample_df["Conditions"] = sample_df["Conditions"].fillna('Unknown')

weather_dict = {'Overcast' : 0, 
                'Haze' : 0,
                'Partly Cloudy' : 0, 
                'Mostly Cloudy' : 0, 
                'Scattered Clouds' : 0, 
                'Light Freezing Fog' : 0,
                
                'Unknown' : 1,
                'Clear' : 2, 
                
                'Heavy Rain' : 3, 
                'Rain' : 3, 
                'Light Freezing Rain' : 3,
                'Light Rain' : 3, 
                
                'Heavy Snow' : 4,
                'Light Snow' : 4,
                'Snow' : 4}

In [46]:
#Transform the column
sample_df["Conditions"] = sample_df["Conditions"].apply(lambda x: weather_dict[x]) 

In [47]:
#Look at wind directions
sample_df["Wind Dir"].unique()

array(['ENE', 'West', 'South', 'Variable', 'SW', 'Calm', 'North', 'WSW',
       'East', nan, 'WNW', 'NW', 'ESE', 'NE', 'SSW', 'SSE', 'SE', 'NNE',
       'NNW'], dtype=object)

In [48]:
#Codify wind directions
sample_df["Wind Dir"] = sample_df["Wind Dir"].fillna('Unknown')

wind_dir_dict = {'East' : 0,
                 'ENE' : 0, 
                 'ESE' : 0, 
                 
                 'West' : 1, 
                 'WSW' : 1,
                 'WNW' : 1,
                 
                 'South' : 2, 
                 'SSE' : 2,   
                 'SSW' : 2,
                 
                 'North' : 3, 
                 'NNE' : 3, 
                 'NNW' : 3,
                 
                 'Variable' : 4, 
                 'Calm' : 5, 
                 'SW' : 6, 
                 'NW' : 7, 
                 'NE' : 8, 
                 'SE' : 9, 
                 'Unknown' : 10
                }

In [49]:
#And transform the column
sample_df["Wind Dir"] = sample_df["Wind Dir"].apply(lambda x: wind_dir_dict[x]) 

In [50]:
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance,Temp.,Windchill,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Conditions
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,0,3,14,0,17,24,-0.002335,0.017525,1.372146,4.4,-0.5,0.86,1017.5,2.2,8.0,0,27.8,57.4,0.3,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,0,6,12,6,0,43,-0.007412,-0.019066,1.82944,28.9,,0.53,1006.6,18.3,16.1,1,7.4,0.0,0.0,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,1,1,19,1,11,35,-0.053852,-0.026306,5.538397,-6.7,-14.3,0.46,1016.3,-16.7,16.1,1,24.1,46.3,0.0,2
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,0,4,6,2,19,32,-0.013252,-0.002228,1.069567,7.2,3.3,0.39,1019.1,-6.1,16.1,2,25.9,35.2,0.0,2
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,0,3,26,5,13,30,-0.010689,0.00013,0.747485,9.4,,0.46,1026.9,-1.7,16.1,4,9.3,0.0,0.0,2


In [51]:
X = sample_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"], axis=1)
y = sample_df["trip_duration"]

In [52]:
#Split the data into training, test, and valdiation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [53]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5


In [54]:
#XGBoost parameters 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [55]:
nrounds = 2000

In [56]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

#this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [57]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "feval", "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	eval-rmse:0.21029	train-rmse:0.21149
[1]	eval-rmse:0.20651	train-rmse:0.20707
[2]	eval-rmse:0.20301	train-rmse:0.20291
[3]	eval-rmse:0.19973	train-rmse:0.19889
[4]	eval-rmse:0.19714	train-rmse:0.19556
[5]	eval-rmse:0.19446	train-rmse:0.19227
[6]	eval-rmse:0.19230	train-rmse:0.18948
[7]	eval-rmse:0.19033	train-rmse:0.18672
[8]	eval-rmse:0.18824	train-rmse:0.18379
[9]	eval-rmse:0.18633	train-rmse:0.18114
[10]	eval-rmse:0.18457	train-rmse:0.17880
[11]	eval-rmse:0.18284	train-rmse:0.17619
[12]	eval-rmse:0.18121	train-rmse:0.17378
[13]	eval-rmse:0.17974	train-rmse:0.17151
[14]	eval-rmse:0.17853	train-rmse:0.16976
[15]	eval-rmse:0.17734	train-rmse:0.16804
[16]	eval-rmse:0.17642	train-rmse:0.16659
[17]	eval-rmse:0.17557	train-rmse:0.16496
[18]	eval-rmse:0.17465	train-rmse:0.16355
[19]	eval-rmse:0.17380	train-rmse:0.16212
[20]	eval-rmse:0.17293	train-rmse:0.16053
[21]	eval-rmse:0.17221	train-rmse:0.15901
[22]	eval-rmse:0.17156	train-rmse:0.15770
[23]	eval-rmse:0.17104	train-rmse:0.15681
[2

In [58]:
#Test predictions
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [59]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - y_test)).mean()
mae

np.float64(0.10072644062033889)

In [60]:
#Take a look at feature importance
feature_scores = gbm.get_fscore()
feature_scores

{'passenger_count': 362512.0,
 'pickup_longitude': 1045040.0,
 'pickup_latitude': 946540.0,
 'dropoff_longitude': 894464.0,
 'dropoff_latitude': 831815.0,
 'store_and_fwd_flag': 9603.0,
 'pickup_month': 227776.0,
 'pickup_day': 413667.0,
 'pickup_weekday': 259230.0,
 'pickup_hour': 459645.0,
 'pickup_minute': 668985.0,
 'latitude_difference': 643347.0,
 'longitude_difference': 575467.0,
 'trip_distance': 621785.0,
 'Temp.': 430997.0,
 'Windchill': 222424.0,
 'Humidity': 432688.0,
 'Pressure': 504796.0,
 'Dew Point': 352793.0,
 'Visibility': 111419.0,
 'Wind Dir': 183852.0,
 'Wind Speed': 222174.0,
 'Gust Speed': 97176.0,
 'Precip': 28473.0,
 'Conditions': 68797.0}

In [61]:
#This is not very telling, so let's scale the features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'passenger_count': 0.03414942256415522,
 'pickup_longitude': 0.09844505162986265,
 'pickup_latitude': 0.08916613638686577,
 'dropoff_longitude': 0.08426046338996926,
 'dropoff_latitude': 0.07835879068886761,
 'store_and_fwd_flag': 0.0009046235845532909,
 'pickup_month': 0.02145699693795797,
 'pickup_day': 0.03896833534847508,
 'pickup_weekday': 0.02442003247149324,
 'pickup_hour': 0.04329956341997265,
 'pickup_minute': 0.06301984887143428,
 'latitude_difference': 0.060604693247069255,
 'longitude_difference': 0.05421024891514409,
 'trip_distance': 0.05857350572961241,
 'Temp.': 0.04060085921813128,
 'Windchill': 0.02095282684272427,
 'Humidity': 0.04076015511331817,
 'Pressure': 0.04755288628430314,
 'Dew Point': 0.03323387152611779,
 'Visibility': 0.010495913273700211,
 'Wind Dir': 0.01731926015487781,
 'Wind Speed': 0.020929276296422248,
 'Gust Speed': 0.009154191549781381,
 'Precip': 0.002682218819429954,
 'Conditions': 0.006480827735760987}

In [62]:
filename = "xgb_model.sav"
pickle.dump(gbm, open(filename, 'wb'))