In [1]:
import pandas as pd
import random

In [2]:
sample_frac = 0.01

selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}
def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)
df = pd.read_csv("train.csv", 
                 usecols=selected_cols, 
                 dtype=dtypes, 
                 parse_dates=['pickup_datetime'], 
                 skiprows=skip_row)

In [3]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
1,8.0,2013-01-17 17:22:00+00:00,0.0,0.0,0.0,0.0,2.0
2,8.9,2011-06-15 18:07:00+00:00,-73.99633,40.753223,-73.978897,40.766963,3.0
3,6.9,2009-12-14 12:33:00+00:00,-73.98243,40.745747,-73.98243,40.745747,1.0
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1.0


In [4]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [5]:
test_df = pd.read_csv("test.csv",dtype=dtypes, parse_dates=['pickup_datetime'])

In [6]:
test_df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0


In [7]:
test_df.dtypes

key                               object
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float32
pickup_latitude                  float32
dropoff_longitude                float32
dropoff_latitude                 float64
passenger_count                  float32
dtype: object

In [8]:
len(df)

552450

In [9]:
len(test_df)

9914

In [10]:
# EXPLORE Dataset
df["passenger_count"] = df["passenger_count"].astype(int)
df.dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552450 entries, 0 to 552449
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        552450 non-null  float32            
 1   pickup_datetime    552450 non-null  datetime64[ns, UTC]
 2   pickup_longitude   552450 non-null  float32            
 3   pickup_latitude    552450 non-null  float32            
 4   dropoff_longitude  552450 non-null  float32            
 5   dropoff_latitude   552450 non-null  float64            
 6   passenger_count    552450 non-null  int32              
dtypes: datetime64[ns, UTC](1), float32(4), float64(1), int32(1)
memory usage: 19.0 MB


In [11]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,552450.0,552450.0,552450.0,552450.0,552450.0,552450.0
mean,11.354059,-72.497063,39.9105,-72.504326,39.934257,1.684983
std,9.811924,11.618246,8.061114,12.074346,9.255058,1.337664
min,-52.0,-1183.362793,-3084.490234,-3356.729736,-2073.150613,0.0
25%,6.0,-73.99202,40.734875,-73.991425,40.733988,1.0
50%,8.5,-73.981819,40.752621,-73.980179,40.753102,1.0
75%,12.5,-73.967155,40.767036,-73.963737,40.76806,2.0
max,499.0,2420.209473,404.983337,2467.752686,3351.403027,208.0


# Some observations from the data 

#### by looking at the data , we can see the range of the fare amount - $6 to $12.50 (see the 25% , 50% , 75%) . also the max fare amount $499. and min is -$52 (which is not possible , it needs some data cleaning)
### IMPORTANT : 50% of the rides cost less than $8.5 and 75% of the rides cost $12.5. so when predicting taxi fares, the o/p or answer should be in the +/- $3 range , if not we could be way off on our predictions
### pickup longitude & latitude has negative values , which cannot actually be right, hence this requires some data cleaning
### passenger count ranges from 0 to 208 , which again is not right , hence data cleaning is required.
### There are no missing data




In [12]:
# let's check the date time once
df['pickup_datetime'].min(),df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:11:46+0000', tz='UTC'),
 Timestamp('2015-06-30 23:59:54+0000', tz='UTC'))

In [13]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974716,40.751041,-73.973656,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.25219,40.573143,-74.263245,40.568973,1.0
25%,-73.9925,40.736125,-73.991249,40.735254,1.0
50%,-73.982327,40.753052,-73.980015,40.754065,1.0
75%,-73.968012,40.767113,-73.964062,40.768757,2.0
max,-72.986534,41.709557,-72.990967,41.696683,6.0


In [14]:
#some observations on the test dataset
#1. no missing value
#2. 1 to 6 passengers
#3. Latitudes 40 - 42
#4. longitudes 
#5. IMPORTANT : the test df is considered to be actual actual real data, and when the predictions are made , they are compared with the test data to get the accuracy level 
# be mindful of the test dataset

In [15]:
# let's check the date time of the test dataset once
test_df['pickup_datetime'].min(),test_df['pickup_datetime'].max()

(Timestamp('2009-01-01 11:04:24+0000', tz='UTC'),
 Timestamp('2015-06-30 20:03:50+0000', tz='UTC'))

#### interesting component here is that the start and end date is the same for testing and training dataset 

# EXPLORATORY DATA ANALYSIS & VISUALISATION 

### Let's get some answers for the questions from the dataset
What is the busiest day of the week? 

What is the busiest time of the day?

In which month are fares the highest?

Which pickup locations have the highest fares?

Which drop locations have the highest fares?

What is the average ride distance?

In [16]:
df['pickup_datetime']

0        2014-12-06 20:36:22+00:00
1        2013-01-17 17:22:00+00:00
2        2011-06-15 18:07:00+00:00
3        2009-12-14 12:33:00+00:00
4        2013-11-06 11:26:54+00:00
                    ...           
552445   2014-02-06 23:59:45+00:00
552446   2015-01-05 15:29:08+00:00
552447   2013-02-17 22:27:00+00:00
552448   2013-01-27 12:41:00+00:00
552449   2014-10-18 07:51:00+00:00
Name: pickup_datetime, Length: 552450, dtype: datetime64[ns, UTC]

In [17]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

#   PREPARE DATASET FOR TRAINING 

##### SPLITING THE DATAFRAME INTO TRAINING AND VALIDATION SET

In [18]:
from sklearn.model_selection import train_test_split

train_df , val_df = train_test_split(df , test_size=0.2 , random_state= 42)

len(train_df) , len(val_df)

(441960, 110490)

In [19]:
# removing all the empty rows or rows which has a missing value 
train_df = train_df.dropna()
val_df = val_df.dropna()

len(train_df) , len(val_df)

(441960, 110490)

In [20]:
#extract input and output
df.columns
input_cols = [ 'pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'passenger_count']
target_cols = ['fare_amount']


In [21]:
train_inputs = train_df[input_cols]
train_target = train_df[target_cols]

In [22]:
val_inputs = val_df[input_cols]
val_target = val_df[target_cols]

In [23]:
#Test
test_inputs = test_df[input_cols]
#test_target = test_df[target_cols]
test_inputs


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,-73.973320,40.763805,-73.981430,40.743835,1.0
1,-73.986862,40.719383,-73.998886,40.739201,1.0
2,-73.982521,40.751259,-73.979652,40.746139,1.0
3,-73.981163,40.767807,-73.990448,40.751635,1.0
4,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...
9909,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,-73.985573,40.735432,-73.939178,40.801731,6.0


# Train Hard Coded & Baseline Models

#### something that you should always do !!
#### Always create a hardcoded or a baseline model(linear regression) to establish the minimum score that the proper ML should match/beat with

In [24]:
import numpy as np

In [25]:
class MeanRegression : # hard coded model , that gives just the mean/average value 
    def fit(self,inputs, target) : 
        self.mean = target.mean()
        print(self.mean)
    
    def predict(self, input):
        return np.full(input.shape[0],self.mean)



In [26]:
MRmodel = MeanRegression()
MRmodel.fit(train_inputs,train_target)
train_preds = MRmodel.predict(train_inputs)


train_preds


fare_amount    11.354714
dtype: float32


array([11.354714, 11.354714, 11.354714, ..., 11.354714, 11.354714,
       11.354714], dtype=float32)

In [27]:
from sklearn.metrics import mean_squared_error

def rootMeanSquareError(targets, pred) :
    return mean_squared_error(targets, pred, squared= False)

In [28]:
train_rmse = rootMeanSquareError(train_target,train_preds)
train_rmse

#this basically tells that our prediction is +/- 9 around the target value ,  but this is way off ! refer to "some observations on the data" , the data should only
# be +/- 3 around the data , anything more that that , model should be tweaked

#Conclusion  : our lazy hard-coded model is OFF by $9.899 on a average, which is pretty bad considering the average fair is $11.35


#but this gives us a base line value , so any model we train should have an rmse lower that'9.789782' 

9.789782

# Train & Evaluate Baseline Model
### i'm going to use linear regression to express the baseline value , so can keep that as a comparison when we create rest of the model

In [29]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()


In [30]:
linear_model.fit(train_inputs,train_target)


LinearRegression()

In [31]:
train_preds_LM=linear_model.predict(train_inputs)
train_preds_LM

array([[11.54623682],
       [11.28460949],
       [11.28413941],
       ...,
       [11.45891673],
       [11.28428005],
       [11.28444733]])

In [32]:
#check rmse
train_rmse_LM = rootMeanSquareError(train_target,train_preds_LM)
train_rmse_LM


9.78863266989743

### the linear regression is off by $9.89, which isn't much better than simply predicting the average.
### this is mainly because the training data(geo coordinates) is not in a format that's useful for the model, and we're not using one of the most
### important columns : pickup date & time.
### however , now we have a baseline that our other models should ideally beat 

# FEATURE ENGINEERING 

create an iterative approach to feature engineering. Add some features , train a model , evaluate it, keep the features if they help ,else drop
them . REPEAT. 

- Extract parts of data 
- Remove outliers & invalid data
- Add distance between pickup and dropoff
- Add distance from landmarks

So im gonna apply all of them together , but i will take a note of the effects of adding each feature individually

## Extract Parts of Date 
- Year 
- Month
- Day 
- Weekday
- Hour

In [33]:
def add_dateparts(df , col) :
    df[col + '_year'] = df[col].dt.year
    df[col + '_month'] = df[col].dt.month
    df[col + '_day'] = df[col].dt.day
    df[col + '_weekday'] = df[col].dt.weekday
    df[col + '_hour'] = df[col].dt.hour



In [34]:
add_dateparts(train_df,'pickup_datetime')
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour'],
      dtype='object')

In [35]:
add_dateparts(val_df,'pickup_datetime')
val_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour'],
      dtype='object')

In [36]:
add_dateparts(test_df,'pickup_datetime')
test_df.columns

Index(['key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour'],
      dtype='object')

In [37]:
train_df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
353352,6.0,2015-04-12 03:40:38+00:00,-73.993652,40.741543,-73.977974,40.742352,4,2015,4,12,6,3
360070,3.7,2011-01-26 19:21:00+00:00,-73.993805,40.724579,-73.993805,40.724577,1,2011,1,26,2,19
372609,10.0,2012-10-03 10:40:17+00:00,-73.95916,40.78075,-73.969116,40.761231,1,2012,10,3,2,10
550895,8.9,2012-03-14 13:44:27+00:00,-73.952187,40.783951,-73.978645,40.772603,1,2012,3,14,2,13
444151,7.3,2012-02-05 15:33:00+00:00,-73.977112,40.746834,-73.991104,40.750403,2,2012,2,5,6,15


In [38]:
val_df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
15971,14.0,2015-05-19 09:27:24+00:00,-73.995834,40.75919,-73.973679,40.739086,1,2015,5,19,1,9
149839,6.5,2010-04-10 15:07:51+00:00,-73.977386,40.738335,-73.976143,40.751204,1,2010,4,10,5,15
515867,49.57,2009-07-25 14:11:00+00:00,-73.98391,40.74947,-73.78717,40.646643,1,2009,7,25,5,14
90307,49.700001,2011-11-11 19:09:21+00:00,-73.790794,40.643463,-73.972252,40.69018,1,2011,11,11,4,19
287032,8.5,2015-03-09 18:06:44+00:00,-73.976593,40.761944,-73.991463,40.750309,2,2015,3,9,0,18


In [39]:
test_df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1.0,2015,1,27,1,13
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21


# ADD DISTANCE BETWEEN PICKUP & DROP
### i'll be using haversine formula to calculate the distance between the 2 points

In [40]:
import numpy as np 
from math import radians, cos, sin, asin, sqrt
# creating a function to find distance using haversine formula 
# a generous soul had it posted in the holy grail( stackoverflow )
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [41]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [42]:
def add_trip_distance_col(df_dist) :

    df_dist['trip distance'] = haversine_np(df_dist['pickup_longitude'], df_dist['pickup_latitude'],df_dist['dropoff_longitude'], df_dist['dropoff_latitude'])

In [43]:
add_trip_distance_col(train_df)
add_trip_distance_col(test_df)
add_trip_distance_col(val_df)

In [44]:
train_df.head()

# checking if distance is added as a column

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip distance
353352,6.0,2015-04-12 03:40:38+00:00,-73.993652,40.741543,-73.977974,40.742352,4,2015,4,12,6,3,1.324241
360070,3.7,2011-01-26 19:21:00+00:00,-73.993805,40.724579,-73.993805,40.724577,1,2011,1,26,2,19,0.000273
372609,10.0,2012-10-03 10:40:17+00:00,-73.95916,40.78075,-73.969116,40.761231,1,2012,10,3,2,10,2.326887
550895,8.9,2012-03-14 13:44:27+00:00,-73.952187,40.783951,-73.978645,40.772603,1,2012,3,14,2,13,2.560493
444151,7.3,2012-02-05 15:33:00+00:00,-73.977112,40.746834,-73.991104,40.750403,2,2012,2,5,6,15,1.243998


In [45]:
test_df.head()


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1.0,2015,1,27,1,13,2.323178
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13,2.425412
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11,0.618397
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21,1.961142
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21,5.387314


In [46]:
val_df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip distance
15971,14.0,2015-05-19 09:27:24+00:00,-73.995834,40.75919,-73.973679,40.739086,1,2015,5,19,1,9,2.911674
149839,6.5,2010-04-10 15:07:51+00:00,-73.977386,40.738335,-73.976143,40.751204,1,2010,4,10,5,15,1.434744
515867,49.57,2009-07-25 14:11:00+00:00,-73.98391,40.74947,-73.78717,40.646643,1,2009,7,25,5,14,20.14517
90307,49.700001,2011-11-11 19:09:21+00:00,-73.790794,40.643463,-73.972252,40.69018,1,2011,11,11,4,19,16.162142
287032,8.5,2015-03-09 18:06:44+00:00,-73.976593,40.761944,-73.991463,40.750309,2,2015,3,9,0,18,1.800667


## ANOTHER FEATURE - ADD DISTANCE FROM POPULAR LANDMARKS

### This is where creative engineering comes into play i.e., involving human insight or external data - this indeed more
### efficient than hyperparameter tuning. this kind of features, 1 or 2 can drastically increase the model's performance.

- JFK AIRPORT
- LGA AIRPORT
- EWR AIRPORT
- TIMES SQUARE
- MET Museum
- WORLD TRADE CENTER

In [50]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126

# creating a column that has the distance from landmark to drop off location

def add_landmark_2_dropOff_distance(df,landmark_name,landmark_lonlat):

    lon,lat = landmark_lonlat

    df[landmark_name +'_drop_distance'] = haversine_np(lon,lat,df['dropoff_longitude'],df['dropoff_latitude'])
    



In [51]:
def add_landmarks(a_df) :
    landmarks = [('jfk',jfk_lonlat),('lga',lga_lonlat),('ewr',ewr_lonlat),('met',met_lonlat),('wtc',wtc_lonlat)]
    for name,lonlat in landmarks:
        add_landmark_2_dropOff_distance(a_df,name,lonlat)

In [52]:
add_landmarks(train_df)

In [56]:
add_landmarks(test_df)

In [57]:
add_landmarks(val_df)

# REMOVE OUTLIERS AND INVALID DATA

I'm gonna do a df.describe() on the test data , to see the value ranges, since that is the dataset which we're gonna use and make decisions/predictions

In [62]:
test_df.describe()
#look at the lat & long , they are along the right numbers
#passenger count - they look good as well : 1-6


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
count,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974716,40.751041,-73.973656,40.751743,1.671273,2011.815816,6.857979,16.19417,2.852834,13.46742,3.435374,20.929848,9.681287,18.558307,4.51581,6.041414
std,0.042774,0.033541,0.039072,0.035435,1.278747,1.803347,3.353272,8.838482,1.994451,6.868584,3.972377,3.306017,3.297727,4.038354,4.020963,4.255208
min,-74.25219,40.573143,-74.263245,40.568973,1.0,2009.0,1.0,1.0,0.0,0.0,9e-06,0.40231,0.285689,0.284885,0.085782,0.040412
25%,-73.9925,40.736125,-73.991249,40.735254,1.0,2010.0,4.0,9.0,1.0,8.0,1.298096,20.526253,8.316783,16.530847,2.127662,3.672219
50%,-73.982327,40.753052,-73.980015,40.754065,1.0,2012.0,7.0,16.0,3.0,15.0,2.217061,21.194701,9.483746,18.035688,3.700684,5.544899
75%,-73.968012,40.767113,-73.964062,40.768757,2.0,2014.0,10.0,25.0,5.0,19.0,4.045468,21.923545,10.97236,19.89301,5.926481,7.762282
max,-72.986534,41.709557,-72.990967,41.696683,6.0,2015.0,12.0,31.0,6.0,23.0,99.996141,134.582154,126.141683,149.494648,130.428937,138.70654


## I will be using the following ranges :
### - fare_amount: $1 to $500
### -longitudes: -75 to -72
### -latitudes: 40 to 42
### -passenger_count: 1 to 6

In [63]:
def remove_outliers(df):
    return df[(df['fare_amount'] >= 1.) & 
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) & 
              (df['pickup_longitude'] <= -72) & 
              (df['dropoff_longitude'] >= -75) & 
              (df['dropoff_longitude'] <= -72) & 
              (df['pickup_latitude'] >= 40) & 
              (df['pickup_latitude'] <= 42) & 
              (df['dropoff_latitude'] >=40) & 
              (df['dropoff_latitude'] <= 42) & 
              (df['passenger_count'] >= 1) & 
              (df['passenger_count'] <= 6)]

In [83]:
train_df = remove_outliers(train_df)

In [85]:
val_df = remove_outliers(val_df)

# TRAIN AND EVALUATE DIFFERENT MODELS
- RIDGE Regression
- Random Forest 
- Gradient Boosting 

In [86]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip distance',
       'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
       'met_drop_distance', 'wtc_drop_distance'],
      dtype='object')

In [87]:
input_columns = ['pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'pickup_datetime_year', 'pickup_datetime_month', 'pickup_datetime_day',
       'pickup_datetime_weekday', 'pickup_datetime_hour', 'trip distance',
       'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
       'met_drop_distance', 'wtc_drop_distance']

# I have removed the fare amount and pickup date time , but i have kept the long and latitude ,
# cause decision trees might still be able to make a use of it

target_columns = ['fare_amount']



In [88]:

train_inputs = train_df[input_columns]
train_target = train_df[target_columns]

In [89]:
val_inputs =val_df[input_columns]
val_target = val_df[target_columns]

In [90]:
test_inputs =test_df[input_columns]


In [91]:
def evaluate(model):
    train_preds = model.predict(train_inputs)
    train_rmse = mean_squared_error(train_target, train_preds, squared=False)
    val_preds = model.predict(val_inputs)
    val_rmse = mean_squared_error(val_target, val_preds, squared=False)
    return train_rmse, val_rmse, train_preds, val_preds

## Ridge Regression 

In [92]:
from sklearn.linear_model import Ridge

In [93]:
model1 = Ridge(random_state=42,alpha = 0.9)
model1.fit(train_inputs,train_target)


Ridge(alpha=0.9, random_state=42)

In [96]:
evaluate(model1)

# we can see the rmse value has gone down to 5 ! (if we hadn't done the 'remove outliers' it would've been around 8. something)

(5.0493117450649665,
 5.217843123036618,
 array([[ 8.12924627],
        [ 4.1166401 ],
        [ 8.75005422],
        ...,
        [10.47163755],
        [ 8.23065526],
        [10.58695341]]),
 array([[10.91928971],
        [ 6.20511414],
        [46.21894107],
        ...,
        [ 8.04627004],
        [25.56764753],
        [ 8.45250096]]))

# RANDOM FOREST

In [97]:
from sklearn.ensemble import RandomForestRegressor

In [98]:
model2 = RandomForestRegressor(random_state=42,n_jobs=-1,max_depth=10,n_estimators=100)

In [99]:
%%time 
model2.fit(train_inputs,train_target)



Wall time: 59.1 s


RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=42)

In [100]:
evaluate(model2)

(3.590397581435513,
 4.160639011564272,
 array([ 6.99165169,  9.20492545,  9.08617526, ..., 10.42907254,
         7.78595196, 10.4215362 ]),
 array([12.61176441,  6.14724477, 47.31186835, ...,  8.37078062,
        29.24303523,  8.22677944]))

# GRADIENT BOOSTING 

In [106]:
from xgboost import XGBRegressor

In [107]:
model3 = XGBRegressor(max_depth = 5, objective='reg:squarederror',n_estimators = 300, random_state = 42,
n_jobs = -1)

In [108]:
model3.fit(train_inputs,train_target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=300, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [109]:
evaluate(model3)

(2.922757,
 3.9585629,
 array([ 6.442752,  7.21128 , 10.002452, ..., 12.044368,  9.187529,
        10.274479], dtype=float32),
 array([14.856599 ,  5.8981895, 48.475555 , ...,  7.9378333, 30.402485 ,
         8.751183 ], dtype=float32))

# Tune Hyperparameters

In [115]:
xgb_model_tuned = XGBRegressor(max_depth = 8, objective='reg:squarederror',n_estimators = 700, random_state = 42,
n_jobs = -1,learning_rate=0.1,subsample=0.8, colsample_bytree=0.8)
xgb_model_tuned.fit(train_inputs,train_target)
evaluate(xgb_model_tuned)

(2.2944312,
 3.8969011,
 array([ 6.647538,  6.914425,  9.921077, ..., 11.405403,  9.519929,
         9.965168], dtype=float32),
 array([14.3337755,  5.7827635, 48.45697  , ...,  7.8154483, 30.354889 ,
         8.586214 ], dtype=float32))

In [117]:
test_preds = xgb_model_tuned.predict(test_inputs)
sub_df = pd.read_csv('sample_submission.csv')
sub_df['fare_amount'] = test_preds
sub_df.to_csv('xgb_model_tuned_submission_final.csv', index=None)


In [118]:
# kaggle leaderboard - top 28%