In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib as matplotlib
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [2]:
train=pd.read_csv("train.csv",nrows=1000000)
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
test=pd.read_csv("test.csv")
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [4]:
train.shape,test.shape

((1000000, 8), (9914, 7))

In [5]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,1000000.0,1000000.0,1000000.0,999990.0,999990.0,1000000.0
mean,11.348079,-72.52664,39.929008,-72.52786,39.919954,1.684924
std,9.82209,12.057937,7.626154,11.324494,8.201418,1.323911
min,-44.9,-3377.680935,-3116.285383,-3383.296608,-3114.338567,0.0
25%,6.0,-73.99206,40.734965,-73.991385,40.734046,1.0
50%,8.5,-73.981792,40.752695,-73.980135,40.753166,1.0
75%,12.5,-73.967094,40.767154,-73.963654,40.768129,2.0
max,500.0,2522.271325,2621.62843,45.581619,1651.553433,208.0


In [6]:
train.isnull().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    10
dropoff_latitude     10
passenger_count       0
dtype: int64

In [7]:
test.isnull().sum()

key                  0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [8]:
train = train.drop(train[train.isnull().any(1)].index, axis = 0)

In [9]:
train.shape

(999990, 8)

In [10]:
train['fare_amount'].describe()

count    999990.000000
mean         11.347953
std           9.821790
min         -44.900000
25%           6.000000
50%           8.500000
75%          12.500000
max         500.000000
Name: fare_amount, dtype: float64

In [11]:
#train[train['fare_amount']<0].count()
from collections import Counter
Counter(train['fare_amount']<0)

Counter({False: 999952, True: 38})

In [12]:
train = train.drop(train[train['fare_amount']<0].index, axis=0)
train.shape

(999952, 8)

In [13]:
train['fare_amount'].describe()

count    999952.000000
mean         11.348616
std           9.821249
min           0.000000
25%           6.000000
50%           8.500000
75%          12.500000
max         500.000000
Name: fare_amount, dtype: float64

Next check the passenger_count variable

In [14]:
train['passenger_count'].describe()

count    999952.000000
mean          1.684942
std           1.323908
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         208.000000
Name: passenger_count, dtype: float64

In [15]:
train[train['passenger_count']>6]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
929022,2009-07-30 11:54:00.000000193,3.3,2009-07-30 11:54:00 UTC,0.0,0.0,0.0,0.0,208


In [16]:
train = train.drop(train[train['passenger_count']==208].index, axis = 0)

In [17]:
train['passenger_count'].describe()

count    999951.000000
mean          1.684736
std           1.307733
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: passenger_count, dtype: float64

### Next, let us explore the pickup latitude and longitudes

In [18]:
train['pickup_latitude'].describe()

count    999951.000000
mean         39.929090
std           7.626025
min       -3116.285383
25%          40.734965
50%          40.752695
75%          40.767154
max        2621.628430
Name: pickup_latitude, dtype: float64

Quick Googling gave me this info
* Latitudes range from -90 to 90.
* Longitudes range from -180 to 180.

The above describe clearly shows some outliers. Let's filter them

In [19]:
train[(train['pickup_latitude']>90) | (train['pickup_latitude']<-90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
5686,2011-07-30 11:15:00.00000082,3.3,2011-07-30 11:15:00 UTC,-73.947235,401.083332,-73.951392,40.778927,1
150559,2012-08-03 07:43:00.000000176,25.3,2012-08-03 07:43:00 UTC,0.0,-3116.285383,-73.9536,40.787998,1
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.60116,1703.092772,-1251.19589,-1189.61544,1
272439,2011-04-23 02:55:00.00000012,9.3,2011-04-23 02:55:00 UTC,-74.002497,405.35,-73.9786,40.739962,1
436233,2012-03-11 01:56:00.000000100,4.1,2012-03-11 01:56:00 UTC,-2986.242495,-880.627428,-3383.296608,-2559.748913,1
464025,2012-03-05 20:58:00.0000009,6.9,2012-03-05 20:58:00 UTC,-73.994268,404.966667,-73.973857,40.755457,1
505229,2012-03-05 21:08:00.0000006,9.3,2012-03-05 21:08:00 UTC,-73.994268,404.966667,-73.986387,40.776895,1
505583,2012-03-16 08:13:00.000000272,8.5,2012-03-16 08:13:00 UTC,-73.994277,405.133332,-73.956763,40.783737,1
543001,2011-10-28 13:53:00.000000178,5.3,2011-10-28 13:53:00 UTC,-3377.680935,-113.019497,-554.918693,314.79418,1
688685,2012-03-22 21:40:00.000000188,17.3,2012-03-22 21:40:00 UTC,2522.271325,2621.62843,-1718.117653,-2864.471003,1


In [20]:
#We need to drop these outliers
train = train.drop(((train[train['pickup_latitude']<-90])|(train[train['pickup_latitude']>90])).index, axis=0)

In [21]:
train.shape

(999939, 8)

In [22]:
#similar operation for pickup longitude
train['pickup_longitude'].describe()

count    999939.000000
mean        -72.525443
std          10.692752
min       -1452.988333
25%         -73.992060
50%         -73.981792
75%         -73.967095
max          40.850357
Name: pickup_longitude, dtype: float64

In [23]:
train[(train['pickup_longitude']<-180) | (train['pickup_longitude']<-180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
60442,2012-01-12 13:36:00.000000186,4.9,2012-01-12 13:36:00 UTC,-736.55,40.73823,-73.988742,40.748847,1
217355,2012-06-03 23:21:00.00000077,6.1,2012-06-03 23:21:00 UTC,-740.0,40.74762,0.0,0.0,6
243342,2012-08-02 10:38:00.000000111,7.3,2012-08-02 10:38:00 UTC,-736.333333,40.76648,-73.987928,40.751742,3
351119,2012-02-03 07:53:00.000000135,4.1,2012-02-03 07:53:00 UTC,-736.483332,40.766512,-73.981992,40.771672,1
370663,2012-05-04 23:19:00.000000261,6.1,2012-05-04 23:19:00 UTC,-736.516667,40.719095,-74.003952,40.72986,1
416859,2012-07-21 15:23:00.00000093,6.5,2012-07-21 15:23:00 UTC,-736.416665,40.752285,-73.992565,40.742687,1
568830,2013-05-23 15:23:00.0000007,11.5,2013-05-23 15:23:00 UTC,-735.888333,40.760863,-73.989835,40.738443,5
675500,2011-10-28 10:53:00.00000080,16.9,2011-10-28 10:53:00 UTC,-1452.988333,40.776,-73.989487,40.740667,2
837619,2012-07-17 08:38:00.000000116,39.7,2012-07-17 08:38:00 UTC,-736.0,40.758507,0.0,0.0,5
914435,2012-03-04 00:10:00.000000131,11.7,2012-03-04 00:10:00 UTC,-736.25,40.788027,-73.990812,40.750942,1


In [24]:
train = train.drop(((train[train['pickup_longitude']<-180])|(train[train['pickup_longitude']>180])).index, axis=0)

In [25]:
train.shape

(999928, 8)

In [26]:
#similar operation for dropoff latitude and longitude
train[(train['dropoff_latitude']<-90) | (train['dropoff_latitude']>90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
92310,2011-09-27 11:54:00.000000127,28.9,2011-09-27 11:54:00 UTC,-74.014595,40.68188,-73.97331,404.616667,1
181973,2012-01-03 09:04:00.000000130,6.5,2012-01-03 09:04:00 UTC,-74.008918,40.717827,-74.000855,404.133332,1
335675,2012-02-26 00:25:00.00000085,19.3,2012-02-26 00:25:00 UTC,-74.006457,40.743865,-73.981283,404.133332,1
561910,2010-08-14 03:42:59.0000001,4.5,2010-08-14 03:42:59 UTC,-73.977635,40.754687,-73.870432,405.65,1
579901,2013-08-14 20:23:00.000000245,18.0,2013-08-14 20:23:00 UTC,-73.99919,40.7202,-0.36,-3114.338567,1
582110,2012-05-02 14:20:00.000000131,30.5,2012-05-02 14:20:00 UTC,-73.870875,40.773792,-73.97779,404.716667,1
748464,2013-01-02 11:33:00.00000020,15.0,2013-01-02 11:33:00 UTC,-74.0152,40.709665,-40.719295,1651.553433,5
889704,2012-05-21 12:15:00.00000098,15.3,2012-05-21 12:15:00 UTC,-74.001292,40.72756,-73.953047,404.466667,1


In [27]:
train = train.drop(((train[train['dropoff_latitude']<-90])|(train[train['dropoff_latitude']>90])).index, axis=0)

In [28]:
train.shape

(999920, 8)

In [29]:
train[train['dropoff_latitude']<-180]|train[train['dropoff_latitude']>180]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


In [None]:
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [None]:
train['key'] = pd.to_datetime(train['key'])
train['pickup_datetime']  = pd.to_datetime(train['pickup_datetime'])
test['key'] = pd.to_datetime(test['key'])
test['pickup_datetime']  = pd.to_datetime(test['pickup_datetime'])

In [None]:
train.dtypes

## EDA

Now, for EDA. The following are my considerations -

* Does the number of passengers affect the fare?
* Does the date and time of pickup affect the fare?
* Does the day of the week affect the fare?
* Does the distance travelled affect the fare?

First, let's split the datetime field 'pickup_datetime' to the following -

* year
* month
* date
* hour
* day of week

Using these we shall calculate the day of the week and come to our conclusions about how pickup_location affects the fare.
###### create a new field 'distance' to fetch the distance between the pickup and the drop.

Finding distances based on Latitude and Longitude.

The haversine formula determines the great-circle distance between two points on a sphere given their longitudes and latitudes

Eventually, the formual boils down to the following where φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km) to include latitude and longitude coordinates (A and B in this case).

a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)

c = 2 * atan2( √a, √(1−a) )

d = R ⋅ c

d = Haversine distance

##### Formula

dlon = lon2 - lon1

dlat = lat2 - lat1

a = (sin(dlat/2))^2 + cos(lat1) cos(lat2) (sin(dlon/2))^2

c = 2 * atan2( sqrt(a), sqrt(1-a) )

d = R * c (where R is the radius of the Earth)

In [None]:
def haversine_distance(pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude):
    data = [train, test]
    for i in data:
        pickup_lat  = np.radians(i["pickup_latitude"])
        pickup_lon  = np.radians(i["pickup_longitude"])
        dropoff_lat = np.radians(i["dropoff_latitude"])
        dropoff_lon = np.radians(i["dropoff_longitude"])

        dist_lon = dropoff_lon - pickup_lon
        dist_lat = dropoff_lat - pickup_lat

#Formula
        R = 6371  #radius of earth in kilometers
        a = (np.sin(dist_lat/2))**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * (np.sin(dist_lon/2))**2 
        c = 2 * np.arctan2( np.sqrt(a), np.sqrt(1-a) ) 
        d = R * c #(where R is the radius of the Earth)
        i['H_Distance'] = d
    return d
    

In [None]:
haversine_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [None]:
train.H_Distance.describe()

In [None]:
Counter(train['H_Distance']>3)

In [None]:
train.H_Distance.head()

In [None]:
test.H_Distance.head()

In [None]:
train.head()

In [None]:
test.head()

In [None]:
data = [train,test]
for i in data:
    i['Year'] = i['pickup_datetime'].dt.year
    i['Month'] = i['pickup_datetime'].dt.month
    i['Date'] = i['pickup_datetime'].dt.day
    i['Day of Week'] = i['pickup_datetime'].dt.dayofweek
    i['Hour'] = i['pickup_datetime'].dt.hour

In [None]:
train.head()

In [None]:
test.head()

### 1. Does the number of passengers affect the fare?

In [None]:
plt.figure(figsize=(15,7))
plt.hist(train['passenger_count'], bins=15)
plt.xlabel('No. of Passengers')
plt.ylabel('Frequency')

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train['passenger_count'], y=train['fare_amount'], s=1.5)
plt.xlabel('No. of Passengers')
plt.ylabel('Fare')

From the above 2 graphs we can see that single passengers are the most frequent travellers, and the highest fare also seems to come from cabs which carry just 1 passenger.

### 2. Does the date and time of pickup affect the fare?

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train['Date'], y=train['fare_amount'], s=1.5)
plt.xlabel('Date')
plt.ylabel('Fare')

In [None]:
plt.figure(figsize=(15,7))
plt.hist(train['Hour'], bins=100)
plt.xlabel('Hour')
plt.ylabel('Frequency')

The time of day definitely plays an important role. The frequency of cab rides seem to be the lowest at 5AM and the highest at 7PM.

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train['Hour'], y=train['fare_amount'])
plt.xlabel('Hour')
plt.ylabel('Fare')

The fares, however, seem to be high betweeb 5AM and 10AM, and 1PM to 4PM. Maybe people who live far away prefer to leave earlier to avoid rush hour traffic

### 3. Does the day of the week affect the fare

In [None]:
plt.figure(figsize=(15,7))
plt.hist(train['Day of Week'], bins=100)
plt.xlabel('Day of Week')
plt.ylabel('Frequency')

Day of the week doesn't seem to have that much of an influence on the number of cab rides

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train['Day of Week'], y=train['fare_amount'])
plt.xlabel('Day of Week')
plt.ylabel('Fare')

The highest fares seem to be on a Sunday and Monday, and the lowest on Wednesday and Friday. Maybe people travel far distances on Sunday and Monday (visiting family and returning back home), and hence, the high fares. And guess people just want to stay at home on a Friday

In [None]:
bins_0 = train.loc[(train['H_Distance'] == 0), ['H_Distance']]
bins_1 = train.loc[(train['H_Distance'] > 0) & (train['H_Distance'] <= 10),['H_Distance']]
bins_2 = train.loc[(train['H_Distance'] > 10) & (train['H_Distance'] <= 50),['H_Distance']]
bins_3 = train.loc[(train['H_Distance'] > 50) & (train['H_Distance'] <= 100),['H_Distance']]
bins_4 = train.loc[(train['H_Distance'] > 100) & (train['H_Distance'] <= 200),['H_Distance']]
bins_5 = train.loc[(train['H_Distance'] > 200) & (train['H_Distance'] <= 300),['H_Distance']]
bins_6 = train.loc[(train['H_Distance'] > 300),['H_Distance']]

In [None]:
bins_0['bins'] = '0'
bins_1['bins'] = '0-10'
bins_2['bins'] = '11-50'
bins_3['bins'] = '51-100'
bins_4['bins'] = '100-200'
bins_5['bins'] = '201-300'
bins_6['bins'] = '>300'

In [None]:
dist_bins =pd.concat([bins_0,bins_1,bins_2,bins_3,bins_4,bins_5,bins_6])

In [None]:
dist_bins.columns

In [None]:
dist_bins.head()

In [None]:
Counter(dist_bins['bins'])

There are values which are greater than 100 kms! In NYC I am not sure why people would take cabs to travel more than a 100 kms. Since the number of bins for 100-200 kms is quite high, I will keep these. 

These outliers could be because of typos or missing values in the latitude or longitude.

Remove fields of the following

* Pickup latitude and pickup longitude are 0 but dropoff latitude and longitude are not 0, but the fare is 0
* vice versa of point 1.
* Pickup latitude and pickup longitude are 0 but dropoff latitude and longitude are not 0, but the fare is NOT 0. Here I will have to impute the distance values in both the train and test data.

In [None]:
#pickup latitude and longitude = 0
train.loc[((train['pickup_latitude']==0) & (train['pickup_longitude']==0))&((train['dropoff_latitude']!=0) & (train['dropoff_longitude']!=0)) & (train['fare_amount']==0)]

In [None]:
train = train.drop(train.loc[((train['pickup_latitude']==0) & (train['pickup_longitude']==0))&((train['dropoff_latitude']!=0) & (train['dropoff_longitude']!=0)) & (train['fare_amount']==0)].index, axis=0)

In [None]:
train.shape

In [None]:
#Check in test data
test.loc[((test['pickup_latitude']==0) & (test['pickup_longitude']==0))&((test['dropoff_latitude']!=0) & (test['dropoff_longitude']!=0))]

In [None]:
#dropoff latitude and longitude = 0
train.loc[((train['pickup_latitude']!=0) & (train['pickup_longitude']!=0))&((train['dropoff_latitude']==0) & (train['dropoff_longitude']==0)) & (train['fare_amount']==0)]

In [None]:
train = train.drop(train.loc[((train['pickup_latitude']!=0) & (train['pickup_longitude']!=0))&((train['dropoff_latitude']==0) & (train['dropoff_longitude']==0)) & (train['fare_amount']==0)].index, axis=0)

In [None]:
train.shape

In [None]:
#Checking test data

test.loc[((test['pickup_latitude']!=0) & (test['pickup_longitude']!=0))&((test['dropoff_latitude']==0) & (test['dropoff_longitude']==0))]

Check the H_Distance fields which are greater than 200 kms cause there is no way that people would travel more than 200 kms at the most in NYC in a CAB!

In [None]:
high_distance = train.loc[(train['H_Distance']>200)&(train['fare_amount']!=0)]

In [None]:
high_distance

In [None]:
high_distance.shape

However, since all these values have fares, I do not wish to drop them as they contain crucial data. Instead, I will replace the initial distance values with distance values calculated using the fare using the following formula

distance = (fare_amount - 2.5)/1.56

In [None]:
high_distance['H_Distance'] = high_distance.apply(lambda row: (row['fare_amount'] - 2.50)/1.56,axis=1)

In [None]:
#The distance values have been replaced by the newly calculated ones according to the fare
high_distance

In [None]:
#sync the train data with the newly computed distance values from high_distance dataframe
train.update(high_distance)

In [None]:
train.shape

Now we shall check for rows where the distance values are 0

In [None]:
train[train['H_Distance']==0]

We can see a few rows with distance =0. This could be due to 2 reasons

1. The cab waited the whole time and the passenger eventually cancelled. That's why the pickup and drop co-ordinates are the same and maybe, the passenger was charged for the waiting time.
2. The pickup and drop co-ordinates were not entered. In other words, these are missing values!

28667 rows are too many rows to be deleted. We need to impute these missing values. I have a plan. I intend to impute the missing distance values with the fare and average price per kilometer of NYC cabs.

A quick Google search gave me the following prices -

* $$2.5 base-price + $1.56/km --> 6AM to 8PM Mon-Fri

* $$3.0 base-price + $1.56/km --> 8PM to 6AM Mon-Fri and Sat&Sun

However, before we proceed with the above steps, lets check for the following scenarios to impute the missing fare amount and the H_Distance in train data.

#### SCENARIO 1

Fare and Distance are both 0. According to the table above, we shall delete them as they do not provide us any info with regards to the data.

In [None]:
train[(train['H_Distance']==0)&(train['fare_amount']==0)]

In [None]:
train = train.drop(train[(train['H_Distance']==0)&(train['fare_amount']==0)].index, axis = 0)

In [None]:
train[(train['H_Distance']==0)].shape

#### SCENARIO 2

Fare is not 0 and is less than the base amount, but Distance is 0.

Delete these rows as the minimum is $2.50, and these fares are incorrect values.

In [None]:
#Between 6AM and 8PM on Mon-Fri
rush_hour = train.loc[(((train['Hour']>=6)&(train['Hour']<=20)) & ((train['Day of Week']>=1) & (train['Day of Week']<=5)) & (train['H_Distance']==0) & (train['fare_amount'] < 2.5))]
rush_hour

In [None]:
train=train.drop(rush_hour.index, axis=0)

In [None]:
#Between 8PM and 6AM on Mon-Fri
non_rush_hour = train.loc[(((train['Hour']<6)|(train['Hour']>20)) & ((train['Day of Week']>=1)&(train['Day of Week']<=5)) & (train['H_Distance']==0) & (train['fare_amount'] < 3.0))]
#print(Counter(non_work_hours['Hour']))
#print(Counter(non_work_hours['Day of Week']))
non_rush_hour

In [None]:
Counter(non_rush_hour['fare_amount']<2.5)

Since the fare_amount is not <2.5 (which is the base fare), these values seem legit to me.

In [None]:
#Saturday and Sunday all hours
weekends = train.loc[((train['Day of Week']==0) | (train['Day of Week']==6)) & (train['H_Distance']==0) & (train['fare_amount'] < 3.0)]
weekends

In [None]:
#Counter(weekends['fa']<2.5)
Counter(weekends['fare_amount']<2.5)

#### SCENARIO 3

Fare is 0, but Distance is not 0. These values need to be imputed.

we can calculate the fare as I have the distance. I shall use the following formula

fare = 2.5 + 1.56(H_Distance)

In [None]:
scenario_3 =train.loc[(train['H_Distance']!=0) & (train['fare_amount']==0)]

In [None]:
len(scenario_3)

In [None]:
scenario_3['fare_amount'] = scenario_3.apply(lambda row: ((row['H_Distance'] * 1.56) + 2.50), axis=1)

In [None]:
scenario_3['fare_amount']

In [None]:
train.update(scenario_3)

In [None]:
train.shape

#### SCENARIO 4

Fare is not 0, but Distance is 0. These values need to be imputed.

In [None]:
scenario_4=train.loc[(train['H_Distance']==0) & (train['fare_amount']!=0)]
scenario_4

In [None]:
len(scenario_4)

In [None]:
#Using our prior knowledge about the base price during weekdays and weekends for the cabs.
#I do not want to impute these 1502 values as they are legible ones.
scenario_4.loc[(scenario_4['fare_amount']<2.5)&(scenario_4['H_Distance']==0)]

In [None]:
scenario_4.loc[(scenario_4['fare_amount']>3.0)&(scenario_4['H_Distance']==0)]

In [None]:
len(scenario_4.loc[(scenario_4['fare_amount']>3.0)&(scenario_4['H_Distance']==0)])

27159 rows need to be imputed using the following formula -

distance = (fare_amount - 2.5)/1.56

In [None]:
scenario_4_sub = scenario_4.loc[(scenario_4['fare_amount']>3.0)&(scenario_4['H_Distance']==0)]

In [None]:
len(scenario_4_sub)

In [None]:
scenario_4_sub['H_Distance'] = scenario_4_sub.apply(lambda row: ((row['fare_amount']-2.50)/1.56), axis=1)


In [None]:
train.update(scenario_4_sub)

In [None]:
train.shape

In [None]:
#not including the pickup_datetime columns as datetime columns cannot be directly used while modelling. Features need to extracted from the 
#timestamp fields which will later be used as features for modelling.
train = train.drop(['key','pickup_datetime'], axis = 1)
test = test.drop(['key','pickup_datetime'], axis = 1)

### MODELLING AND PREDICTION

In [None]:
train.columns

In [None]:
test.columns

In [None]:
train_df, test_df = train_test_split(train, test_size = 0.2)

In [None]:
x_train = train_df.iloc[:,train.columns!='fare_amount']
y_train = train_df['fare_amount'].values

In [None]:
X_test  = test_df.drop("fare_amount", axis=1)
Y_test = test_df["fare_amount"]

In [None]:
 x_train.shape,y_train.shape,X_test.shape,Y_test.shape

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

predict tes data

In [None]:
y_randomf=rf.predict(X_test)

In [None]:
print(np.sqrt(mean_squared_error(Y_test,y_randomf)))

In [None]:
..

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=rf, X=x_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()


In [None]:
plt.figure(figsize=(15,7))
plt.scatter(Y_test,y_randomf)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=rf, X=x_train, y=y_train, cv=50)
accuracies

In [None]:
corr_df=x_train.corr(method='pearson')
corr_df

In [None]:
import seaborn
seaborn.heatmap(corr_df,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,linewidth=2.5)
plt.show

In [None]:
corr=train.corr(method="pearson")
seaborn.heatmap(corr,vmax=1.0,vmin=-1.0,linewidth=3)
plt.show()