## Hackathon Hiring Challenge – Innomatics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# load the Uber rides sample into a DataFrame
rides_csv = 'uber_rides_data.xlsx - sample_train.csv'
df = pd.read_csv(rides_csv)

In [3]:
# preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
# dimensions of the loaded data as (rows, columns)
df.shape

(200000, 8)

In [7]:
# column dtypes; pickup_datetime is still an object (string) column at this point
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [8]:
# number of null values in each column
# (isnull flags every missing cell, sum adds the flags up per column)
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [74]:
# remove every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [10]:
# re-check the per-column null counts after the dropna step
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [12]:
# summary statistics for every numeric column
# NOTE(review): the output shows implausible values (negative fares, coordinates far
# outside the valid latitude/longitude range, passenger_count up to 208); none of the
# cells shown here filter them out before modelling
df.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967153,40.767158,-73.963659,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [13]:
# convert pickup_datetime from string to datetime so the .dt accessors
# (year, month, day name) can be used on it later;
# the raw strings carry a "UTC" suffix and the parsed column is timezone-aware (+00:00)
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [14]:
# calculating the haversine distance
def haversine_array(lat1, lng1, lat2, lng2):
    """Return the great-circle (haversine) distance in kilometres.

    Parameters
    ----------
    lat1, lng1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lng2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance between each pair of points in km, computed element-wise
        on a sphere of radius 6371 km.

    Notes
    -----
    Inputs are not validated: coordinates outside the valid
    latitude/longitude range are used as given.
    """
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    # Floating-point rounding can push d marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; clamp the upper bound only so
    # that genuinely invalid (negative) values still surface as NaN.
    d = np.minimum(d, 1.0)
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

In [15]:
# distance between pickup and dropoff for every ride, stored as a new column
pickup_lat = df['pickup_latitude'].values
pickup_lng = df['pickup_longitude'].values
dropoff_lat = df['dropoff_latitude'].values
dropoff_lng = df['dropoff_longitude'].values
df['haversine_distance'] = haversine_array(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)

In [16]:
# distribution of the computed distances
# NOTE(review): the maximum (~16,409) and the large standard deviation in the output
# suggest some rides have bad coordinates -- confirm before trusting this feature
df['haversine_distance'].describe()

count    199999.000000
mean         20.855350
std         382.964642
min           0.000000
25%           1.215222
50%           2.120992
75%           3.875169
max       16409.239135
Name: haversine_distance, dtype: float64

In [17]:
# frequency of each distinct distance value; in the output the most common value is
# exactly 0, i.e. rides whose pickup and dropoff coordinates coincide
df['haversine_distance'].value_counts()

0.000000    5632
0.000424       4
0.000778       2
0.000424       2
0.000556       2
            ... 
5.128535       1
1.537437       1
3.642422       1
0.711464       1
5.417783       1
Name: haversine_distance, Length: 194361, dtype: int64

In [19]:
# average fare over the rides whose computed distance is exactly 0
zero_distance = df['haversine_distance'] == 0
df.loc[zero_distance, 'fare_amount'].mean()

11.585317826704546

In [21]:
# haversine distance of the ride(s) charged the maximum fare amount, i.e. 499
is_max_fare = df['fare_amount'] == 499
df.loc[is_max_fare, 'haversine_distance']

170081    0.00079
Name: haversine_distance, dtype: float64

In [27]:
# extract the calendar year of each pickup into a new `year` column
df['year'] = df['pickup_datetime'].dt.year

In [28]:
# number of rides in year 2014: non-null count per column of the 2014 subset
rides_2014 = df.loc[df['year'] == 2014]
rides_2014.count()

ride_id               29968
fare_amount           29968
pickup_datetime       29968
pickup_longitude      29968
pickup_latitude       29968
dropoff_longitude     29968
dropoff_latitude      29968
passenger_count       29968
haversine_distance    29968
year                  29968
dtype: int64

In [35]:
# creating a month column from pickup datetime (numeric month, 1 = January)
df['month'] = df['pickup_datetime'].dt.month

In [39]:
# rides from the first quarter (months 1-3) of 2014
in_2014 = df['year'] == 2014
in_first_quarter = df['month'] <= 3
first_quarter_rides = df[in_2014 & in_first_quarter]

In [40]:
# (rows, columns) of the first-quarter 2014 subset; the row count is the number of rides
first_quarter_rides.shape

(7687, 11)

In [55]:
# day name from pickup datetime column (full names such as 'Thursday')
df['week_day'] = df['pickup_datetime'].dt.day_name()

In [60]:
# rides picked up in September of 2010
in_2010 = df['year'] == 2010
in_september = df['month'] == 9
september_rides = df[in_2010 & in_september]

In [61]:
# number of September 2010 rides per weekday, ordered from most to least frequent
rides_by_day = september_rides['week_day'].value_counts()

In [62]:
# display the per-weekday ride counts
rides_by_day

Thursday     457
Wednesday    391
Saturday     362
Friday       354
Sunday       331
Tuesday      322
Monday       265
Name: week_day, dtype: int64

In [63]:
# preview the frame including the engineered columns
# (haversine_distance, year, month, week_day)
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance,year,month,week_day
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,2015,5,Thursday
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,2009,7,Friday
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,2009,8,Monday
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,2009,6,Friday
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,2014,8,Thursday


In [65]:
# importing libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [76]:
# features (independent variables): passenger count, distance and week day
# target (dependent variable): fare amount
feature_columns = ['passenger_count', 'haversine_distance', 'week_day']
x = df[feature_columns]
y = df['fare_amount']

In [77]:
# one-hot encode the week_day column into day_<Name> indicator columns
# dtype=int keeps the indicators as 0/1 integers regardless of the installed pandas
# version (from pandas 2.0 the default indicator dtype is bool)
x = pd.get_dummies(x, columns=['week_day'], prefix=['day'], dtype=int)

In [78]:
# check the encoded feature matrix: the two numeric features plus one indicator column per weekday
x.head()

Unnamed: 0,passenger_count,haversine_distance,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,1,1.683323,0,0,0,0,1,0,0
1,1,2.45759,1,0,0,0,0,0,0
2,1,5.036377,0,1,0,0,0,0,0
3,3,1.661683,1,0,0,0,0,0,0
4,5,4.47545,0,0,0,0,1,0,0


In [79]:
# hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=101,
)

In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
# standardise the features with StandardScaler: the scaling statistics are learned
# from the training split only and then applied unchanged to the test split
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [82]:
# considering linear regression, randomforest, decision tree and KNN for modelling
# random_state is fixed on the two stochastic estimators (forest, tree) so that
# re-running the notebook reproduces the same scores
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest Regression', RandomForestRegressor(random_state=101)),
    ('Decision Tree Regression', DecisionTreeRegressor(random_state=101)),
    ('KNN Regressor', KNeighborsRegressor()),
]

In [89]:
# Fit every candidate on the scaled training data and score it on the held-out set with
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# where n is the number of test rows and p the number of feature columns.
n = len(X_test_scaled)
p = len(X_train_scaled[0])
correction = (n - 1) / (n - p - 1)  # identical for every model, so computed once

results = []
for model_name, model in models:
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1 - r2) * correction
    results.append((model_name, adjusted_r2))

In [90]:
# (model name, adjusted R^2) pairs in the order the models were evaluated
results

[('Linear Regression', 0.0008512886841021139),
 ('Random Forest Regression', 0.6652190686613619),
 ('Decision Tree Regression', 0.5297601869462285),
 ('KNN Regressor', 0.6565313564252997)]

* **Linear Regression has the lowest adjusted R² score (≈ 0.0009), while Random Forest has the highest (≈ 0.665).**