In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Load the raw trip records from the CSV next to the notebook and preview the first rows.
uber = pd.read_csv('./uber.csv')
uber.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


# Checking for and dropping null values

In [3]:
# Count missing values per column.
uber.isnull().sum(axis=0)

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

### Dropping the `Unnamed: 0` (row ID) and `key` columns, since they are not useful for prediction, and removing rows with missing values.

In [4]:
# Remove the identifier columns, then discard every row that still has a missing value.
uber2 = uber.drop(columns=['Unnamed: 0', 'key']).dropna(axis=0)

In [5]:
# Preview the cleaned frame (first five rows).
uber2.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


##### Calculating the distance between two locations with the haversine formula

In [6]:
def haversine(la1, lo1, la2, lo2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    la1, lo1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    la2, lo2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, default 6371
        Radius of the sphere; the result is expressed in the same unit
        (kilometres for the default value).

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, computed element-wise when
        array-like inputs (NumPy arrays, pandas Series) are given.
    """
    la1, lo1, la2, lo2 = map(np.radians, [la1, lo1, la2, lo2])

    difflon = lo2 - lo1
    difflat = la2 - la1

    # Haversine of the central angle between the two points.
    a = np.sin(difflat / 2.0) ** 2 + np.cos(la1) * np.cos(la2) * np.sin(difflon / 2.0) ** 2

    # Floating-point rounding can push sqrt(a) marginally above 1 for
    # near-antipodal points; cap it so arcsin never returns NaN.
    return 2 * radius_km * np.arcsin(np.minimum(1.0, np.sqrt(a)))

In [7]:
# Add the ride distance column: great-circle distance from pickup to dropoff,
# stored as float and rounded to two decimals.
uber2['distance'] = (
    haversine(
        uber2['pickup_latitude'],
        uber2['pickup_longitude'],
        uber2['dropoff_latitude'],
        uber2['dropoff_longitude'],
    )
    .astype(float)
    .round(2)
)

In [8]:
# Preview the frame with the new distance column.
uber2.head(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,1.68
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,2.46
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,5.04
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,1.66
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,4.48


### Standardization

In [9]:
# Feature (trip distance) and target (fare) as column vectors of shape (n, 1).
x = uber2[['distance']].values
y = uber2[['fare_amount']].values

In [10]:
from sklearn.preprocessing import StandardScaler

# Use one scaler per variable. Re-fitting a single instance on x after y would
# overwrite the mean/scale learned from y, so predictions could no longer be
# mapped back to fare units with inverse_transform.
y_scaler = StandardScaler()
x_scaler = StandardScaler()
y_std = y_scaler.fit_transform(y)
x_std = x_scaler.fit_transform(x)

# Backward-compatible alias: `std` previously ended up fitted on x.
std = x_scaler

# NOTE(review): both scalers are fitted on the full dataset before the
# train/test split, so test-set statistics leak into the training data.
# Consider splitting first and fitting the scalers on the training rows only.

#### splitting the dataset

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x_std,
    y_std,
    test_size=0.2,
    random_state=0,
)

#### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (standardized distance -> standardized fare).
l_reg = LinearRegression().fit(X_train, y_train)
# Show the fitted estimator as the cell output.
l_reg

In [13]:
# Predict on the held-out rows and keep actual/predicted values side by side.
ypred = l_reg.predict(X_test)
df = dict(Actual=y_test, Predicted=ypred)
from tabulate import tabulate
# print(tabulate(df, headers = 'keys', tablefmt = 'psql'))

# Report the estimator's score() on both splits.
print("Training set score: {:.2f}".format(l_reg.score(X_train, y_train)))
print("Test set score: {:.7f}".format(l_reg.score(X_test, y_test)))

Training set score: 0.00
Test set score: 0.0003219


### Random forest


In [14]:
# import library for random forest regressor
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Random forest with 100 trees; the fixed seed makes the fitted model reproducible.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=10)

# fit the regressor with training dataset
# y_train is a column vector of shape (n, 1); scikit-learn expects a 1-D target
# here and emits a DataConversionWarning otherwise, so flatten it explicitly.
rf_reg.fit(X_train, y_train.ravel())

  rf_reg.fit(X_train, y_train)


In [19]:
# Score the held-out rows with the fitted random forest.
y_pred_RF = rf_reg.predict(X=X_test)

In [20]:
# Display the random-forest predictions. They are on the standardized fare
# scale, because the model was trained on a split of y_std.
y_pred_RF

array([-0.43274826,  2.12542431, -0.68562706, ...,  1.61206444,
        1.55823308, -0.3214416 ])

# Root mean squared error (RMSE)

In [16]:
from sklearn import metrics

In [21]:
# Compare both models by RMSE on the held-out split.
# y_test comes from the standardized target (y_std), so these errors are in
# standard-deviation units of the fare, not in currency.
print('Root Mean Squared Error of linear regression :', np.sqrt(metrics.mean_squared_error(y_test, ypred)))
print('Root Mean Squared Error of random forest model :', np.sqrt(metrics.mean_squared_error(y_test, y_pred_RF)))

Root Mean Squared Error of linear regression : 1.0157564600107931
Root Mean Squared Error of random forest model : 0.5771200321648626
