# Uber Fare Prediction

## The main task here is to predict the taxi fare amount

In [1]:
import numpy as np 
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

## Loading Dataset

In [2]:
# Load the training data into `train_df`, reading at most `nrows` rows.
# NOTE(review): the path is absolute, machine-specific and mixes '\' and '/' separators;
# it will presumably need adjusting on any other machine.
# NOTE(review): nrows is 1,000,000 here, but the saved outputs below report 2,000,000 rows,
# so they appear to come from an earlier run with a different nrows — confirm and re-run.
train_df = pd.read_csv(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Dataset/train.csv',nrows = 1000000)

In [3]:
train_df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
train_df.shape

(2000000, 8)

In [5]:
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,2000000.0,2000000.0,2000000.0,1999986.0,1999986.0,2000000.0
mean,11.34779,-72.52321,39.92963,-72.52395,39.92808,1.684113
std,9.852883,12.86804,7.983352,12.77497,10.32382,1.314982
min,-62.0,-3377.681,-3458.665,-3383.297,-3461.541,0.0
25%,6.0,-73.99208,40.73491,-73.99141,40.734,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75312,1.0
75%,12.5,-73.96713,40.7671,-73.96369,40.76809,2.0
max,1273.31,2856.442,2621.628,3414.307,3345.917,208.0


## Data Preprocessing and Visualization

In [6]:
train_df.isnull().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    14
dropoff_latitude     14
passenger_count       0
dtype: int64

### Eliminating *null* values

In [7]:
train_df.dropna(inplace=True)
train_df.isnull().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [8]:
train_df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

### Change required columns into Datetime format

In [9]:
def change_into_datetime(col, fmt=None):
    """Convert column ``col`` of the global ``train_df`` to datetime, in place.

    Parameters
    ----------
    col : str
        Name of the column in ``train_df`` to convert.
    fmt : str, optional
        ``strftime``-style format passed to ``pd.to_datetime``. Supplying it
        avoids per-value format inference, which can be very slow on large
        string columns. ``None`` (the default) keeps the original behaviour
        of letting pandas infer the format.

    Returns
    -------
    None
        ``train_df[col]`` is overwritten with the parsed values.
    """
    train_df[col] = pd.to_datetime(train_df[col], format=fmt)

In [10]:
train_df.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [11]:
# Only 'pickup_datetime' is parsed: 'key' is never used as a datetime and is
# dropped further down, so converting it was wasted work (the saved run of this
# cell ended in a KeyboardInterrupt).
# An explicit format skips pandas' per-value format inference, which is the slow
# path for strings such as '2009-06-15 17:26:21 UTC'; utc=True keeps the result
# timezone-aware (UTC).
train_df['pickup_datetime'] = pd.to_datetime(
    train_df['pickup_datetime'],
    format='%Y-%m-%d %H:%M:%S UTC',
    utc=True,
)

KeyboardInterrupt: 

In [None]:
train_df.dtypes

### Extracting the year, month, date and week day of the ride

In [None]:
# Split the pickup timestamp into calendar features.
train_df['pickup_year'] = train_df['pickup_datetime'].dt.year
train_df['pickup_month'] = train_df['pickup_datetime'].dt.month
train_df['pickup_date'] = train_df['pickup_datetime'].dt.day  # day of the month (1-31)
# dt.dayofweek is the weekday (Monday=0 ... Sunday=6), so the column is named
# accordingly (it was previously mislabelled 'pickup_day_of_month').
train_df['pickup_day_of_week'] = train_df['pickup_datetime'].dt.dayofweek

In [None]:
train_df

### Extracting the Hour, Minute and Seconds value

In [None]:
def extract_hour(train_df,col):
    """Add a ``<col>_hour`` column holding the hour of each value in ``train_df[col]``.

    The frame is modified in place and nothing is returned.
    """
    target_column = col + '_hour'
    hour_values = train_df[col].dt.hour
    train_df[target_column] = hour_values

def extract_minute(train_df,col):
    """Add a ``<col>_minute`` column holding the minute of each value in ``train_df[col]``.

    The frame is modified in place and nothing is returned.
    """
    target_column = col + '_minute'
    minute_values = train_df[col].dt.minute
    train_df[target_column] = minute_values

def extract_second(train_df,col):
    """Add a ``<col>_second`` column holding the second of each value in ``train_df[col]``.

    The frame is modified in place and nothing is returned.
    """
    target_column = col + '_second'
    second_values = train_df[col].dt.second
    train_df[target_column] = second_values

In [None]:
extract_hour(train_df,'pickup_datetime')
extract_minute(train_df,'pickup_datetime')
extract_second(train_df,'pickup_datetime')

In [None]:
# The raw identifier and timestamp columns are removed from the frame in place.
columns_to_remove = ['key','pickup_datetime']
train_df.drop(columns=columns_to_remove, inplace=True)

In [None]:
train_df

### Removing rows whose latitude is greater than 90 or less than -90

In [None]:
train_df['pickup_latitude'].describe()

In [None]:
train_df['dropoff_latitude'].describe()

In [None]:
len(train_df[train_df['pickup_latitude'] < -90 ]) #number of records which has latitude < -90

In [None]:
len(train_df[train_df['pickup_latitude'] > 90 ])  #number of records which has latitude > 90

In [None]:
len(train_df[train_df['dropoff_latitude'] < -90 ]) #number of records which has latitude < -90

In [None]:
len(train_df[train_df['dropoff_latitude']  > 90 ]) #number of records which has latitude > 90

In [None]:
# Valid latitudes lie in [-90, 90]; flag every row whose pickup or dropoff
# latitude falls outside that range and remove those rows in a single pass.
invalid_latitude = (
    (train_df['pickup_latitude'] < -90)
    | (train_df['pickup_latitude'] > 90)
    | (train_df['dropoff_latitude'] < -90)
    | (train_df['dropoff_latitude'] > 90)
)
train_df.drop(train_df[invalid_latitude].index, axis = 0, inplace = True)

### Removing rows whose longitude is greater than 180 or less than -180

In [None]:
train_df['pickup_longitude'].describe()

In [None]:
train_df['dropoff_longitude'].describe()

In [None]:
len(train_df[train_df['pickup_longitude'] < -180])   #number of records which has longitude < -180

In [None]:
len(train_df[train_df['dropoff_longitude'] < -180])   #number of records which has longitude < -180

In [None]:
len(train_df[train_df['pickup_longitude'] > 180])   #number of records which has longitude > 180

In [None]:
len(train_df[train_df['dropoff_longitude'] > 180])   #number of records which has longitude > 180

In [None]:
# Valid longitudes lie in [-180, 180]; remove every row whose pickup OR dropoff
# longitude falls outside that range.
# The dropoff_longitude > 180 case is counted in the cell above but was
# previously never dropped, so those rows leaked into the distance feature.
invalid_longitude = (
    (train_df['pickup_longitude'] < -180)
    | (train_df['pickup_longitude'] > 180)
    | (train_df['dropoff_longitude'] < -180)
    | (train_df['dropoff_longitude'] > 180)
)
train_df.drop(train_df[invalid_longitude].index, axis = 0, inplace = True)

In [None]:
train_df.shape

### Calculating the Haversine Distance

In [None]:
def haversine_distance(pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,df=None):
    """Compute the great-circle (haversine) distance in kilometres for each ride.

    Parameters
    ----------
    pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude : str
        Names of the columns holding the coordinates, in decimal degrees.
    df : pandas.DataFrame, optional
        Frame to read the coordinates from and to write the result to.
        Defaults to the notebook-level ``train_df``, which is the frame the
        function has always operated on.

    Returns
    -------
    pandas.Series
        Distance in kilometres for every row. The same values are also stored
        in ``df['harvesine_distance']`` (the frame is modified in place).
    """
    if df is None:
        df = train_df  # default: the global training frame

    R = 6371 # radius of the earth in kilometers
    phi1 = np.radians(df[pickup_latitude])
    phi2 = np.radians(df[dropoff_latitude])

    delta_phi = np.radians(df[dropoff_latitude] - df[pickup_latitude])
    delta_lambda = np.radians(df[dropoff_longitude] - df[pickup_longitude])

    a = np.sin(delta_phi / 2.0) **2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which would
    # make sqrt(1 - a) return NaN; clamp it to keep the result finite.
    a = a.clip(lower=0.0, upper=1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    d = R * c

    # NOTE(review): the column name is misspelled ('harvesine'); it is kept
    # unchanged so any consumer of this feature name keeps working.
    df['harvesine_distance'] = d

    return d

In [None]:
haversine_distance('pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

In [None]:
train_df

### Removing all the rows where the passenger count is more than 6

In [None]:
train_df['passenger_count'].describe()

In [None]:
train_df[train_df['passenger_count'] > 6].count()

In [None]:
train_df.drop(train_df[train_df['passenger_count'] > 6].index, axis = 0,inplace = True)

In [None]:
train_df[train_df['passenger_count'] > 6].count()

### Preprocessing the target column -> Fare amount

In [None]:
train_df['fare_amount'].describe()

In [None]:
from collections import Counter
Counter(train_df['fare_amount'] < 0)  # tally of rows with a negative fare (True) vs. the rest (False)

In [None]:
train_df = train_df.drop(train_df[train_df['fare_amount'] < 0].index, axis = 0)
train_df.shape

In [None]:
train_df['fare_amount'].sort_values(ascending = False)

## Visualizing the data

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(x ='passenger_count', y ='fare_amount', data = train_df.sort_values('fare_amount', ascending=False))

plt.savefig(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Images\boxplot_1.png')

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(x ='pickup_date', y ='fare_amount', data = train_df.sort_values('fare_amount', ascending=False))

plt.savefig(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Images\boxplot_2.png')

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(x ='pickup_datetime_hour', y ='fare_amount', data = train_df.sort_values('fare_amount', ascending=False))

plt.savefig(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Images\boxplot_3.png')

In [None]:
train_df.head()

## Dealing with Outliers

In [None]:
def plot(df,col):
    """Draw the distribution of ``df[col]`` above a box plot of the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to visualise.
    col : str
        Name of the column to plot.

    Creates a new two-row figure; nothing is returned.
    """
    fig, (ax1,ax2) = plt.subplots(2,1)
    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot(..., kde=True) once the installed seaborn version is confirmed.
    sns.distplot(df[col], ax=ax1)
    # Pass the values explicitly as `x`: a positional argument is read as `x` by
    # some seaborn versions and as `data` by others, which changes the
    # orientation of the box.
    sns.boxplot(x=df[col], ax=ax2)

In [None]:
plot(train_df,'fare_amount')

plt.savefig(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Images\outlier_plot.png')

In [None]:
train_df['fare_amount'] = np.where(train_df['fare_amount']>=600,train_df['fare_amount'].median(),train_df['fare_amount'])

In [None]:
plot(train_df,'fare_amount')

plt.savefig(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Images\final_outlier_plot.png')

In [None]:
train_df.shape

## Splitting the data

In [None]:
X = train_df.drop('fare_amount', axis=1)
X.head()

In [None]:
X.shape

In [None]:
y = train_df['fare_amount']
y.head()

### Applying Train_Test_Split

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric reported below, reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Model training

### Defining a helper that fits any given algorithm and reports the r2_score, MAE, MSE and RMSE


In [None]:
def predict(algorithm):
    """Fit ``algorithm`` on the training split and report how it does on the test split.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the training score, the raw predictions, r2, MAE, MSE and RMSE, then
    plots the distribution of the residuals (``y_test - predictions``).
    Nothing is returned.
    """
    fitted = algorithm.fit(X_train,y_train)
    train_score = fitted.score(X_train,y_train)
    print('Training Score: {}'.format(train_score))

    y_hat = fitted.predict(X_test)
    print('Predictions are: {}'.format(y_hat))
    print('\n')

    r2 = metrics.r2_score(y_test,y_hat)
    print('r2_score is:{}'.format(r2))

    mae = metrics.mean_absolute_error(y_test,y_hat)
    print('MAE:',mae)
    # The MSE is computed once and reused for the RMSE.
    mse = metrics.mean_squared_error(y_test,y_hat)
    print('MSE:',mse)
    print('RMSE:',np.sqrt(mse))

    residuals = y_test-y_hat
    sns.distplot(residuals)

## Using RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

predict(RandomForestRegressor())

## Using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

predict(LinearRegression())

## Using KNNs

In [None]:
from sklearn.neighbors import KNeighborsRegressor

predict(KNeighborsRegressor())

## Using Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeRegressor 

predict(DecisionTreeRegressor())

## HyperParameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
n_estimators = [int(x) for x in np.linspace(start=100,stop=1200,num=6)]
max_depth = [int(x) for x in np.linspace(start=5,stop=30,num=4)]

In [None]:
# Search space sampled by RandomizedSearchCV.
random_params = {
    'n_estimators':n_estimators,
    # 1.0 means "consider all features", which is what 'auto' meant for forest
    # regressors; 'auto' itself is deprecated and rejected by newer scikit-learn.
    'max_features':[1.0,'sqrt'],
    'max_depth':max_depth,
    'min_samples_split':[5,10,15,100]
}

In [None]:
random_params

In [None]:
rf = RandomForestRegressor()

In [None]:
# Randomized search over `random_params` with 3-fold cross-validation on all cores.
# A fixed random_state makes the sampled candidates (and thus best_params_)
# repeatable between runs.
tuned_rf =  RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_params,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [None]:
tuned_rf.fit(X_train,y_train)

In [None]:
tuned_rf.best_params_

In [None]:
predictions = tuned_rf.predict(X_test)

In [None]:
sns.distplot(y_test-predictions)

In [None]:
metrics.r2_score(y_test,predictions)

In [None]:
print('MAE:',metrics.mean_absolute_error(y_test,predictions))
print('MSE:',metrics.mean_squared_error(y_test,predictions))
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test,predictions)))

In [None]:
import pickle

file = open(r'C:\Users\KIIT\Documents\LGM-Soc contributions\Uber Fare Prediction\Model/hypertuned_rf_model.pkl','wb')

In [None]:
# Serialise the fitted search object, then always close the handle so the bytes
# are flushed to disk and the file is not left open by the kernel.
try:
    pickle.dump(tuned_rf, file)
finally:
    file.close()