In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing as prep
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

In [57]:
# Brining in data file
df = pd.read_csv('Uber Request Data.csv')

In [58]:
# Verifying import
df.head()

Unnamed: 0,Request id,Pickup point,Driver id,Status,Request timestamp,Drop timestamp
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47


In [59]:
# Finding all possible outcomes for 'Status' column
df.Status.unique()

array(['Trip Completed', 'Cancelled', 'No Cars Available'], dtype=object)

In [60]:
# Replacing spaces in columns with underscores
df.columns = df.columns.str.replace(' ','_')
df.head()

Unnamed: 0,Request_id,Pickup_point,Driver_id,Status,Request_timestamp,Drop_timestamp
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47


In [61]:
# Splitting Request_timestamp into individual columns
df[['Req_Date','Request_Time']] = df.Request_timestamp.str.split(" ",expand=True)
df.head()


Unnamed: 0,Request_id,Pickup_point,Driver_id,Status,Request_timestamp,Drop_timestamp,Req_Date,Request_Time
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00,11/7/2016,11:51
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47,11/7/2016,17:57
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58,12/7/2016,9:17
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03,12/7/2016,21:08
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47,13-07-2016,08:33:16


In [62]:
# Splitting Drop_timestamp into individual columns
df[['Drop_Date','Drop_Time']] = df.Drop_timestamp.str.split(" ",expand=True)
df.head()

Unnamed: 0,Request_id,Pickup_point,Driver_id,Status,Request_timestamp,Drop_timestamp,Req_Date,Request_Time,Drop_Date,Drop_Time
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00,11/7/2016,11:51,11/7/2016,13:00
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47,11/7/2016,17:57,11/7/2016,18:47
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58,12/7/2016,9:17,12/7/2016,9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03,12/7/2016,21:08,12/7/2016,22:03
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47,13-07-2016,08:33:16,13-07-2016,09:25:47


In [63]:
# Dropping redundancy
df.drop(columns=['Request_timestamp', 'Drop_timestamp'])


Unnamed: 0,Request_id,Pickup_point,Driver_id,Status,Req_Date,Request_Time,Drop_Date,Drop_Time
0,619,Airport,1.0,Trip Completed,11/7/2016,11:51,11/7/2016,13:00
1,867,Airport,1.0,Trip Completed,11/7/2016,17:57,11/7/2016,18:47
2,1807,City,1.0,Trip Completed,12/7/2016,9:17,12/7/2016,9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016,21:08,12/7/2016,22:03
4,3112,City,1.0,Trip Completed,13-07-2016,08:33:16,13-07-2016,09:25:47
...,...,...,...,...,...,...,...,...
6740,6745,City,,No Cars Available,15-07-2016,23:49:03,,
6741,6752,Airport,,No Cars Available,15-07-2016,23:50:05,,
6742,6751,City,,No Cars Available,15-07-2016,23:52:06,,
6743,6754,City,,No Cars Available,15-07-2016,23:54:39,,


In [64]:
# Converting status options to a mapped value, 1 = completed 2 = incomplete
outcomes = {
    'Trip Completed' : 1, 
    'No Cars Available' : 0, 
    'Cancelled' : 0, 
}

# Adding new column for mapped options
df['mapped_status_type'] = df['Status'].map(outcomes)

# Testing new column placement
df.head()

Unnamed: 0,Request_id,Pickup_point,Driver_id,Status,Request_timestamp,Drop_timestamp,Req_Date,Request_Time,Drop_Date,Drop_Time,mapped_status_type
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00,11/7/2016,11:51,11/7/2016,13:00,1
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47,11/7/2016,17:57,11/7/2016,18:47,1
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58,12/7/2016,9:17,12/7/2016,9:58,1
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03,12/7/2016,21:08,12/7/2016,22:03,1
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47,13-07-2016,08:33:16,13-07-2016,09:25:47,1


### We will not be removing null values as those are crucial to the determination of "Status" in our model