In [213]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

# Load the raw Uber fares data from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="uber-fares-dataset/uber.csv")

## Cleaning the Data

Before we start formally cleaning the data, we made sure to take a look at the data we are working with as well as their data types and potential missing values.

Right off the bat, we noted that some latitude and longitude values fall outside the valid coordinate ranges.
Latitude must lie between -90 and 90, and longitude between -180 and 180. Values such as -3356.66630 for the longitude are not real coordinates and will skew the data, so we will need to remove the affected rows.

In [214]:
# Preview the first rows of the raw frame to see the columns and sample values.
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [215]:
# Summary statistics of the numeric columns; the min/max rows expose out-of-range coordinates.
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


From the .info() output we found that the dropoff_longitude and dropoff_latitude columns each contain one null value. We will need to address that later during our data cleanup.

In [216]:
# Column dtypes and non-null counts, used to locate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


To combat the rather strange values in the latitude and longitude columns, we will need to remove any observational units that do not fall within the specified latitude and longitude range. 

Before doing so, we will first need to drop any null values from our dataset.

In [217]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [218]:
# Drop rows whose pickup coordinates are outside the valid ranges:
# latitude must be within [-90, 90] and longitude within [-180, 180].
# |value| > limit is the same test as (value > limit) or (value < -limit),
# so one mask replaces the four separate in-place drops.
pickup_out_of_range = (
    (df['pickup_latitude'].abs() > 90)
    | (df['pickup_longitude'].abs() > 180)
)
# Rebind instead of mutating in place so re-running the cell is harmless.
df = df.loc[~pickup_out_of_range]


In [219]:
# Drop rows whose dropoff coordinates are outside the valid ranges:
# latitude must be within [-90, 90] and longitude within [-180, 180].
# |value| > limit is the same test as (value > limit) or (value < -limit),
# so one mask replaces the four separate in-place drops.
dropoff_out_of_range = (
    (df['dropoff_latitude'].abs() > 90)
    | (df['dropoff_longitude'].abs() > 180)
)
# Rebind instead of mutating in place so re-running the cell is harmless.
df = df.loc[~dropoff_out_of_range]


From taking a look at .info() and .describe(), we can see that there are no longer any null values and that all the latitude and longitude values fall within the valid ranges. Now we can start with our preprocessing.

In [220]:
# Re-check the non-null counts after removing null and out-of-range rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199987 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         199987 non-null  int64  
 1   key                199987 non-null  object 
 2   fare_amount        199987 non-null  float64
 3   pickup_datetime    199987 non-null  object 
 4   pickup_longitude   199987 non-null  float64
 5   pickup_latitude    199987 non-null  float64
 6   dropoff_longitude  199987 non-null  float64
 7   dropoff_latitude   199987 non-null  float64
 8   passenger_count    199987 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 15.3+ MB


In [221]:
# Re-check the min/max of each coordinate column after cleaning.
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199987.0,199987.0,199987.0,199987.0,199987.0,199987.0,199987.0
mean,27712840.0,11.359849,-72.501786,39.917937,-72.511608,39.922031,1.684544
std,16013840.0,9.901868,10.449955,6.130412,10.412192,6.117669,1.385999
min,1.0,-52.0,-93.824668,-74.015515,-75.458979,-74.01575,0.0
25%,13825540.0,6.0,-73.992064,40.734793,-73.991407,40.733823,1.0
50%,27746260.0,8.5,-73.981822,40.752592,-73.980092,40.753042,1.0
75%,41555540.0,12.5,-73.967154,40.767157,-73.963658,40.768,2.0
max,55423570.0,499.0,40.808425,48.01876,40.831932,45.031598,208.0


## Scaling, Normalizing, and Standardizing Features using sklearn.preprocessing

Before we jump into anything else, we decided on preprocessing the data in order to standardize the dataset. This will make the rest of the process run smoother because the data will "change raw feature vectors into a representation that is more suitable for the downstream estimators." (Source: https://scikit-learn.org/stable/modules/preprocessing.html)

Before running the scaler on the data, we will need to drop any categorical or otherwise non-numeric variables, because the scaler only works with numeric data.

In [222]:
# Separate the non-numeric columns (key, pickup_datetime) from the numeric ones.
# A list is used rather than a set: pandas rejects sets as column indexers in
# recent versions, and a list keeps the column order of df_labels deterministic.
label_columns = ["key", "pickup_datetime"]
# Numeric columns only; this is the input to the scaler.
df_scale = df.drop(columns=label_columns)
# Non-numeric columns kept aside for later use.
df_labels = df[label_columns]

Then, we will run the StandardScaler on the dataframe and check the mean and scale.

In [223]:
# Fit a standard scaler on the numeric frame to learn each column's mean and scale.
scaler = StandardScaler().fit(X=df_scale)

In [224]:
# Per-column means learned by the scaler during fit.
scaler.mean_

array([ 2.77128422e+07,  1.13598491e+01, -7.25017858e+01,  3.99179366e+01,
       -7.25116077e+01,  3.99220310e+01,  1.68454450e+00])

In [225]:
# Per-column scaling factors learned by the scaler during fit.
scaler.scale_

array([1.60138026e+07, 9.90184307e+00, 1.04499290e+01, 6.13039620e+00,
       1.04121664e+01, 6.11765342e+00, 1.38599573e+00])

Next, we will use the fitted scaler to transform the data, store the result in a new array, and check its mean and standard deviation.

In [226]:
# Standardize the numeric columns with the fitted scaler.
df_scaled = scaler.transform(X=df_scale)
# Column-wise means of the standardized data; each should be approximately zero.
np.mean(df_scaled, axis=0)

array([-3.00223820e-18, -4.29195710e-17,  5.18712150e-16, -7.81665581e-16,
       -1.09253047e-17,  1.68160869e-16,  7.56066615e-17])

In [227]:
# Column-wise standard deviations of the standardized data; each should be 1.
df_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1.])

## Use sklearn.tree.DecisionTreeRegressor

In [228]:
# NOTE(review): this cell loads scikit-learn's bundled diabetes dataset, not the
# Uber frame cleaned and scaled above — confirm this is intended.
X, y = load_diabetes(return_X_y=True)
# Fixed seed so the tree is reproducible across runs.
regressor = DecisionTreeRegressor(random_state=0)
# 10-fold cross-validation; one score per fold is displayed.
cross_val_score(estimator=regressor, X=X, y=y, cv=10)

array([-0.39292219, -0.46749346,  0.02768473,  0.06441362, -0.50323135,
        0.16437202,  0.11242982, -0.73798979, -0.30953155, -0.00137327])

In [229]:
# Chain standardization and the decision tree into a single estimator.
# make_pipeline is reached through the imported `pipeline` module; the bare
# name `make_pipeline` is never imported in this notebook and raised NameError.
# The tree is seeded like the regressor in the previous cell for reproducibility.
pipe = pipeline.make_pipeline(StandardScaler(), DecisionTreeRegressor(random_state=0))
pipe.fit(X, y)
# NOTE(review): this scores on the same data the pipeline was fitted on, so the
# result reflects memorization of the training set, not generalization.
pipe.score(X, y)

1.0