# Cleaning the data
Question in mind - Can we accurately predict how much a driver will receive in tips based on the pick-up location, the distance traveled to the destination and the time spent?


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.path as mplPath
#import rtree
#import fiona.crs
#import geopandas as gpd
#import pyproj
#import shapely.geometry as geom
%matplotlib inline

In [2]:
# Load the January 2013 trip CSV from the local datasets folder.
january_2013_csv = "datasets/yellow_tripdata_2013-01.csv"
january2013 = pd.read_csv(january_2013_csv)

KeyboardInterrupt: 

Filter the data to only include coordinates within New York and drop the columns we don't need,
such as the vendor_id. Remove bad passenger counts: you can't get tipped with 0 passengers, nor can a taxi hold 255 people at one time.

In [3]:
# Keep only trips whose pickup point lies strictly inside the longitude /
# latitude box used for New York in this notebook.
pickup_in_nyc = (
    (january2013['pickup_longitude'] > -74.06)
    & (january2013['pickup_longitude'] < -73.77)
    & (january2013['pickup_latitude'] > 40.61)
    & (january2013['pickup_latitude'] < 40.91)
)
# Passenger counts must be between 1 and 6 inclusive.
plausible_passengers = (
    (january2013['passenger_count'] > 0)
    & (january2013['passenger_count'] < 7)
)
january2013 = january2013.loc[pickup_in_nyc & plausible_passengers]

# reset_index(drop=True) renumbers the rows without creating a throwaway
# 'index' column. drop(columns=...) replaces the positional axis argument
# (`drop(label, 1)`), which pandas 2.0 no longer accepts.
january2013 = january2013.reset_index(drop=True).drop(
    columns=[
        'vendor_id',
        'rate_code',
        'store_and_fwd_flag',
        'fare_amount',
        'surcharge',
        'mta_tax',
        'tolls_amount',
    ]
)

NameError: name 'january2013' is not defined

In [None]:
# Sanity check: print the (rows, columns) of the filtered frame, then preview it.
print(january2013.shape)
january2013.head()

Code to convert longitude/latitude to block id, might be useful later, we'll see

In [None]:
def indexZones(shapeFilename):
    """Read the zone geometries and build an R-tree over their bounding boxes.

    The file is read with geopandas and reprojected to EPSG:2263. Each
    geometry's bounds are inserted into the R-tree under the geometry's
    position in ``zones.geometry``.

    NOTE: relies on ``rtree``, ``gpd`` and ``fiona``, whose imports are
    commented out in the notebook's import cell.

    Returns:
        A ``(index, zones)`` tuple: the R-tree and the reprojected frame.
    """
    spatial_index = rtree.Rtree()
    target_crs = fiona.crs.from_epsg(2263)
    zones = gpd.read_file(shapeFilename).to_crs(target_crs)
    position = 0
    for shape in zones.geometry:
        spatial_index.insert(position, shape.bounds)
        position += 1
    return (spatial_index, zones)

In [None]:
def findBlock(p, index, zones):
    """Return the OBJECTID of the zone whose outline contains point ``p``.

    Args:
        p: point object exposing ``x`` and ``y`` attributes.
        index: spatial index whose ``intersection`` method yields candidate
            positions for a bounding box.
        zones: frame with a ``geometry`` column and an ``'OBJECTID'`` column,
            both addressable by the positions stored in ``index``.

    Returns:
        The matching ``OBJECTID``, or -1 when no candidate outline contains
        the point.
    """
    point_xy = (p.x, p.y)
    # A degenerate box (the point itself) asks the index for every zone
    # whose bounding box covers the point.
    for idx in index.intersection((p.x, p.y, p.x, p.y)):
        # Read the vertices through ``.coords`` and pass the point as an
        # explicit (x, y) pair: converting geometry objects directly with
        # np.array() depends on an array interface that Shapely 2.0 removed.
        outline = mplPath.Path(np.array(zones.geometry[idx].exterior.coords))
        if outline.contains_point(point_xy):
            return zones['OBJECTID'][idx]
    return -1

In [None]:
def mapToZone(parts):
    """Print the block id found for each record's pickup point.

    ``parts`` is iterated once; every item is indexed by the keys
    'pickup_longitude' and 'pickup_latitude'. Records where either value is
    falsy are skipped. The remaining coordinates are projected with the
    EPSG:2263 projection, looked up with findBlock, and the resulting block
    id is printed when it is non-negative.

    NOTE: relies on ``pyproj`` and ``geom``, whose imports are commented out
    in the notebook's import cell.
    """
    projection = pyproj.Proj(init="epsg:2263", preserve_units=True)
    spatial_index, zones = indexZones('datasets/block-groups-polygons.geojson')
    for record in parts:
        if not (record['pickup_longitude'] and record['pickup_latitude']):
            continue
        projected = projection(
            float(record['pickup_longitude']),
            float(record['pickup_latitude']),
        )
        block_id = findBlock(geom.Point(projected), spatial_index, zones)
        if block_id >= 0:
            print (block_id)
                
#mapToZone(data_pd.head(20).T.to_dict().values())

Looking at the time value, we find the day of the week, time spent to reach destination, and hour. We will insert all of these as new columns.

From datetime of pick-up we'll be breaking it down into month, day, day of the week, time in minutes

In [None]:
def add_data(df):
    """Add pickup-time features and the trip duration to ``df``.

    The columns are written onto ``df`` itself, which is also returned.

    Args:
        df: DataFrame with 'pickup_datetime' and 'dropoff_datetime' columns
            that ``pd.to_datetime`` can parse.

    Returns:
        ``df`` with these columns added:
            weekday    - day of week of the pickup (Monday == 0)
            month      - month of the pickup
            hour       - hour of day of the pickup
            day        - day of month of the pickup
            minutes    - minutes since midnight at pickup
            time_spent - whole minutes elapsed between pickup and dropoff
    """
    # Parse each timestamp column once and reuse the result.
    pickup_time = pd.to_datetime(df['pickup_datetime'])
    dropoff_time = pd.to_datetime(df['dropoff_datetime'])

    # NOTE(review): the original had a bare `df['trip_distance']*0.621371`
    # here whose result was discarded, so 'trip_distance' was never rescaled.
    # The column is still left untouched; confirm its units before adding a
    # real conversion.

    df['weekday'] = pickup_time.dt.dayofweek
    df['month'] = pickup_time.dt.month
    df['hour'] = pickup_time.dt.hour
    df['day'] = pickup_time.dt.day
    df['minutes'] = pickup_time.dt.hour * 60 + pickup_time.dt.minute

    # Total elapsed minutes. Converting the timedelta to a datetime and
    # reading `.dt.minute` only yields the minute-of-hour component (a
    # 75-minute trip became 15) and raises TypeError on current pandas.
    df['time_spent'] = (dropoff_time - pickup_time).dt.total_seconds() // 60
    return df

In [None]:
# Attach the time-derived columns (weekday, month, hour, day, minutes, time_spent).
january2013 = add_data(january2013)

In [None]:
# Preview the frame to check the columns added by add_data.
january2013.head()

Does payment type affect the tips?

"CRD" -- card, debit or credit

"CSH" -- cash

"DIS" -- disputed fare 

"NOC" -- no charge

"UNK" -- unknown

In [None]:
# List the distinct payment type codes present in the data.
payment_type_codes = january2013['payment_type'].unique()
print(payment_type_codes)

In [None]:
# Group once by payment type; `paymentandtips` is reused by the following cell.
paymentandtips = january2013.groupby('payment_type')

# Count the trips per payment type once and reuse the result for both the
# printed table and the chart.
trips_per_payment_type = paymentandtips.size()
print(trips_per_payment_type)

# Draw the chart directly: wrapping .plot() in print() only added the Axes
# repr to the output. The trailing semicolon suppresses that repr as well.
trips_per_payment_type.plot(kind='bar');

In [None]:
# Mean tip_amount per payment type, computed once for the table and the chart.
mean_tip_per_payment_type = paymentandtips.tip_amount.mean()
print(mean_tip_per_payment_type)

# Plot directly instead of print(...plot()), which only printed the Axes repr.
mean_tip_per_payment_type.plot(kind='bar');

The two most common payment types are CRD and CSH, but CRD and UNK are the payment types with the highest average tips. That probably isn't interesting to explore since UNK is a very uncommon payment type, so let's look at the tip percentage, which is

(tip)/(total cost)*100

which is the average tips a person gives in relation to the total cost of the ride

In [None]:
# Tip as a percentage of the total amount paid, stored per trip in 'avg_tip'.
tip_share = january2013['tip_amount'] / january2013['total_amount']
avgtips = tip_share * 100
january2013['avg_tip'] = avgtips
january2013.head()

Create a new data set that only includes where tips > 0.0

In [None]:
# Subset of trips with a strictly positive tip.
# reset_index(drop=True) renumbers the rows without first creating an
# 'index' column and then removing it with the positional-axis form
# `drop('index', 1)`, which pandas 2.0 no longer accepts.
january2013_remove_non_tips = (
    january2013.loc[january2013['tip_amount'] > 0.0]
    .reset_index(drop=True)
)
january2013_remove_non_tips.head()

Does passenger count have any correlation to amount of tips given?

In [None]:
# Group every trip (tipped or not) by passenger count.
peopleandtips = january2013.groupby('passenger_count')
print(peopleandtips.size())

# Mean tip_amount per passenger count, computed once for the table and chart.
mean_tip_per_passenger_count = peopleandtips.tip_amount.mean()
print(mean_tip_per_passenger_count)

# Plot directly instead of print(...plot()), which only printed the Axes repr.
mean_tip_per_passenger_count.plot(kind='bar');

Looking at the graph above, which shows the average tip by passenger count including passengers who don't tip, the averages are about the same across passenger counts. The graph below, which excludes trips without a tip, looks about the same, so passenger_count is irrelevant.

In [None]:
# Same breakdown as the cell before, restricted to trips with a positive tip.
peopleandtips = january2013_remove_non_tips.groupby('passenger_count')
print(peopleandtips.size())

# Mean tip_amount per passenger count, computed once for the table and chart.
mean_tip_per_passenger_count = peopleandtips.tip_amount.mean()
print(mean_tip_per_passenger_count)

# Plot directly instead of print(...plot()), which only printed the Axes repr.
mean_tip_per_passenger_count.plot(kind='bar');

In [None]:
# Remove passenger_count from both frames. drop(columns=...) replaces the
# positional axis argument (`drop(label, 1)`), which pandas 2.0 no longer
# accepts.
january2013 = january2013.drop(columns=['passenger_count'])
january2013_remove_non_tips = january2013_remove_non_tips.drop(columns=['passenger_count'])
january2013.head()

Let's look at Distance Traveled (km) and Tip Percentage (Frequency of tips)

In [None]:
#january2013_remove_non_tips.plot(x='trip_distance',y='avg_tip',style=['o','rx'])

Can we estimate how much a taxi driver will be given in tips based on the day of the week, the hour of the pick-up, the pick-up location, how much time the passenger spent in the taxi and how far they went?

In [None]:
# Build a "latitude,longitude" text key per row so trips can be grouped by
# pickup location.
latitude_text = january2013['pickup_latitude'].map(str)
longitude_text = january2013['pickup_longitude'].map(str)
january2013['pickup'] = latitude_text + ',' + longitude_text
january2013.head()

In [None]:
# Columns carried into the tip analysis below.
bayes_columns = [
    'pickup_longitude',
    'pickup_latitude',
    'trip_distance',
    'weekday',
    'pickup',
    'hour',
    'time_spent',
    'tip_amount',
]
bayesJanuary = january2013[bayes_columns]
print(bayesJanuary.shape)
bayesJanuary.head()

Break it into steps

If we just look at the day of the week they were picked up, how much would a taxi driver be given?

In [None]:
# Group by weekday once and reuse the grouping for the counts and the chart.
trips_by_weekday = bayesJanuary.groupby('weekday')
print(trips_by_weekday.size())

# Plot directly instead of print(...plot()), which only printed the Axes repr.
trips_by_weekday.tip_amount.mean().plot(kind='bar');

If we just look at the day of the week they were picked up and the hour they were picked up, how much would a taxi driver be given?

In [None]:
# Group by (weekday, hour) once and reuse the grouping for the counts and
# the chart.
trips_by_weekday_hour = bayesJanuary.groupby(['weekday', 'hour'])
print(trips_by_weekday_hour.size())

# Plot directly instead of print(...plot()), which only printed the Axes repr.
trips_by_weekday_hour.tip_amount.mean().plot(kind='bar');

Let's look into one day of the week to understand what's going on...

In [None]:
# weekday == 0 selects the trips picked up on a Monday.
is_monday = bayesJanuary['weekday'] == 0
bayesJanuarymonday = bayesJanuary.loc[is_monday]
print(bayesJanuarymonday.shape)
bayesJanuarymonday.head()

In [None]:
# Group the Monday trips by pickup hour once and reuse the grouping.
monday_trips_by_hour = bayesJanuarymonday.groupby('hour')
print(monday_trips_by_hour.size())

# Plot directly instead of print(...plot()), which only printed the Axes repr.
monday_trips_by_hour.tip_amount.mean().plot(kind='bar');

So for each hour we can see the average tips; at 5am a taxi driver can get the highest tips overall. What if we look into each location at 5am?

In [None]:
# Narrow the Monday trips down to pickups in the 5 o'clock hour.
is_5am = bayesJanuarymonday['hour'] == 5
bayesJanuarymonday5am = bayesJanuarymonday.loc[is_5am]
print(bayesJanuarymonday5am.shape)
bayesJanuarymonday5am.head()

In [None]:
#print(bayesJanuarymonday5am.groupby('pickup').size())
# Per-pickup-location means of tip, trip time and distance for the Monday
# 5am trips. Each result is a Series keyed by the 'pickup' string.
averagetipsperlocation = bayesJanuarymonday5am.groupby('pickup').tip_amount.mean()
averagetimeperlocation = bayesJanuarymonday5am.groupby('pickup').time_spent.mean()
averagedistancesperlocation = bayesJanuarymonday5am.groupby('pickup').trip_distance.mean()

# Plain-list copies of the values, in the same order as the grouped Series.
avg_tip_loc_array = list(averagetipsperlocation)
avg_time_loc_array = list(averagetimeperlocation)
avg_distance_loc_array = list(averagedistancesperlocation)

In [None]:
# Keep one row per pickup location. The explicit copy makes the column
# assignments below operate on an independent frame rather than a slice.
bayesJanuarymonday5am = bayesJanuarymonday5am.drop_duplicates('pickup').copy()

# Look each location's averages up by its 'pickup' key. The grouped means
# are ordered by sorted key, whereas drop_duplicates keeps rows in
# first-occurrence order, so assigning the value lists positionally would
# attach averages to the wrong locations.
bayesJanuarymonday5am['avg_tip'] = bayesJanuarymonday5am['pickup'].map(averagetipsperlocation)
bayesJanuarymonday5am['avg_time_spent'] = bayesJanuarymonday5am['pickup'].map(averagetimeperlocation)
bayesJanuarymonday5am['avg_distance'] = bayesJanuarymonday5am['pickup'].map(averagedistancesperlocation)
bayesJanuarymonday5am.head()

In [None]:
# Locations whose average tip is strictly positive.
has_positive_avg_tip = bayesJanuarymonday5am['avg_tip'] > 0.0
bayesJanuarymonday5am_remove_non_tips = bayesJanuarymonday5am.loc[has_positive_avg_tip]
bayesJanuarymonday5am_remove_non_tips.head()

In [None]:
# Compare the frame sizes before and after removing locations with a zero average tip.
bayesJanuarymonday5am.shape, bayesJanuarymonday5am_remove_non_tips.shape

We now know the average tips given to a taxi driver based on day of the week and the hour the customer was picked up for each location at 5 am in the morning on a Monday