Taxi trips data provided by the City of Chicago: https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew#column-menu

In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from bokeh.plotting import figure, output_notebook, show # bokeh plotting library
# We'll show the plots in the cells of this notebook
output_notebook()

sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [21]:
%time

df = pd.read_csv('./Taxi_Trips.csv')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs


In [22]:
list(df)

['Trip ID',
 'Taxi ID',
 'Trip Start Timestamp',
 'Trip End Timestamp',
 'Trip Seconds',
 'Trip Miles',
 'Pickup Census Tract',
 'Dropoff Census Tract',
 'Pickup Community Area',
 'Dropoff Community Area',
 'Fare',
 'Tips',
 'Tolls',
 'Extras',
 'Trip Total',
 'Payment Type',
 'Company',
 'Pickup Centroid Latitude',
 'Pickup Centroid Longitude',
 'Pickup Centroid Location',
 'Dropoff Centroid Latitude',
 'Dropoff Centroid Longitude',
 'Dropoff Centroid  Location',
 'Community Areas']

In [23]:
df.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location,Community Areas
0,d5998fbfffe47c48fea61cfa0288515e3294d26b,b718f0a029913fdc28836e2d29391b1b05110b1690c54b...,07/02/2013 02:15:00 PM,07/02/2013 02:30:00 PM,1440.0,0.0,,,,,...,$35.25,Credit Card,Taxi Affiliation Services,,,,,,,
1,d599926b0698fa3b7dd49f01d785b33a2295d71f,dd7341ff0099d3c6e4f72299b69cd3d8fccb783a22b32c...,11/21/2013 07:30:00 PM,11/21/2013 07:30:00 PM,660.0,2.0,,,,,...,$8.65,Cash,,,,,,,,
2,d599973b32cc994cc6bc26ddda49d084eb2db1d4,5de9514abc0c5b1996ab022380ac899452405559793822...,07/04/2013 09:15:00 PM,07/04/2013 09:15:00 PM,0.0,14.9,,,,,...,$29.85,Cash,,,,,,,,
3,d599982a54b29f5e24986f0702afba9fed42c1b4,1025e3aec9a251a432a6bca2bbe7a95bc032b369fd5be1...,10/09/2015 07:15:00 PM,10/09/2015 07:15:00 PM,0.0,0.0,,,,,...,$11.51,Credit Card,Chicago Elite Cab Corp. (Chicago Carriag,,,,,,,
4,d5999839b78189e4bddb6157473d61cb0de4bd83,6671be9be2eab2a962dc45142b4ce269aabd453d28a3c9...,06/21/2015 02:45:00 AM,06/21/2015 03:15:00 AM,1380.0,0.0,,,,,...,$33.85,Cash,,,,,,,,


In [26]:
df = df.rename(columns={'Dropoff Centroid  Location': 'Dropoff Centroid Location'})

In [33]:
df.dropna(subset=['Pickup Centroid Location', 'Dropoff Centroid Location'], how='all', inplace = True)

In [34]:
df.size

4992

Let's find longitude and latitudes range for pickup and dropoff locations:

In [35]:
print(np.min(df['Pickup Centroid Longitude']), np.min(df['Pickup Centroid Latitude']))
print(np.max(df['Pickup Centroid Longitude']), np.max(df['Pickup Centroid Latitude']))

print(np.min(df['Dropoff Centroid Longitude']), np.min(df['Dropoff Centroid Latitude']))
print(np.max(df['Dropoff Centroid Longitude']), np.max(df['Dropoff Centroid Latitude']))

-87.913624596 41.740205756
-87.551428197 42.009622881
-87.913624596 41.77887686
-87.592310855 42.009622881


Let's define a base plot

In [36]:
Chicago = x_range, y_range = ((-87.913624596, -87.551428197), (41.740205756, 42.009622881))

plot_width = int(1000)
plot_height = int(plot_width//1.2)

def base_plot(tools='pan, wheel_zoom, reset', plot_width=plot_width, plot_height=plot_height, **plot_args):
    p = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
              x_range=x_range, y_range=y_range, outline_line_color=None,
              min_border=0, min_border_left=0, min_border_right=0,
              min_border_top=0, min_border_bottom=0, **plot_args)
    
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    return p

options = dict(line_color=None, fill_color='blue', size=5)

Here is a start locations scatter plot

In [37]:
%%time
# let's plot 10k sample pickup
# samples = df.sample(n=10000)
p = base_plot()

p.circle(x=df['Pickup Centroid Longitude'], y=df['Pickup Centroid Latitude'], **options)
show(p)

CPU times: user 48.2 ms, sys: 4.03 ms, total: 52.2 ms
Wall time: 51.3 ms
