# Tutorial: Interactive Data Viz and Time Series Analysis
## Created by Sarah Shy
#### March 12, 2018
#### 15-388 Practical Data Science, CMU

# 1. Load the Data

https://austinbcycle.com/how-it-works/faqs

First, we load the libraries and data into our environment.

Note that if you have not installed plotly, you can install by running the following code in your terminal:

\$ pip install plotly 
or 
\$ sudo pip install plotly 

To install cufflinks: ! pip install cufflinks --upgrade


In [1]:
#load packages
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
import numpy as np
import plotly.tools as tls

In [2]:
#load data
bike_trips = pd.read_csv("austin_bikeshare_trips.csv")
bike_stations = pd.read_csv("austin_bikeshare_stations.csv")

The _bike trips_ dataset contains the following variables:
-  bikeid: integer id of bike
-  checkout_time: HH:MM:SS, see start time for date stamp
-  duration_minutes: integer minutes of trip duration
-  end_station_id: integer id of end station
-  end_station_name: string of end station name
-  month: month, integer (1 = January)
-  start_station_id: integer id of start station
-  start_station_name: string of start station name
-  start_time: YYYY-MM-DD HH:MM:SS, string
-  subscriber_type: membership type
-  trip_id: unique trip id, int
-  year: year of trip, int


The _bike stations_ dataset contains the following variables:
-  latitude: geospatial latitude, precision to 5 places
-  location: (lat, lon)
-  longitude: geospatial longitude, precision to 5 places
-  name: station name, str
-  stations_id: unique station id, int
-  status: station status (active, closed, moved, ACL-only), ACL is a music festival

_Note: variable definitions were taken from the dataset's Kaggle description: https://www.kaggle.com/jboysen/austin-bike._

In [None]:
#
print(bike_trips.head())
bike_stations.head()
#list(bike_data)
#type(bike_data['start_time'][0]) #string
#bike_trips.shape #649231 by 12
#bike_stations.shape #72 by 6

| This | is   |
|------|------|
|   a  | table|

In [4]:
from IPython.display import display, HTML

display(bike_trips.head())
display(bike_stations.head())

Unnamed: 0,bikeid,checkout_time,duration_minutes,end_station_id,end_station_name,month,start_station_id,start_station_name,start_time,subscriber_type,trip_id,year
0,8.0,19:12:00,41,2565.0,Trinity & 6th Street,3.0,2536.0,Waller & 6th St.,2015-03-19 19:12:00,Walk Up,9900082882,2015.0
1,141.0,2:06:04,6,2570.0,South Congress & Academy,10.0,2494.0,2nd & Congress,2016-10-30 02:06:04,Local365,12617682,2016.0
2,578.0,16:28:27,13,2498.0,Convention Center / 4th St. @ MetroRail,3.0,2538.0,Bullock Museum @ Congress & MLK,2016-03-11 16:28:27,Local365,9075366,2016.0
3,555.0,15:12:00,80,2712.0,Toomey Rd @ South Lamar,11.0,2497.0,Capitol Station / Congress & 11th,2014-11-23 15:12:00,24-Hour Kiosk (Austin B-cycle),9900319298,2014.0
4,86.0,15:39:13,25,3377.0,MoPac Pedestrian Bridge @ Veterans Drive,4.0,2707.0,Rainey St @ Cummings,2017-04-16 15:39:13,Walk Up,14468597,2017.0


Unnamed: 0,latitude,location,longitude,name,station_id,status
0,30.27041,(30.27041 -97.75046),-97.75046,West & 6th St.,2537,active
1,30.26452,(30.26452 -97.7712),-97.7712,Barton Springs Pool,2572,active
2,30.27595,(30.27595 -97.74739),-97.74739,ACC - Rio Grande & 12th,2545,closed
3,30.2848,(30.2848 -97.72756),-97.72756,Red River & LBJ Library,1004,closed
4,30.26694,(30.26694 -97.74939),-97.74939,Nueces @ 3rd,1008,moved


In [3]:
# TAKE CARE OF MISSING DATA

len(bike_trips[pd.isnull(bike_trips).any(axis=1)])
#there are 67606 rolws in bike trips with missing data

len(bike_stations[pd.isnull(bike_stations).any(axis=1)])
#there are 0 rows in bike stations with missing data

#REMOVE MISSING DATA ROWS FROM BIKE_TRIPS
bike_trips_clean = bike_trips.copy()
bike_trips_clean = bike_trips.dropna()
#bike_trips_clean = bike_trips[bike_trips.year == 2017.0]
#bike_trips_clean.shape

# 2. Plotting Trip Duration

In [None]:
#PLOTTING duration of trips. We cut off at 6 hours because there's a long tail with few data points. With membership, maximum is 1 hour before charging extra.

duration_data = bike_trips_clean.duration_minutes[bike_trips_clean.duration_minutes <= 360]
duration_data.iplot(kind = 'histogram', filename = "trip-duration", bins = 72)


#bike_trips.duration_minutes.iplot(kind = 'histogram',  filename='simple histogram') #uses cufflink

In [5]:
import IPython
iframe = '<iframe width="900" height="500" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/44.embed"></iframe>'
IPython.display.HTML(iframe)

So mostly short trips with 5-9 being most common trip time with over 132,000 trips. *******Compare to table

But what if interested in how the duration changes with time of year? Use 'restyle' dropdowns.

# Adding a Dropdown for Season

In [10]:
#create df

df = pd.DataFrame({'fall': bike_trips_clean.duration_minutes[bike_trips_clean.month.between(9,11)][bike_trips_clean.duration_minutes <= 180],
                   'winter': bike_trips_clean.duration_minutes[bike_trips_clean.month.isin([12,1,2])][bike_trips_clean.duration_minutes <= 180],
                   'spring': bike_trips_clean.duration_minutes[bike_trips_clean.month.between(3,5)][bike_trips_clean.duration_minutes <= 180],
                   'summer': bike_trips_clean.duration_minutes[bike_trips_clean.month.between(6,8)][bike_trips_clean.duration_minutes <= 180]
                  })

573479

In [None]:
#plot

updatemenus = list([
    dict(active = -1,
        buttons=list([
            dict(label = 'All',
                method = 'update',
                args = [{'visible': [True, True, True, True]},
                        {'title': "Duration of Trips by Season"}]),
            dict(label = 'Fall',
                method = 'update',
                args = [{'visible': [True, False, False, False]},
                        {'title': 'Duration of Trips in Fall'}]),
            dict(label = 'Winter',
                method = 'update',
                args = [{'visible': [False, False, False, True]},
                        {'title': 'Duration of Trips in Fall'}]),
            dict(label = 'Spring',
                method = 'update',
                args = [{'visible': [False, True, False, False]},
                        {'title': 'Duration of Trips in Spring'}]),
            dict(label = 'Summer',
                method = 'update',
                args = [{'visible': [False, False, True, False]},
                        {'title': 'Duration of Trips in Spring'}])
        ]),)
])

layout = dict(title = "Duration of Trips", updatemenus = updatemenus, yaxis = dict(range = [0, 50000]), barmode='overlay', showlegend = False)


df.iplot(kind='histogram', barmode='overlay', filename = 'duration_by_season', layout=layout, bins = 72, shared_yaxes=True, theme = 'solar')
     #vline=[dict(x=df.a.mean(),color='#5283AD'), dict(x=df.b.mean(),color='#FDAB5A')])

In [9]:
iframe2 = '<iframe width="900" height="500" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/60.embed"></iframe>'
IPython.display.HTML(iframe2)

Fall and Spring the most popular times to use bike share.

# 3. Plotting Number of Rides Over Time

In [17]:
# get date

bike_trips_clean['start_time'] = pd.to_datetime(bike_trips_clean['start_time']) #convert to datetime object
bike_trips_clean['date'] = bike_trips_clean.start_time.dt.date



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [12]:
# get trip counts and create df for plotting

trip_count = bike_trips_clean.groupby(['date']).count()

trips_time_df = pd.DataFrame({'trip_count': trip_count.start_time})
print(trips_time_df.head())

            trip_count
date                  
2013-12-21         103
2013-12-22         117
2013-12-23          96
2013-12-24          85
2013-12-25         145


In [None]:
#plot
trips_time_df.iplot(filename = 'trips_over_time')

In [22]:
iframe3 = '<iframe width="900" height="500" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/69.embed"></iframe>'
IPython.display.HTML(iframe3)

# Hard to see, add slider

In [18]:
layout = dict(
    title='Time Series with Rangeslider',
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label='1m',
                     step='month',
                     stepmode='backward'),
                dict(count=6,
                     label='6m',
                     step='month',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(),
        type='date'
    )
)

trips_time_df.iplot(layout = layout, filename = 'trips_over_time_slider')

In [24]:
iframe4 = '<iframe width="900" height="600" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/71.embed"></iframe>'
IPython.display.HTML(iframe4)

Interestingly, huge spikes in end of March, beginning of April every year.

# Map
Using MapBox since Plotly can only plot country level (i.e. if mapping out USA)

In [61]:
bike_stations.head()

Unnamed: 0,latitude,location,longitude,name,station_id,status
0,30.27041,(30.27041 -97.75046),-97.75046,West & 6th St.,2537,active
1,30.26452,(30.26452 -97.7712),-97.7712,Barton Springs Pool,2572,active
2,30.27595,(30.27595 -97.74739),-97.74739,ACC - Rio Grande & 12th,2545,closed
3,30.2848,(30.2848 -97.72756),-97.72756,Red River & LBJ Library,1004,closed
4,30.26694,(30.26694 -97.74939),-97.74939,Nueces @ 3rd,1008,moved


In [4]:
#groupby pandas to get the count at each station
station_count_df = pd.DataFrame({'station_trip_count': bike_trips.groupby(['start_station_name']).size()})

#join with bike_stations dataframe because we'll need the gps coordinates to plot
bike_stations_count = bike_stations.join(station_count_df, on = 'name')

#check that all stations from bike_trips and bike_stations matched up
print(len(bike_stations) == len(bike_stations_count)) #72 bike stations

(bike_stations_count['station_trip_count'].mean())

True


9002.6

In [63]:
limits = [(0,500),(501,1000),(1001,5000),(5001,10000),(10001,30000)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","lightgrey"]
stations = []
scale = 1

for i in range(len(limits)):
    lim = limits[i]
    df_sub = bike_stations_count[lim[0]:lim[1]] #### order inc first???
    station = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['name'],
        marker = dict(
            size = df_sub['station_trip_count']/scale,
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'), ##### do I need this??
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1]) )
    stations.append(station)

layout = dict(
        title = 'Start Station Popularity<br>(Click legend to toggle traces)',
        showlegend = True,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict( data=stations, layout=layout )
py.iplot(fig, validate=False, filename='stations-bubble-map' )

In [34]:
from plotly.graph_objs import *

mapbox_access_token = 'pk.eyJ1Ijoic2FyYWhzaHkiLCJhIjoiY2pmYTFtZmtuMnRnbzJ3bG45dDV0MmhtcSJ9.FP2A6vyZ1u1Bm2SL3z84gQ'
scale = 500

data = Data([
    Scattermapbox(
        lat = bike_stations_count['latitude'],
        lon = bike_stations_count['longitude'],
        mode = 'markers',
        marker = Marker(
            size = (bike_stations_count['station_trip_count'])/scale,
            color = 'orange'
        ),
        text = bike_stations_count['name'],
        hoverinfo = 'station_trip_count'
    )
])
layout = Layout(
    title = 'Start Station Popularity',
    autosize = True,
    hovermode = 'closest',
    mapbox = dict(
        accesstoken = mapbox_access_token,
        bearing = 0,
        center = dict(
            lat = 30.267,
            lon = -97.743
        ),
        pitch = 0,
        zoom = 12
    ),
)

fig = dict(data = data, layout = layout)
py.iplot(fig, filename = 'stations-bubble-map-mapbox')

In [25]:
limits = [(0,500),(501,1000),(1001,5000),(5001,10000),(10001,30000)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","blue"]

color = []
for count in bike_stations_count['station_trip_count']:
    if count < 500: color.append(colors[0])
    elif count < 1000: color.append(colors[1])
    elif count < 5000: color.append(colors[2])
    elif count < 10000: color.append(colors[3])
    else: color.append(colors[4])

bike_stations_count['color'] = color #add as column of df

data = Data([
    Scattermapbox(
        lat = bike_stations_count['latitude'],
        lon = bike_stations_count['longitude'],
        mode = 'markers',
        marker = Marker(
            size = bike_stations_count['station_trip_count']/scale,
            color = bike_stations_count['color']
        ),
        text = bike_stations_count['name'],
    )
])
layout = Layout(
    autosize = True,
    hovermode = 'closest',
    showlegend = True,
    mapbox = dict(
        accesstoken = mapbox_access_token,
        bearing = 0,
        center = dict(
            lat = 30.267,
            lon = -97.743
        ),
        pitch = 0,
        zoom = 12
    ),
)

fig = dict(data = data, layout = layout)
py.iplot(fig, filename = 'stations-bubble-map-mapbox-legend')