# Tutorial: Interactive Data Viz and Time Series Analysis
## Created by Sarah Shy
#### March 12, 2018
#### 15-388 Practical Data Science, CMU

# 1. Load the Data

https://austinbcycle.com/how-it-works/faqs

First, we load the libraries and data into our environment.

Note that if you have not installed plotly, you can install it by running one of the following commands in your terminal:

\$ pip install plotly 
or 
\$ sudo pip install plotly 

To install cufflinks: ! pip install cufflinks --upgrade


In [8]:
#import IPython
#iframe = '<iframe width="900" height="800" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/44.embed"></iframe>'
#IPython.display.HTML(iframe)

In [None]:
#load packages
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
import numpy as np
import plotly.tools as tls

In [None]:
# Read the two Austin bike-share CSV files into DataFrames.
# The paths are relative, so both files must be in the working directory.
TRIPS_CSV = "austin_bikeshare_trips.csv"
STATIONS_CSV = "austin_bikeshare_stations.csv"

bike_trips = pd.read_csv(TRIPS_CSV)
bike_stations = pd.read_csv(STATIONS_CSV)

The _bike trips_ dataset contains the following variables:
-  bikeid: integer id of bike
-  checkout_time: HH:MM:SS, see start time for date stamp
-  duration_minutes: integer minutes of trip duration
-  end_station_id: integer id of end station
-  end_station_name: string of end station name
-  month: month, integer (1 = January)
-  start_station_id: integer id of start station
-  start_station_name: string of start station name
-  start_time: YYYY-MM-DD HH:MM:SS, string
-  subscriber_type: membership type
-  trip_id: unique trip id, int
-  year: year of trip, int


The _bike stations_ dataset contains the following variables:
-  latitude: geospatial latitude, precision to 5 places
-  location: (lat, lon)
-  longitude: geospatial longitude, precision to 5 places
-  name: station name, str
-  stations_id: unique station id, int
-  status: station status (active, closed, moved, ACL-only), ACL is a music festival

_Note: variable definitions were taken from the dataset's Kaggle description: https://www.kaggle.com/jboysen/austin-bike._

In [None]:
# Preview the first rows of the trips table. This is the only live statement
# in the cell, so its value is what the notebook displays.
bike_trips.head()
# Other quick checks, left disabled (uncomment one at a time to run it):
#bike_stations.head()
#list(bike_trips)                        # column names
#type(bike_trips['start_time'][0])       # str
#bike_trips.shape                        # 649231 by 12
#bike_stations.shape                     # 72 by 6

In [None]:
# Handle missing data.

# Count the rows that contain at least one missing value in each table.
# The counts are printed explicitly because a notebook cell only echoes its
# last expression, so bare len(...) lines in the middle of a cell show nothing.
trips_rows_with_na = int(bike_trips.isnull().any(axis=1).sum())
stations_rows_with_na = int(bike_stations.isnull().any(axis=1).sum())
print("bike_trips rows with missing data:", trips_rows_with_na)        # 67606 when this tutorial was written
print("bike_stations rows with missing data:", stations_rows_with_na)  # 0 when this tutorial was written

# Remove the incomplete rows from bike_trips. The trailing .copy() makes
# bike_trips_clean an independent frame, so the columns assigned to it later
# in the notebook do not trigger pandas' SettingWithCopyWarning (seen on
# older pandas versions when writing to the result of dropna()).
bike_trips_clean = bike_trips.dropna().copy()
#bike_trips_clean = bike_trips[bike_trips.year == 2017.0]
#bike_trips_clean.shape

# 2. Plotting Trip Duration

In [None]:
# Histogram of trip durations.
# Trips longer than 180 minutes (3 hours) are left out: the distribution has
# a long tail with few data points. Author's note: with a membership, a trip
# can last up to 1 hour before extra charges apply.

duration_data = bike_trips_clean.duration_minutes.loc[lambda minutes: minutes <= 180]
duration_data.iplot(kind='histogram', filename="duration-plot", bins=72)  # .iplot comes from cufflinks

# Alternatives tried by the author (disabled):
#print(len(duration_data))
#data = [go.Histogram(x = duration_data)]
#py.iplot(data, filename = 'trip-duration')
#duration_data.iplot(kind = 'histogram', filename = 'trip-duration')
#bike_trips.duration_minutes.iplot(kind = 'histogram',  filename='simple histogram')

In [None]:
# Ask plotly for the embed HTML of the hosted figure ~sarahshy/44
# (presumably the duration histogram plotted above; confirm against the account).
tls.get_embed("https://plot.ly/~sarahshy/44")

Most trips are short: the 5-9 minute bin is the most common, with over 132,000 trips. (TODO: compare to table.)

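But what if we are interested in how trip duration changes with the time of year? We can add a Plotly dropdown menu that switches between seasons (the code below uses the 'update' button method).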

# Adding a Dropdown for Season

In [None]:
# Build a DataFrame with one column of trip durations per season.
# Only trips of at most 10 minutes are kept. Each column keeps the row labels
# of its own season, so cells belonging to other seasons are NaN.

short_trip = bike_trips_clean.duration_minutes <= 10
season_masks = {
    'fall': bike_trips_clean.month.between(9, 11),
    'spring': bike_trips_clean.month.between(3, 5),
    'summer': bike_trips_clean.month.between(6, 8),
    'winter': bike_trips_clean.month.isin([12, 1, 2]),
}

# The column order is pinned explicitly. Whether pandas sorts dict keys or
# keeps insertion order depends on the pandas/Python version, and the
# dropdown built in the next cell assumes this alphabetical order.
df = pd.DataFrame(
    {
        season: bike_trips_clean.duration_minutes[in_season & short_trip]
        for season, in_season in season_masks.items()
    },
    columns=['fall', 'spring', 'summer', 'winter'],
)
len(df)

In [None]:
# Overlaid seasonal histograms with a dropdown that shows one season at a time.

# cufflinks draws one trace per DataFrame column, in column order. The
# visibility masks are therefore derived from df.columns rather than
# hard-coded, so each button toggles the right trace whatever the order is.
season_columns = list(df.columns)

season_buttons = [
    dict(label='All',
         method='update',
         args=[{'visible': [True] * len(season_columns)},
               {'title': "Duration of Trips by Season"}]),
] + [
    dict(label=season.capitalize(),
         method='update',
         args=[{'visible': [column == season for column in season_columns]},
               # Each title names its own season (Winter and Summer used to
               # show the Fall and Spring titles).
               {'title': 'Duration of Trips in ' + season.capitalize()}])
    for season in ('fall', 'winter', 'spring', 'summer')
]

# active = -1: no button is pre-selected when the figure first renders.
updatemenus = [dict(active=-1, buttons=season_buttons)]

layout = dict(title = "Duration of Trips", updatemenus = updatemenus, yaxis = dict(range = [0, 50000]), barmode='overlay', showlegend = False)


df.iplot(kind='histogram', barmode='overlay', filename = 'duration_by_season', layout=layout, bins = 72, shared_yaxes=True, theme = 'solar')

In [None]:
# Embed the hosted figure ~sarahshy/60 in the notebook
# (presumably the seasonal histogram plotted above; confirm against the account).
tls.embed("https://plot.ly/~sarahshy/60")

In [13]:
# Show the hosted figure ~sarahshy/60 through a raw HTML iframe.
# IPython.display is imported here because the notebook's only other
# `import IPython` is commented out; without it this cell raises NameError
# in a fresh kernel.
import IPython.display

iframe2 = '<iframe width="900" height="800" frameborder="0" scrolling="no" src="//plot.ly/~sarahshy/60.embed"></iframe>'
IPython.display.HTML(iframe2)

Fall and spring are the most popular times to use the bike share.

# 3. Plotting Number of Rides Over Time

In [None]:
# Add a calendar-date column derived from each trip's start time.

# Parse the 'YYYY-MM-DD HH:MM:SS' strings once and reuse the parsed values.
start_timestamps = pd.to_datetime(bike_trips_clean['start_time'])
bike_trips_clean['start_time'] = start_timestamps       # now datetime values instead of strings
bike_trips_clean['date'] = start_timestamps.dt.date     # date part only, time of day dropped

# Quick checks (disabled):
#print((pd.to_datetime(bike_trips_clean.start_time[0])))
#print(bike_trips_clean.start_time[0])

In [None]:
# Count trips per calendar day and keep the result as a one-column frame for plotting.

# count() gives the number of non-null values in every column for each date.
trip_count = bike_trips_clean.groupby('date').count()

# Use the start_time counts as the number of trips per day.
trips_time_df = trip_count[['start_time']].rename(columns={'start_time': 'trip_count'})
print(trips_time_df.head())

In [None]:
# Plot the daily trip counts with cufflinks' default chart type.
# `filename` is the name the figure is saved under on plotly.

trips_time_df.iplot(filename = 'trips_over_time')

# The Full Series Is Hard to Read: Add a Range Slider

In [None]:
# Layout for the daily-trips chart: quick-zoom buttons plus a range slider
# on the date axis.

# Buttons that zoom to the last 1 or 6 months, or back out to the full series.
zoom_buttons = [
    {'count': 1, 'label': '1m', 'step': 'month', 'stepmode': 'backward'},
    {'count': 6, 'label': '6m', 'step': 'month', 'stepmode': 'backward'},
    {'step': 'all'},
]

date_axis = {
    'rangeselector': {'buttons': zoom_buttons},
    'rangeslider': {},      # an empty dict turns the slider on with default settings
    'type': 'date',
}

layout = {'title': 'Time Series with Rangeslider', 'xaxis': date_axis}

trips_time_df.iplot(layout=layout, filename='trips_over_time_slider')