In [None]:
#So I can see what's in the directories
import os
#If I want to know how long it takes to complete a block of code
import time
#To move my mapbox token function from the input to the working directory
from shutil import copyfile
#For cool map visuals with mapbox
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
#Because I like it when pandas works
import numpy as np 
#Well, it's Kaggle and I'm using Python so yeah, pandas.
import pandas as pd 
#Easy count plots heatmaps etc.
import seaborn as sns
#Never hurts to have this one and I'm using seaborn
import matplotlib.pyplot as plt
%matplotlib inline
#Stage the python file holding my token function in the working directory so it can be imported
token_module_src = "../input/mboxload/mapbox.py"
token_module_dst = "../working/private_mapbox_access_token.py"
copyfile(token_module_src, token_module_dst)
#Pull in the function I titled "mapboxtoken" from the copied file
from private_mapbox_access_token import mapboxtoken
#Call it once and keep the returned token string in a variable called token
token = mapboxtoken()
#Make sure all of my files are where they should be
!ls '../input'
!ls '../working'

Time the code. Take all four quarterly files, parse the dates to datetime, and combine them into one dataframe.

In [None]:
#Time the whole load so the cost of this cell is visible
start_time = time.time()
#One CSV per quarter of 2018 (Q1..Q4)
quarter_paths = ['../input/divvy-bike-chicago-2018/Divvy_Trips_2018_Q{}.csv'.format(quarter)
                 for quarter in range(1, 5)]
#Parse both timestamp columns to datetime while reading
frames = [pd.read_csv(path, parse_dates=['start_time', 'end_time']) for path in quarter_paths]
#ignore_index=True gives the combined frame one unique 0..n-1 index; without it each
#quarterly file restarts at 0 and the same row label shows up once per file
df = pd.concat(frames, ignore_index=True)
print("--- {} seconds ---".format(time.time() - start_time))

Let's take a sample of the data and see what we're working with.

In [None]:
#Three random trip rows, just to see the columns
df.sample(n=3)

HEY! What the heck? There aren't any latitude and longitude data points here.

So it looks like the City of Chicago has the Divvy Bike Station Data [here](https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations/bbyy-e7gq). I downloaded it to my local machine and then added it using the "+Add Data" button on the right. Now I'll load it into a dataframe titled "stations."

In [None]:
#Station file I downloaded from the City of Chicago site and added as a dataset
stations_csv = '../input/bikestations/Divvy_Bicycle_Stations.csv'
stations = pd.read_csv(stations_csv)

Take a quick look at some rows in the stations dataframe.

In [None]:
#Three random station rows
stations.sample(n=3)

It looks like stations['ID'] should match up with the df['from_station_id'] and df['to_station_id'] columns. Let's check.

In [None]:
#First trip that started at station ID 451, to compare its name against the stations dataframe
starts_at_451 = df['from_station_id'] == 451
df.loc[starts_at_451].head(1)

Both dataframes show ID 451 as being "Sheridan Rd & Loyola Ave." So it looks like they match.
Now let's check whether both sets of data contain the same number of IDs.

In [None]:
#Distinct station IDs on each side (trips are counted by their starting station)
station_id_count = stations['ID'].nunique()
trip_id_count = df['from_station_id'].nunique()
print('There are {} ID numbers in the stations dataframe'
      '\nand {} in the df dataframe'.format(station_id_count, trip_id_count))

Since there are not as many IDs in the Divvy dataframe as in the stations dataframe I grabbed from the City of Chicago website, I'm going to do an inner merge. This means that a trip will only be retained if its starting station ID exists in both dataframes.

In [None]:
#Attach the station columns to every trip via its starting station; inner join keeps only IDs found on both sides
combined = df.merge(stations, how='inner', left_on='from_station_id', right_on='ID')

Now we will make a quick pretty heatmap to check for missing values.

In [None]:
#One cell per value of combined, coloured by whether it is null
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(combined.isnull(), cmap='viridis', yticklabels=False, ax=ax)

Looks like birthyear and gender are the only columns with missing values. Let's take a closer look at that.

In [None]:
#Eight random rows where gender or birthyear is missing
gender_missing = combined['gender'].isnull()
birthyear_missing = combined['birthyear'].isnull()
combined[gender_missing | birthyear_missing].sample(8)

After looking at the rows where gender or birthyear is null, it looks like they are both null in the same rows. Looking closer, this appears to happen when the usertype is "Customer." So for subscribers this data is available, but if someone just pays with a credit card it is not. Now let's see the ratio of subscribers to customers.

In [None]:
#Number of trips per user type
sns.countplot(data=combined, x='usertype')

In [None]:
#Row counts per user type and overall
tot = len(combined)
cust = int((combined['usertype'] == 'Customer').sum())
subs = int((combined['usertype'] == 'Subscriber').sum())
#Share of each user type as a percentage, rounded to two decimals
cust_pct = round((cust/tot)*100, 2)
subs_pct = round((subs/tot)*100, 2)
print('Users of Divvy in 2018 were {}% Customers and {}% Subscribers'.format(cust_pct, subs_pct))

If we want we can break it down further by gender.

In [None]:
#Same count plot, with each user type split into one bar per gender
sns.countplot(data=combined, x='usertype', hue='gender')

Let's make a new column with just the date.

In [None]:
#Date part of each trip's start timestamp (time of day dropped)
start_dates = combined['start_time'].dt.date
combined['start_date'] = start_dates

Now let's make a time series. First we need to make a new dataframe with the date as the index.

In [None]:
#Trips per calendar day: index is start_date, single column trip_id holds the count
datecount = combined.groupby('start_date')['trip_id'].count().to_frame()

In [None]:
#Raw daily ride counts over the year
fig, ax = plt.subplots(figsize=(20,12))
ax.plot(datecount)
ax.set_xlabel("Date")
ax.set_ylabel("Rides")

Well, there's a lot of volatility there. It could be because of weekends vs. weekdays. It could be because of bad weather. Either way, let's smooth it out a bit with a seven-day rolling average.

In [None]:
#Seven-row rolling mean of the daily counts; min_periods=1 keeps the first days instead of leaving them NaN
rolling_rides = datecount.rolling(7, min_periods=1).mean()
fig, ax = plt.subplots(figsize=(20,12))
ax.plot(rolling_rides, color='orange')
ax.set_xlabel("Date")
ax.set_ylabel("Rides")

We could do the same thing and break it down by gender.

In [None]:
fig, ax = plt.subplots(figsize=(15,7))
#Pick trip_id *before* counting so only that one column is aggregated; counting every
#column of the merged frame and then keeping one gives the same numbers but wastes work.
#unstack() turns the gender level into one column per gender.
daily_by_gender = combined.groupby(['start_date','gender'])['trip_id'].count().unstack()
#Seven-row rolling mean per gender, drawn on the axes created above
daily_by_gender.rolling(7, min_periods=1).mean().plot(ax=ax)

Or user type etc.

In [None]:
fig, ax = plt.subplots(figsize=(15,7))
#Pick trip_id *before* counting so only that one column is aggregated; counting every
#column of the merged frame and then keeping one gives the same numbers but wastes work.
#unstack() turns the usertype level into one column per user type.
daily_by_usertype = combined.groupby(['start_date','usertype'])['trip_id'].count().unstack()
#Seven-row rolling mean per user type, drawn on the axes created above
daily_by_usertype.rolling(7, min_periods=1).mean().plot(ax=ax)

Make a new column called day of week and populate it with the day of the week from the 'start_time' column.

In [None]:
#Name of the weekday (e.g. 'Monday') on which each trip started
weekday_names = combined['start_time'].dt.day_name()
combined['dayofweek'] = weekday_names

Get the counts for each day of the week.

In [None]:
#Trips per weekday: index is dayofweek, single column trip_id holds the count.
#trip_id is selected before counting so the other columns are not aggregated needlessly.
daycount = combined.groupby('dayofweek')['trip_id'].count().to_frame()

In [None]:
#Move dayofweek out of the index into a regular column
daycount = daycount.reset_index()

In [None]:
#Relabel the two columns: the weekday name and its trip count
daycount_labels = ['dayofweek', 'trips']
daycount.columns = daycount_labels

Make our own sort order so the days are in order.

In [None]:
#Calendar order for the weekdays, starting the week on Sunday
order = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
#Index the counts by weekday name, then reorder the rows to follow the list above
trips_by_day = daycount.groupby(['dayofweek']).sum()
daycount = trips_by_day.reindex(order)

In [None]:
#Bring dayofweek back out of the index so it can be used as a plain column
daycount = daycount.reset_index()

In [None]:
#One bar per weekday, bar height = number of trips
trips_bar = go.Bar(x=daycount['dayofweek'], y=daycount['trips'])
data = [trips_bar]
layout = go.Layout(title="Sum of all trips for each day of week")

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Now let's find the most popular stations throughout the year. For this we are going to need to group by the latitude and longitude because more than one station can be at the same address. Also, we need to sort it before we reset the index so the dataframe is ordered largest count to smallest.

In [None]:
#Trips per starting station (keyed by coordinates + name), largest count first.
#trip_id is selected before counting so the other columns are not aggregated needlessly;
#the sort happens before reset_index so the new 0..n-1 index follows the ranking.
station_keys = ['Latitude', 'Longitude', 'from_station_name']
freq = (combined.groupby(station_keys)['trip_id']
        .count()
        .sort_values(ascending=False)
        .reset_index())

In [None]:
#Shorter column labels: coordinates, station name, trip count
freq_labels = ['lat', 'lon', 'station', 'count']
freq.columns = freq_labels

Make a new dataframe called top50 containing just the 50 highest counts. 0-49

In [None]:
#Rows whose index label is 0-49, i.e. the 50 highest counts in the sorted freq dataframe
in_top50 = freq.index < 50
top50 = freq.loc[in_top50]

In [None]:
#Display the whole top50 dataframe (50 rows) to check it before mapping
top50

Make a plotly Scattermapbox using the latitude and longitude. Passed in "token" which I imported earlier from the python file I added to the dataset and moved to the working directory.

In [None]:
#Hover label for each marker: station name plus its 2018 trip count. Plotly only starts a new
#line on <br>; a plain newline character is not rendered as a line break, so the two parts
#used to run together on a single line.
hover_text = ['Addr: {} <br> Uses in 2018:{}'.format(station, uses)
              for station, uses in zip(top50['station'], top50['count'])]
data = [go.Scattermapbox(
            lat= top50['lat'] ,
            lon= top50['lon'],
            mode='markers',
            text = hover_text,
            #Show only the label built above (no raw lat/lon in the hover box)
            hoverinfo = 'text',
            marker=dict(
                #Why divided by 3000 here? Because if I didn't the red would fill up the screen and I just played
                #around with this number until it looked pretty. 
                size= top50['count']/3000,
                color = 'red',
                opacity = .8,
            ),
          )]
layout = go.Layout(autosize=False,
                   #token is the mapbox access token loaded in the first cell
                   mapbox= dict(accesstoken=token,
                                bearing=10,
                                pitch=60,
                                zoom=12,
                                center= dict(
                                         lat=41.895278,
                                         lon=-87.636820),
                                #style= "mapbox://styles/shaz13/cjiog1iqa1vkd2soeu5eocy4i"
                               ),
                    width=900,
                    height=600, title = "Top 50 used stations in 2018")
fig = dict(data=data, layout=layout)
iplot(fig)

Just for fun we can use all of the latitude and longitude data from our freq dataframe to plot all of the Divvy Stations.

In [None]:
#One small marker per row of freq (every station that appears as a trip start)
data = [go.Scattermapbox(
            lat= freq['lat'] ,
            lon= freq['lon'],
            customdata = freq['station'],
            #customdata alone is never displayed, so also pass the station name as hover text;
            #this matches the hover behaviour of the top-50 map
            text = freq['station'],
            hoverinfo = 'text',
            mode='markers',
            marker=dict(
                size= 4,
                color = 'red',
                opacity = .8,
            ),
          )]
layout = go.Layout(autosize=False,
                   #token is the mapbox access token loaded in the first cell
                   mapbox= dict(accesstoken=token,
                                bearing=10,
                                pitch=60,
                                zoom=10,
                                center= dict(
                                         lat=41.881832,
                                         lon=-87.623177),
                                #style= "mapbox://styles/shaz13/cjiog1iqa1vkd2soeu5eocy4i"
                               ),
                    width=900,
                    height=600, title = "Divvy Racks in 2018")
fig = dict(data=data, layout=layout)
iplot(fig)

That's it for now. Happy learning all!
HUGE shout out to @paultimothymooney. If I hadn't come across his [kernel](https://www.kaggle.com/paultimothymooney/map-of-chicago-bike-rack-locations), I would not have started exploring plotly with mapbox.