In [1]:
import folium
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import seaborn

Importing trip data for March 2016 and Weather data from Weather Underground

In [2]:
# Load the Citi Bike trip records (file 201603) and the weather table from CSV.
bikedata = pd.read_csv('../../Data/CitiBike_Data/201603-citibike-tripdata.csv')

# Drop the 'Unnamed: 0' column as part of the load expression, so `weather`
# is bound once and never mutated in place.
weather = pd.read_csv('./data/temperature/weather.csv').drop('Unnamed: 0', axis=1)

In [3]:
# Data manipulation
# Parse the raw start/stop timestamp strings into datetime columns.
bikedata['dtstarttime'] = pd.to_datetime(bikedata['starttime'], format="%m/%d/%Y %H:%M:%S")
bikedata['dtstoptime'] = pd.to_datetime(bikedata['stoptime'], format="%m/%d/%Y %H:%M:%S")

# Give both frames a 'date' key of the same type (datetime.date) so that they
# can be joined on it.
bikedata['date'] = bikedata['dtstarttime'].dt.date
weather['date'] = pd.to_datetime(weather['date'], format="%Y-%m-%d").dt.date

# Hour buckets: the start/stop timestamps with minutes and seconds zeroed
# (e.g. 2016-03-01 06:00:00), so departures and arrivals can be aggregated
# per hour to study bike usage.
bikedata['dtstartdatehour'] = bikedata['dtstarttime'].apply(
    lambda stamp: stamp.replace(minute=0, second=0))
bikedata['dtstopdatehour'] = bikedata['dtstoptime'].apply(
    lambda stamp: stamp.replace(minute=0, second=0))

In [4]:
# Display the column labels of bikedata as a quick sanity check of the frame's layout.
bikedata.columns

Index([u'tripduration', u'starttime', u'stoptime', u'start station id',
       u'start station name', u'start station latitude',
       u'start station longitude', u'end station id', u'end station name',
       u'end station latitude', u'end station longitude', u'bikeid',
       u'usertype', u'birth year', u'gender', u'dtstarttime', u'dtstoptime',
       u'date', u'dtstartdatehour', u'dtstopdatehour'],
      dtype='object')

In [5]:
# TODO: Aggregate bikedata on dtstartdatehour and dtstopdatehour
# Which Start and End stations are the busiest?
## At what time are they busy?
# 

In [6]:
# Merging weather and bike data
# Left join on 'date': every trip row is kept and gains the weather columns
# for its start date (missing values where no weather row matches).
df = pd.merge(bikedata, weather, on='date', how='left')

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only `print df.head()` statement.
print(df.head())

   tripduration          starttime           stoptime  start station id  \
0          1491  3/1/2016 06:52:42  3/1/2016 07:17:33                72   
1          1044  3/1/2016 07:05:50  3/1/2016 07:23:15                72   
2           714  3/1/2016 07:15:05  3/1/2016 07:26:59                72   
3           329  3/1/2016 07:26:04  3/1/2016 07:31:34                72   
4          1871  3/1/2016 07:31:30  3/1/2016 08:02:41                72   

  start station name  start station latitude  start station longitude  \
0   W 52 St & 11 Ave               40.767272               -73.993929   
1   W 52 St & 11 Ave               40.767272               -73.993929   
2   W 52 St & 11 Ave               40.767272               -73.993929   
3   W 52 St & 11 Ave               40.767272               -73.993929   
4   W 52 St & 11 Ave               40.767272               -73.993929   

   end station id          end station name  end station latitude     ...      \
0             427       Bus S

## Initial Data Analysis

#### How many trips start and end at the same station?

We can see that only about 1.68% of trips start and end at the same station

In [7]:
# Percentage of trips whose start and end station are the same.
# Summing the boolean mask counts the matching rows; float() keeps the
# division a true division under Python 2 as well.
float((df['start station id'] == df['end station id']).sum()) * 100 / len(df)

1.6760134837665408

#### Which Start and End stations are the busiest?

In [8]:
# Trips per start station, busiest first; show the top five.
(
    df.groupby('start station id')['bikeid']
      .count()
      .sort_values(ascending=False)
      .head(5)
)

start station id
519    11634
293     7721
435     7493
497     6888
426     6006
Name: bikeid, dtype: int64

In [9]:
# Hour of day (0-23) in which each trip started and ended.
# The start/stop strings were already parsed into the datetime columns
# `dtstarttime` / `dtstoptime`, so read the hour through the .dt accessor
# instead of re-splitting the raw 'starttime' / 'stoptime' strings.
startHour = df['dtstarttime'].dt.hour
endHour = df['dtstoptime'].dt.hour

# Alternative windows considered earlier but not used below:
#   trips ending at or before the 9 am hour  -> df[endHour <= 9]
#   trips starting at or after 4 pm          -> df[startHour >= 16]

In [10]:
def _tripsPerStation(trips, stationColumn):
    """Return the number of trips per station in `stationColumn`, busiest first."""
    perStation = trips.groupby(by=[stationColumn])['bikeid'].count()
    return perStation.sort_values(ascending=False)


# Split trips by the hour in which they ended.
# NOTE(review): `endHour <= 10` also keeps trips ending 10:00-10:59, while a
# later comment speaks of "before 10am" - confirm which is intended.
dfMorning = df[endHour <= 10]
dfEvening = df[endHour >= 16]

# Per-station trip counts for each window, by start station and by end station.
dfMorningStart = _tripsPerStation(dfMorning, 'start station id')
dfMorningEnd = _tripsPerStation(dfMorning, 'end station id')

dfEveningStart = _tripsPerStation(dfEvening, 'start station id')
dfEveningEnd = _tripsPerStation(dfEvening, 'end station id')

In [11]:
# Minimum number of trips a station needs within a window to count as "busy".
# Despite its name this is a trip-count threshold, not a rank cut-off.
topN = 750

# Busy start and end stations in the morning window (station id -> trip count).
dfTopMorningStart = dfMorningStart[dfMorningStart > topN]
dfTopMorningEnd = dfMorningEnd[dfMorningEnd > topN]

# Busy start and end stations in the evening window (station id -> trip count).
dfTopEveningStart = dfEveningStart[dfEveningStart > topN]
dfTopEveningEnd = dfEveningEnd[dfEveningEnd > topN]

# Sources are stations that trips START from, so they are matched on
# 'start station id'; sinks are stations that trips END at and are matched on
# 'end station id'. Matching sources on 'end station id' would select the
# wrong trips, and therefore the wrong start-station coordinates later on.
netMorningSource = df[df['start station id'].isin(dfTopMorningStart.index.values)]
netMorningSink = df[df['end station id'].isin(dfTopMorningEnd.index.values)]

netEveningSource = df[df['start station id'].isin(dfTopEveningStart.index.values)]
netEveningSink = df[df['end station id'].isin(dfTopEveningEnd.index.values)]



In [12]:
# Plotting just the morning traffic
# We can see that Net Sinks (red) are concentrated in the middle of NY, where most of the offices are
netMorningSourcePoints = netMorningSource[['start station latitude', 'start station longitude']]
netMorningSinkPoints = netMorningSink[['end station latitude', 'end station longitude']]

# One marker per distinct (latitude, longitude) pair.
netMorningSourcePointsToPlot = netMorningSourcePoints.drop_duplicates()
netMorningSinkPointsToPlot = netMorningSinkPoints.drop_duplicates()

nyMapMorning = folium.Map(location=[40.7831, -73.9712], zoom_start=12)

# Unpack latitude/longitude straight from the value rows (the column order is
# fixed by the selections above). This avoids Series.get(0) / .get(1), which
# depends on integer keys falling back to positions on a label-indexed Series.
for lat, lon in netMorningSourcePointsToPlot.values:
    folium.CircleMarker([lat, lon], radius=20, color='green').add_to(nyMapMorning)

for lat, lon in netMorningSinkPointsToPlot.values:
    folium.CircleMarker([lat, lon], radius=20, color='red').add_to(nyMapMorning)

nyMapMorning

In [13]:
# Plotting just the evening traffic
# Sources (trip start stations) are drawn in green, sinks (trip end stations) in red.
netEveningSourcePoints = netEveningSource[['start station latitude', 'start station longitude']]
netEveningSinkPoints = netEveningSink[['end station latitude', 'end station longitude']]

# One marker per distinct (latitude, longitude) pair.
netEveningSourcePointsToPlot = netEveningSourcePoints.drop_duplicates()
netEveningSinkPointsToPlot = netEveningSinkPoints.drop_duplicates()

nyMapEvening = folium.Map(location=[40.7831, -73.9712], zoom_start=12)

# Unpack latitude/longitude straight from the value rows (the column order is
# fixed by the selections above). This avoids Series.get(0) / .get(1), which
# depends on integer keys falling back to positions on a label-indexed Series.
for lat, lon in netEveningSourcePointsToPlot.values:
    folium.CircleMarker([lat, lon], radius=20, color='green').add_to(nyMapEvening)

for lat, lon in netEveningSinkPointsToPlot.values:
    folium.CircleMarker([lat, lon], radius=20, color='red').add_to(nyMapEvening)

nyMapEvening