In [1]:
from __future__ import print_function
__author__='Shalmali'

%pylab inline
import pandas as pd
import os
import sys
import numpy as np
import geocoder        #geocoder provides with zipcode information
from scipy.stats import norm
from scipy import stats
from scipy.optimize import curve_fit, minimize
from numpy import random
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
import pylab as pl

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Read the Citi Bike trip records (the file name suggests July 2015) and
# preview the first three rows.
TRIP_DATA_CSV = "201507-citibike-tripdata.csv"
df = pd.read_csv(TRIP_DATA_CSV)
df.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,732,7/1/2015 00:00:03,7/1/2015 00:12:16,489,10 Ave & W 28 St,40.750664,-74.001768,368,Carmine St & 6 Ave,40.730386,-74.00215,18669,Subscriber,1970.0,1
1,322,7/1/2015 00:00:06,7/1/2015 00:05:29,304,Broadway & Battery Pl,40.704633,-74.013617,3002,South End Ave & Liberty St,40.711512,-74.015756,14618,Subscriber,1984.0,1
2,790,7/1/2015 00:00:17,7/1/2015 00:13:28,447,8 Ave & W 52 St,40.763707,-73.985162,358,Christopher St & Greenwich St,40.732916,-74.007114,18801,Subscriber,1992.0,1


In [5]:
# Build a station lookup table (one row per start station) holding the
# coordinates that are reverse-geocoded to a zipcode further down.
# The station id is kept in a column named 'index' (a later cell renames it)
# so the table can be merged back into the trip data.
#
# drop_duplicates keeps the first coordinates seen for each station, in order
# of first appearance. The previous per-station loop re-filtered the whole trip
# table twice per station, grew the frame one cell at a time, shadowed the
# builtin `id`, and called float() on the array of unique values, which fails
# as soon as a station id is recorded with more than one latitude/longitude.
df_cordinate = (
    df[['start station id', 'start station latitude', 'start station longitude']]
    .drop_duplicates(subset='start station id')
    .rename(columns={'start station id': 'index',
                     'start station latitude': 'latitude',
                     'start station longitude': 'longitude'})
    .reset_index(drop=True)
)
df_cordinate.head(3)

Unnamed: 0,index,latitude,longitude
0,489,40.750664,-74.001768
1,304,40.704633,-74.013617
2,447,40.763707,-73.985162


In [13]:
# Reverse-geocode each station's (latitude, longitude) pair with geocoder and
# store the postal code it reports in the 'Zipcode' column of that row.
# Row labels are assumed to be 0..n-1, as produced by the reset_index above.
station_coords = zip(df_cordinate['latitude'], df_cordinate['longitude'])
for row_label, (lat, lon) in enumerate(station_coords):
    lookup = geocoder.google([lat, lon], method='reverse')
    df_cordinate.loc[row_label, 'Zipcode'] = lookup.postal

In [14]:
# Preview the first three stations to check the Zipcode column added above.
df_cordinate.head(3)

Unnamed: 0,index,latitude,longitude,Zipcode
0,489,40.750664,-74.001768,10001
1,304,40.704633,-74.013617,10004
2,447,40.763707,-73.985162,10019


In [15]:
# Give the station-id column the same name it has in the trip data so that the
# two frames share a merge key.
station_id_name = {"index": "start station id"}
df_cordinate = df_cordinate.rename(columns=station_id_name)
df_cordinate.head(3)

Unnamed: 0,start station id,latitude,longitude,Zipcode
0,489,40.750664,-74.001768,10001
1,304,40.704633,-74.013617,10004
2,447,40.763707,-73.985162,10019


In [16]:
# Attach each station's latitude, longitude and Zipcode to every trip that
# starts at that station (inner join on the station id).
df = df.merge(df_cordinate, how='inner', on='start station id')

In [18]:
# Rows without a birth year cannot be given an age, so remove them.
df.dropna(subset=['birth year'], inplace=True)
# Derive the age for Subscriber rows only; the assignment aligns on the index,
# so every other row is left with a missing (NaN) age.
# NOTE(review): the data file name points at July 2015 while ages are computed
# relative to 2016 - confirm this reference year is intended.
AGE_REFERENCE_YEAR = 2016
is_subscriber = df['usertype'] == 'Subscriber'
df['age'] = AGE_REFERENCE_YEAR - df.loc[is_subscriber, 'birth year']
df.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,latitude,longitude,Zipcode,age
0,732,7/1/2015 00:00:03,7/1/2015 00:12:16,489,10 Ave & W 28 St,40.750664,-74.001768,368,Carmine St & 6 Ave,40.730386,-74.00215,18669,Subscriber,1970.0,1,40.750664,-74.001768,10001,46.0
1,245,7/1/2015 06:22:12,7/1/2015 06:26:18,489,10 Ave & W 28 St,40.750664,-74.001768,434,9 Ave & W 18 St,40.743174,-74.003664,21242,Subscriber,1984.0,1,40.750664,-74.001768,10001,32.0
2,481,7/1/2015 06:53:35,7/1/2015 07:01:36,489,10 Ave & W 28 St,40.750664,-74.001768,491,E 24 St & Park Ave S,40.740964,-73.986022,20811,Subscriber,1983.0,1,40.750664,-74.001768,10001,33.0


In [19]:
# Remove the trip and station columns that the age-by-zipcode analysis below
# does not use; birth year, gender, latitude, longitude, Zipcode and age remain.
unneeded_columns = [
    'tripduration',
    'starttime',
    'stoptime',
    'start station id',
    'start station name',
    'start station latitude',
    'start station longitude',
    'end station id',
    'end station name',
    'end station latitude',
    'end station longitude',
    'bikeid',
    'usertype',
]
df.drop(columns=unneeded_columns, inplace=True)
df.head(3)

Unnamed: 0,birth year,gender,latitude,longitude,Zipcode,age
0,1970.0,1,40.750664,-74.001768,10001,46.0
1,1984.0,1,40.750664,-74.001768,10001,32.0
2,1983.0,1,40.750664,-74.001768,10001,33.0


In [20]:
# Keep only rows whose age is strictly below 80; older riders are treated as
# outliers or possible data-entry issues. Rows with a missing (NaN) age fail
# the comparison and are therefore dropped as well.
MAX_AGE = 80
under_max_age = df['age'] < MAX_AGE
df = df[under_max_age]

In [21]:
# Zipcodes grouped by borough, one list per borough. The values were taken
# from "http://www.citidex.com/map/zipco.html". Manhattan is split into an
# upper and a lower list.

brooklyn_zips = [
    11212, 11213, 11216, 11233, 11238, 11209, 11214, 11228, 11204, 11218, 11219, 11230, 11234, 11236, 11239,
    11223, 11224, 11229, 11235, 11201, 11205, 11215, 11217, 11231, 11203, 11210, 11225, 11226, 11207, 11208,
    11211, 11222, 11220, 11232, 11206, 11221, 11237,
]

bronx_zips = [
    10453, 10457, 10460, 10458, 10467, 10468, 10451, 10452, 10456, 10454, 10455, 10459, 10474, 10463, 10471,
    10466, 10469, 10470, 10475, 10461, 10462, 10464, 10465, 10472, 10473,
]

queens_zips = [
    11361, 11362, 11363, 11364, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11365, 11366, 11367, 11412,
    11423, 11432, 11433, 11434, 11435, 11436, 11101, 11102, 11103, 11104, 11105, 11106, 11374, 11375, 11379,
    11385, 11691, 11692, 11693, 11694, 11695, 11697, 11004, 11005, 11411, 11413, 11422, 11426, 11427, 11428,
    11429, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11368, 11369, 11370, 11372, 11373, 11377,
    11378,
]

staten_zips = [
    10302, 10303, 10310, 10306, 10307, 10308, 10309, 10312, 10301, 10304, 10305, 10314,
]

upper_man_zips = [
    10026, 10027, 10030, 10037, 10039, 10029, 10035, 10031, 10032, 10033, 10034, 10040,
    10021, 10028, 10044, 10065, 10075, 10128, 10023, 10024, 10025,
]

lower_man_zips = [
    10001, 10011, 10018, 10019, 10020, 10036, 10010, 10016, 10017, 10022, 10012, 10013, 10014,
    10004, 10005, 10006, 10007, 10038, 10280, 10002, 10003, 10009,
]

In [None]:
# Collect the rider ages recorded for each borough, one list per borough.
#
# Each borough is selected with a single vectorised `isin` mask. The earlier
# version wrapped the per-zipcode scans in `for i in range(len(df))`, which
# repeated identical work len(df) times: every age was appended len(df) times
# and the run time grew quadratically with the number of trips. It also
# shadowed the builtin `zip`.


def ages_in_zipcodes(trips, zipcodes):
    """Return the ages of all rows of `trips` whose Zipcode is in `zipcodes`.

    trips    : DataFrame with 'Zipcode' and 'age' columns.
    zipcodes : iterable of integer zipcodes.

    The Zipcode column comes from the reverse-geocoding step and may be stored
    as text, whereas the borough lists hold integers; pd.to_numeric makes the
    comparison work in both cases and turns missing or non-numeric zipcodes
    into NaN, which matches no borough.

    Ages are returned as a plain list in the row order of `trips` (the order
    does not matter for the histograms drawn from these lists).
    """
    zipcode_numeric = pd.to_numeric(trips['Zipcode'], errors='coerce')
    return trips.loc[zipcode_numeric.isin(list(zipcodes)), 'age'].tolist()


brooklyn_ages = ages_in_zipcodes(df, brooklyn_zips)
bronx_ages = ages_in_zipcodes(df, bronx_zips)
upper_man_ages = ages_in_zipcodes(df, upper_man_zips)
lower_man_ages = ages_in_zipcodes(df, lower_man_zips)
queens_ages = ages_in_zipcodes(df, queens_zips)
staten_ages = ages_in_zipcodes(df, staten_zips)

In [None]:
# Show only the first few Brooklyn ages; the full list can hold one entry per
# trip and would flood the notebook output.
brooklyn_ages[:10]

In [23]:
def plot_age_histogram(ages, title):
    """Draw a histogram of rider ages using one-year-wide bins.

    ages  : non-empty sequence of numeric ages (no NaN values).
    title : title shown above the figure.

    The bin edges run from min(ages) up to and including max(ages) + 1, so the
    oldest age gets a bin of its own. Edges built with
    np.arange(min, max, 1) stop at max - 1, which leaves riders of the maximum
    age outside the last edge (they are not counted) and produces no edges at
    all when every rider has the same age.
    """
    bins = np.arange(min(ages), max(ages) + 2, 1)
    plt.hist(ages, bins, label='Count')
    plt.title(title)
    plt.xlabel('Age')
    plt.ylabel('Trip Frequency')
    plt.legend()
    plt.show()


# Plot the Brooklyn rider histogram only if there are riders in Brooklyn.
if len(brooklyn_ages) >= 1:
    plot_age_histogram(brooklyn_ages, 'Citibike Riders Age Trip Frequency in Brooklyn')

Next steps: plot the age distribution by borough, test the goodness of fit, and test the hypothesis using the KS and AD tests.
I was unable to complete this due to time constraints. Once I complete it, I would like to have feedback on it.