- Import dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

- Load in Citi Bike trip data from May 2018

In [2]:
# Raw trip export for May 2018; the path is relative to the notebook's working directory.
raw_trips_path = '201805-citibike-tripdata.csv'
df = pd.read_csv(raw_trips_path)

# Peek at the first few rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,name_localizedValue0,usertype,birth year,gender
0,303,2018-05-01 00:00:11,2018-05-01 00:05:15,491,E 24 St & Park Ave S,40.740964,-73.986022,2003,1 Ave & E 18 St,40.733812,-73.980544,32191,$25 Off Annual Membership,Subscriber,1984.0,1
1,700,2018-05-01 00:00:36,2018-05-01 00:12:16,507,E 25 St & 2 Ave,40.739126,-73.979738,3458,W 55 St & 6 Ave,40.763094,-73.97835,29357,Annual Membership,Subscriber,1996.0,1
2,443,2018-05-01 00:00:40,2018-05-01 00:08:04,3263,Cooper Square & Astor Pl,40.729515,-73.990753,546,E 30 St & Park Ave S,40.744449,-73.983035,27255,Annual Membership,Subscriber,1969.0,1
3,297,2018-05-01 00:00:43,2018-05-01 00:05:41,532,S 5 Pl & S 5 St,40.710451,-73.960876,3096,Union Ave & N 12 St,40.71924,-73.95242,16980,Annual Membership,Subscriber,1985.0,1
4,421,2018-05-01 00:01:01,2018-05-01 00:08:02,3493,E 118 St & 3 Ave,40.799139,-73.938915,3351,E 102 St & 1 Ave,40.786995,-73.941648,30645,Annual Membership,Subscriber,1986.0,1


- Keep only subscriber information
- Remove entries with unspecified gender (gender code 0)
- Later in the project, we found 10 rides logged in downtown Montreal, out of ~1.5 million entries. Cut these out with a simple latitude filter: keep only rides whose start and end stations are both below 41.5° latitude.
- Drop NAs

In [8]:
# Build one boolean mask per cleaning rule so each rule is named and readable.
is_subscriber = df['usertype'] == "Subscriber"
has_known_gender = df['gender'] != 0
# Both endpoints must sit below 41.5 degrees latitude; this removes the handful
# of rides that were logged in Montreal.
within_latitude = (
    (df['start station latitude'] < 41.5)
    & (df['end station latitude'] < 41.5)
)

# Apply all rules at once, then discard any row that still has a missing value.
df = df.loc[is_subscriber & has_known_gender & within_latitude].dropna()
df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,name_localizedValue0,usertype,birth year,gender
0,303,2018-05-01 00:00:11,2018-05-01 00:05:15,491,E 24 St & Park Ave S,40.740964,-73.986022,2003,1 Ave & E 18 St,40.733812,-73.980544,32191,$25 Off Annual Membership,Subscriber,1984.0,1
1,700,2018-05-01 00:00:36,2018-05-01 00:12:16,507,E 25 St & 2 Ave,40.739126,-73.979738,3458,W 55 St & 6 Ave,40.763094,-73.97835,29357,Annual Membership,Subscriber,1996.0,1
2,443,2018-05-01 00:00:40,2018-05-01 00:08:04,3263,Cooper Square & Astor Pl,40.729515,-73.990753,546,E 30 St & Park Ave S,40.744449,-73.983035,27255,Annual Membership,Subscriber,1969.0,1
3,297,2018-05-01 00:00:43,2018-05-01 00:05:41,532,S 5 Pl & S 5 St,40.710451,-73.960876,3096,Union Ave & N 12 St,40.71924,-73.95242,16980,Annual Membership,Subscriber,1985.0,1
4,421,2018-05-01 00:01:01,2018-05-01 00:08:02,3493,E 118 St & 3 Ave,40.799139,-73.938915,3351,E 102 St & 1 Ave,40.786995,-73.941648,30645,Annual Membership,Subscriber,1986.0,1


- Parse start time to extract hour of day (on 24-hour scale)
- Get the day of the week to make a weekend dummy variable (1 = Saturday or Sunday, 0 = weekday)

In [9]:
from datetime import *
import time

# NOTE(review): the two imports above are no longer needed by this cell. They
# are left untouched because they bind notebook-global names that another cell
# might rely on; confirm and then move or remove them.

# Parse every start time in a single vectorized call instead of looping over
# the rows in Python with strptime. The explicit format keeps parsing strict:
# a value that does not match raises instead of being silently guessed.
start_times = pd.to_datetime(df['starttime'], format='%Y-%m-%d %H:%M:%S')

# Hour of day on a 24-hour scale (0-23), one entry per ride, in df's row order.
hour = start_times.dt.hour.tolist()

# Weekend dummy: 1 when the ride started on Saturday or Sunday (day-of-week
# 5 or 6, with Monday as 0), otherwise 0. Kept as a plain list of ints so the
# cells that consume `weekend` see the same type as before.
weekend = start_times.dt.dayofweek.isin([5, 6]).astype('int64').tolist()
  

- Compile relevant data into a new, clean dataframe

In [10]:
# Derived columns, computed up front so the frame definition below stays flat.
# Gender code shifted down by one (rows with code 0 were filtered out earlier).
gender_shifted = df['gender'] - 1
# Rider age in years, relative to 2018.
rider_age = 2018 - df['birth year']

# `weekend` and `hour` are plain lists built from df['starttime'], so they line
# up with df's rows by position; the Series columns supply the index.
clean_df = pd.DataFrame({
    "duration": df['tripduration'],
    "weekend": weekend,
    "hour": hour,
    "start_lat": df['start station latitude'],
    "start_long": df['start station longitude'],
    "end_lat": df['end station latitude'],
    "end_long": df['end station longitude'],
    "gender": gender_shifted,
    "age": rider_age,
})
clean_df.head()

Unnamed: 0,age,duration,end_lat,end_long,gender,hour,start_lat,start_long,weekend
0,34.0,303,40.733812,-73.980544,0,0,40.740964,-73.986022,0
1,22.0,700,40.763094,-73.97835,0,0,40.739126,-73.979738,0
2,49.0,443,40.744449,-73.983035,0,0,40.729515,-73.990753,0
3,33.0,297,40.71924,-73.95242,0,0,40.710451,-73.960876,0
4,32.0,421,40.786995,-73.941648,0,0,40.799139,-73.938915,0


- Create a dummy variable for whether the rider is in their 20s (ages 20–29)

In [11]:
# Flag riders whose age lies strictly between 19 and 30 (i.e. 20-29 for
# whole-year ages). The comparison is done on the whole column at once instead
# of appending to a list row by row; a missing age compares False and so gets 0.
age_column = clean_df['age']
in_twenties = (age_column > 19) & (age_column < 30)

# Exposed as a plain list of 0/1 ints, the same shape this cell produced before.
twenties = in_twenties.astype('int64').tolist()

clean_df['twenties'] = twenties
clean_df.head()

Unnamed: 0,age,duration,end_lat,end_long,gender,hour,start_lat,start_long,weekend,twenties
0,34.0,303,40.733812,-73.980544,0,0,40.740964,-73.986022,0,0
1,22.0,700,40.763094,-73.97835,0,0,40.739126,-73.979738,0,1
2,49.0,443,40.744449,-73.983035,0,0,40.729515,-73.990753,0,0
3,33.0,297,40.71924,-73.95242,0,0,40.710451,-73.960876,0,0
4,32.0,421,40.786995,-73.941648,0,0,40.799139,-73.938915,0,0


- Save data to new CSV

In [12]:
# Persist the cleaned frame next to the notebook. The DataFrame index is written
# as the first column because to_csv is called with its default settings.
cleaned_output_path = 'cleaned_bike_data.csv'
clean_df.to_csv(cleaned_output_path)