# Setting up the notebook

In [125]:
%time
import os
import pandas as pd

# Create all data cleaning processing steps in one DF and batch process
# the rest on the server or cloud.

os.chdir('C:/Users/riley/Documents/Coding/DSC/lyft_no_data/')


dev_df = pd.read_csv('201801-fordgobike-tripdata.csv')

Wall time: 0 ns


# Data Cleaning and Feature Engineering

In [126]:
%time

import numpy as np
import datetime

# Establishing the following data cleaning protocols in this test environment for this df and then will apply to master
# on server or cloud if needed but it shouldn't be too computationally
# taxing either way.

# Get min from seconds column
dev_df['duration_min'] = np.round(dev_df['duration_sec'] / 60, 0)

# Will now interpolate the member_birth_year with the mean birth year
# Also, I'm gonna get their age by subtracting now() from their birth year

dev_df['member_birth_year'].fillna(np.mean(dev_df['member_birth_year']), inplace=True)

# That got years as floats so I'm changing it to int:
dev_df['member_birth_year'] = dev_df['member_birth_year'].apply(np.int64)

# Now let's get the current year and subtract that from birth year to get age.

now = datetime.datetime.now()

dev_df['age'] = now.year - dev_df['member_birth_year']

# THIS DIDN"T WORK:
# More efficient way computationally to encode categorical gender to numerical
dev_df['member_gender'] = dev_df.member_gender.map(dict(Male=0, Female=1, Other=2))

Wall time: 0 ns


In [127]:
# Re-order the columns so the time information is at the beginning

dev_df = dev_df[['duration_sec', 'duration_min', 'start_time', 'end_time', 'start_station_id',
       'start_station_name', 'start_station_latitude',
       'start_station_longitude', 'end_station_id', 'end_station_name',
       'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
       'member_birth_year', 'member_gender', 'bike_share_for_all_trip']]

In [128]:
# list comprehension on the start_time column to get datetime

from dateutil import parser

dev_df['start_time'] = [parser.parse(timestring) for timestring in dev_df['start_time']]

In [129]:
# list comprehension and apply both are slow for parser, is there a faster method?

from dateutil import parser

dev_df['end_time'] = dev_df['end_time'].apply(lambda x: parser.parse(x))

In [130]:
dev_df.head()

Unnamed: 0,duration_sec,duration_min,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,member_birth_year,member_gender,bike_share_for_all_trip
0,75284,1255.0,2018-01-31 22:52:35.239,2018-02-01 19:47:19.824,120,Mission Dolores Park,37.76142,-122.426435,285,Webster St at O'Farrell St,37.783521,-122.431158,2765,Subscriber,1986,0.0,No
1,85422,1424.0,2018-01-31 16:13:34.351,2018-02-01 15:57:17.310,15,San Francisco Ferry Building (Harry Bridges Pl...,37.795392,-122.394203,15,San Francisco Ferry Building (Harry Bridges Pl...,37.795392,-122.394203,2815,Customer,1980,,No
2,71576,1193.0,2018-01-31 14:23:55.889,2018-02-01 10:16:52.116,304,Jackson St at 5th St,37.348759,-121.894798,296,5th St at Virginia St,37.325998,-121.87712,3039,Customer,1996,0.0,No
3,61076,1018.0,2018-01-31 14:53:23.562,2018-02-01 07:51:20.500,75,Market St at Franklin St,37.773793,-122.421239,47,4th St at Harrison St,37.780955,-122.399749,321,Customer,1980,,No
4,39966,666.0,2018-01-31 19:52:24.667,2018-02-01 06:58:31.053,74,Laguna St at Hayes St,37.776435,-122.426244,19,Post St at Kearny St,37.788975,-122.403452,617,Subscriber,1991,0.0,No


In [142]:
dev_df['member_gender'].value_counts()

0.0    65508
1.0    20298
2.0     1195
Name: member_gender, dtype: int64

In [143]:
test_df = pd.read_csv('201801-fordgobike-tripdata.csv')

test_df['member_gender'].value_counts()

Male      65508
Female    20298
Other      1195
Name: member_gender, dtype: int64

In [144]:
dev_df.tail()

Unnamed: 0,duration_sec,duration_min,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,member_birth_year,member_gender,bike_share_for_all_trip
94797,695,12.0,2018-01-01 00:19:58.761,2018-01-01 00:31:33.832,23,The Embarcadero at Steuart St,37.791464,-122.391034,66,3rd St at Townsend St,37.778742,-122.392741,3671,Customer,1980,,No
94798,600,10.0,2018-01-01 00:19:48.761,2018-01-01 00:29:49.074,17,Embarcadero BART Station (Beale St at Market St),37.792251,-122.397086,19,Post St at Kearny St,37.788975,-122.403452,603,Customer,1980,,No
94799,1151,19.0,2018-01-01 00:09:31.745,2018-01-01 00:28:43.159,97,14th St at Mission St,37.768265,-122.42011,125,20th St at Bryant St,37.7592,-122.409851,3455,Subscriber,1992,0.0,No
94800,714,12.0,2018-01-01 00:07:52.943,2018-01-01 00:19:47.075,74,Laguna St at Hayes St,37.776435,-122.426244,70,Central Ave at Fell St,37.773311,-122.444293,2423,Subscriber,1986,0.0,No
94801,145,2.0,2018-01-01 00:07:41.040,2018-01-01 00:10:06.241,316,San Salvador St at 1st St,37.330165,-121.885831,311,Paseo De San Antonio at 2nd St,37.333798,-121.886943,2473,Subscriber,1957,0.0,No


In [None]:
# For calculating distance: can you calc the haversine distance or euclidian distance of the start and end points of the rides?
# What about Eric's idea about querying Google Maps API and pulling the distances from there? What does that API look like
# to pull from in terms of in Python

# YOU COULD GET THE COST PER MILE FOR THE AVG. BIKE RIDE TO TELL USERS HOW MUCH IT WILL COST THEM TO RIDE TO X STOP

In [None]:
# USE groupby and unstack to do multiiindexed plotting! Like gender and age and average ride length for example