# Setting up the notebook

In [2]:
import os
import pandas as pd

# Create all data cleaning processing steps in one DF and batch process
# the rest on the server or cloud.

# Set WD to location of data
os.chdir('C:/Users/riley/Documents/Coding/DSC/lyft_no_data/')

# Load the one data file to practice on
dev_df = pd.read_csv('201801-fordgobike-tripdata.csv')

Wall time: 0 ns


# Data Cleaning and Feature Engineering

In [3]:
import numpy as np
import datetime

# Establishing the following data cleaning protocols in this test environment for this df and then will apply to master
# on server or cloud if needed but it shouldn't be too computationally
# taxing either way.

# Get min from seconds column
dev_df['duration_min'] = np.round(dev_df['duration_sec'] / 60, 0)

# Will now interpolate the member_birth_year with the mean birth year
# Also, I'm gonna get their age by subtracting now() from their birth year
dev_df['member_birth_year'].fillna(np.mean(dev_df['member_birth_year']), inplace=True)

# That got years as floats so I'm changing it to int:
dev_df['member_birth_year'] = dev_df['member_birth_year'].apply(np.int64)

# Now let's get the current year and subtract that from birth year to get age.
now = datetime.datetime.now()
dev_df['age'] = now.year - dev_df['member_birth_year']

Wall time: 0 ns


In [4]:
# Re-order the columns so the time information is at the beginning
dev_df = dev_df[['duration_sec', 'duration_min', 'start_time', 'end_time', 'start_station_id',
       'start_station_name', 'start_station_latitude',
       'start_station_longitude', 'end_station_id', 'end_station_name',
       'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
       'member_birth_year', 'member_gender', 'bike_share_for_all_trip']]

In [6]:
# list comprehension on the start_time column to get datetime
from dateutil import parser

# This runs a bit slowly (12 S for 90,000 rows) - look for alternatives
dev_df['start_time'] = [parser.parse(timestring) for timestring in dev_df['start_time']];

UsageError: Line magic function `%%time` not found.


In [16]:
# list comprehension and apply both are slow for parser, is there a faster method?
from dateutil import parser

# Same slowness as above
dev_df['end_time'] = dev_df['end_time'].apply(lambda x: parser.parse(x));

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [7]:
# Encode the different categorical genders to integers
genders = {'Male': 0, 'Female':1, 'Other':2}

# Use replace to replace each key in genders dict with corresponding value
dev_df['member_gender'].replace(genders, inplace=True);

In [None]:
############## SLOW - do not run yet ####################

# list comprehensions for the customer and subscriber costs

import math as m

# Calculates the cost of the ride based on two distinct rider types:
# Customers - who are charged $2 for rides under 30 min, and an additional
# $3 per 15 min.
# Subscribers - charged nothing up to 45 min per ride, and an additional
# $3 per 15 min

# The logic is the same, subtract 30 (or 45 min), take the larger number
# and ceiling it to get a scalar representing the number of 15 min
# increments, then multiply by the price of each additional 15 min
# increment

customer_costs = [2 + m.ceil(max((minute - 30), 0) / 15) * 3 for minute in dev_df['duration_min'] if np.where(dev_df['user_type'] == 'Customer')]
subscriber_costs = [m.ceil(max((minute - 45), 0) / 15) * 3 for minute in dev_df['duration_min'] if np.where(dev_df['user_type'] == 'Subscriber')]

In [None]:
# For calculating distance: is haversine or euclidian good enough? (80/20)
# Does it matter? What does it accomplish?

# A. With distance of ride, you can calculate cost per mile (based on avg.
# speed traveled by riders)

# B. You can get avg. and total distance covered by bikes, which can help
# determine how bikes improve/enable mobility.

In [None]:
# USE groupby and unstack to do multiiindexed plotting! Like gender and age and average ride length for example