<h1>Frequently Traveled To CTA Stations via Divvy</h1>

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Plotting
import seaborn # Optional. Makes plots prettier
from mpl_toolkits.mplot3d import Axes3D # 3-d plotting

In [2]:
# Data file paths

divvy_trip_data_path = "../data/divvy/Divvy_Trips_2016_Q1Q2/" + \
    "Divvy_Trips_2016_Q1Q2/Divvy_Trips_2016_Q1.csv"

divvy_station_data_path = "../data/divvy/Divvy_Trips_2016_Q1Q2/" + \
    "Divvy_Trips_2016_Q1Q2/Divvy_Stations_2016_Q1Q2.csv"

cta_data_url = '''../data/CTA_-_System_Information_-_List_of__L__Stops.csv'''

In [3]:
# Create dataframe objects

cta_data = pd.read_csv(cta_data_url)
divvy_trip_data = pd.read_csv(divvy_trip_data_path)
divvy_station_data = pd.read_csv(divvy_station_data_path)

In [4]:
# Split cta_data coordinate information into latitude and longitude

cta_data["lat"], cta_data["lon"] = zip(*[i.strip('(').strip(')').split(",") 
                                         for i in cta_data.Location])

<h2>Goal</h2>
<p>Find nearest CTA station to each divvy_trip_data "to_station" within a certain radius.</p>


In [5]:
from math import sqrt
def euclid_distance(x1, x2, y1, y2):
    '''Euclidean distance formula'''
    coord_1 = (x1-y1)**2
    coord_2 = (x2-y2)**2
    return sqrt(coord_1 + coord_2)

# Algorithms for shrinking search parameters
def __get_small_start_point(x, y, lats_list, lons_list, start):
    '''Recursively find the smallest starting point'''
    if x > float(lats_list[start]) and y > float(lons_list[start]):
        start += 1
    else:
        return start

    return get_small_start_point(x,y,
        lats_list[start:], lons_list[start:], start)

def __get_small_end_point(x, y, lats_list, lons_list, start, end):
    '''Recursively find the smallest starting grid. Requires a starting point.'''
    if x < float(lats_list[end]) and y < float(lons_list[end]):
        end -= 1
    else:
        return end
    
    return get_small_end_point(x,y,
        lats_list[start:end], lons_list[start:end], start, end)


def __sort_data(divvy_station_data, cta_data):
    cta_data = cta_data.sort_values(['lat','lon'], ascending=[False, False])
    divvy_station_data = divvy_station_data.sort_values(['latitude','longitude'], 
                                                        ascending=[False, False])
    return divvy_station_data, cta_data


def find_nearest_cta_station_to_divvy(divvy_station_data, cta_station_data, original_min_distance=.005):
    '''Create a list of the nearest CTA stations to Divvy Stations within a certain radius (initially .005, which is about half a mile)'''
    # Initialize nearest El station with "Nones"
    nearest_station = [None]*len(divvy_station_data)
    
    # Sort data
    divvy_station_data, cta_station_data = __sort_data(divvy_station_data, cta_station_data)
    
    # Extract divvy station data into lists
    lats, lons, divvy_names = (list(divvy_station_data['latitude']), list(divvy_station_data['longitude']), 
                               list(divvy_station_data['name']))
    
    # Extract cta station data into lists
    cta_lats, cta_lons, cta_names = (list(cta_station_data['lat']), list(cta_station_data['lon']),
                                     list(cta_station_data['STATION_DESCRIPTIVE_NAME']))
    
    # Iterate through Divvy station data
    for n, (x,y,name) in enumerate(zip(lats, lons, divvy_names)):
        x,y = map(float, [x,y]) # Convert x and y to float types
        min_distance = original_min_distance

        start_index = 0
        end_index = len(cta_lats) - 1 # Ending index of cta_lats

        start_index = __get_small_start_point(x, y, cta_lats, cta_lons, start_index)
        end_index = __get_small_end_point(x, y, cta_lats, cta_lons, start_index, end_index)


        # Iterate through CTA stations to compare
        segments = zip(cta_lats[start_index:end_index], cta_lons[start_index:end_index], 
                       cta_names[start_index:end_index]) # Construct list of CTA data to sift through
        
        # Iterate through shrunken lists of CTA data
        for k, (a,b,cta_name) in enumerate(segments):
            a,b = map(float, [a,b]) # Convert x,y,a,b from strings to float types

            if euclid_distance(x,y,a,b) < min_distance:
                min_distance = euclid_distance(x,y,a,b)
                nearest_station[n] = cta_name
                
    return nearest_station

divvy_station_data['nearest_L'] = find_nearest_cta_station_to_divvy(divvy_station_data, cta_data) # Append to dataframe

In [6]:
# Create dictionaries to make columns associating 
    # Divvy stations w/ nearest L stations

station_nearest_L_dict = {k:L 
    for k,L in zip(divvy_station_data.id, divvy_station_data.nearest_L)}

cta_station_lat_lon_dict = {k : {'lat' : x, 'lon' : y} 
    for k,x,y in zip(cta_data.STATION_DESCRIPTIVE_NAME, cta_data.lat, cta_data.lon)}

In [18]:
# Append nears CTA station to & from Divvy stations


# Add nearest station name to "to" and "from" data
divvy_trip_data['nearest_to_L'] = [station_nearest_L_dict.get(i) 
    for i in divvy_trip_data.to_station_id]
divvy_trip_data['nearest_from_L'] = [station_nearest_L_dict.get(i)
    for i in divvy_trip_data.from_station_id]


# Add nearest station to Divvy "to" data
divvy_trip_data['nearest_to_L_lat'] = [cta_station_lat_lon_dict.get(i)['lat'] 
    if cta_station_lat_lon_dict.get(i) else None
    for i in divvy_trip_data.nearest_to_L]
divvy_trip_data['nearest_to_L_lon'] = [cta_station_lat_lon_dict.get(i)['lon'] 
    if cta_station_lat_lon_dict.get(i) else None
    for i in divvy_trip_data.nearest_to_L]


# Add nearest station to Diivvy "from" data
divvy_trip_data['nearest_from_L_lat'] = [cta_station_lat_lon_dict.get(i)['lat'] 
    if cta_station_lat_lon_dict.get(i) else None
   for i in divvy_trip_data.nearest_fom_L]
divvy_trip_data['nearest_from_L_lon'] = [cta_station_lat_lon_dict.get(i)['lon'] 
    if cta_station_lat_lon_dict.get(i) else None 
    for i in divvy_trip_data.nearest_from_L]

# Add coords to dataframe
divvy_trip_data['coords'] = [(i,k) 
    for i,k in zip(divvy_trip_data.nearest_to_L_lat, 
                   divvy_trip_data.nearest_to_L_lon)]
divvy_trip_data['from_coords'] = [(i,k) 
    for i,k in zip(divvy_trip_data.nearest_from_L_lat, 
                   divvy_trip_data.nearest_from_L_lon)]

In [19]:
# Let's see the product of this!

divvy_trip_data.head()

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,...,birthyear,nearest_to_L,nearest_fom_L,nearest_to_L_lat,nearest_to_L_lon,nearest_from_L_lat,nearest_from_L_lon,coords,from_coords,nearest_from_L
0,9080551,3/31/2016 23:53,4/1/2016 0:07,155,841,344,Ravenswood Ave & Lawrence Ave,458,Broadway & Thorndale Ave,Subscriber,...,1986,,,,,,,"(None, None)","(None, None)",
1,9080550,3/31/2016 23:46,3/31/2016 23:57,4831,649,128,Damen Ave & Chicago Ave,213,Leavitt St & North Ave,Subscriber,...,1980,Clinton (Green & Pink Lines),,41.885678,-87.641782,,,"(41.885678, -87.641782)","(None, None)",
2,9080549,3/31/2016 23:42,3/31/2016 23:46,4232,210,350,Ashland Ave & Chicago Ave,210,Ashland Ave & Division St,Subscriber,...,1979,Bryn Mawr (Red Line),Thorndale (Red Line),41.983504,-87.65884,41.990259,-87.659076,"(41.983504, -87.65884)","(41.990259, -87.659076)",Thorndale (Red Line)
3,9080548,3/31/2016 23:37,3/31/2016 23:55,3464,1045,303,Broadway & Cornelia Ave,458,Broadway & Thorndale Ave,Subscriber,...,1980,,Lawrence (Red Line),,,41.969139,-87.658493,"(None, None)","(41.969139, -87.658493)",Lawrence (Red Line)
4,9080547,3/31/2016 23:33,3/31/2016 23:37,1750,202,334,Lake Shore Dr & Belmont Ave,329,Lake Shore Dr & Diversey Pkwy,Subscriber,...,1969,"State/Lake (Brown, Green, Orange, Pink & Purpl...",Oak Park (Green Line),41.88574,-87.627835,41.886988,-87.793783,"(41.88574, -87.627835)","(41.886988, -87.793783)",Oak Park (Green Line)


In [None]:
# Generate 3-d map
# %matplotlib inline # <-- Optional line. Puts chart within notebook. If not called, will create new window with interactive 3d plot

# Get unique coordinates using set comprehension
coords = {i for i in zip(divvy_trip_data.nearest_to_L_lat, 
                         divvy_trip_data.nearest_to_L_lon)}
from_coords = coords = {i for i in zip(divvy_trip_data.nearest_from_L_lat, 
                         divvy_trip_data.nearest_from_L_lon)}

# Generate a list of those coordinates and the counts of those coordinate in the data
divvy_trip_coords = list(divvy_trip_data.coords)
divvy_from_trip_coords = list(divvy_trip_data.from_coords)

values = [(coord, divvy_trip_coords.count(coord)) 
          for coord in coords
          if coord[0] != None]
from_values  = [(coord, divvy_from_trip_coords.count(coord)) 
                  for coord in from_coords
                  if coord[0] != None]


x = [float(i[0][0]) for i in values] # L station X
y = [float(i[0][1]) for i in values] # L station Y
counts = [i[1] for i in values] # L station popularity

from_x = [float(i[0][0]) for i in from_values] # L station X
from_y = [float(i[0][1]) for i in from_values] # L station Y
from_counts = [i[1] for i in from_values] # L station popularity

# Plot it!
fig = plt.figure(figsize=(18,10)) # Create figure object of size 10x10
ax1 = fig.add_subplot(111, projection='3d') # Create axis within figure object and utilize 3-d projection

xpos = y # Lat and lon flipped. It's confusing. Sry.
ypos = x
num_elements = len(xpos)
zpos = [0]*num_elements # Initial 3-d position
dx = [.002]*num_elements # X transformation
dy = [.002]*num_elements # Y transformation . Lower numbers decrease base of bars
dz = counts # Z transformation "Height" of the bars

ax1.bar3d(xpos, ypos, zpos, dx, dy, dz, alpha=1) # Plot into axis object


# Plot "From" data as well in orange
xpos = from_y # Lat and lon flipped. It's confusing. Sry.
ypos = from_x
num_elements = len(xpos)
zpos = [0]*num_elements # Initial 3-d position
dx = [.002]*num_elements # X transformation
dy = [.002]*num_elements # Y transformation . Lower numbers decrease base of bars
dz = from_counts # Z transformation "Height" of the bars

from_stations = ax1.bar3d(xpos, ypos, zpos, dx, dy, dz, alpha=.50, color="orange", label="From Trips") # Plot into axis object

plt.title("Frequently Traveled To CTA Stations by Divvy Users", size=18, weight="bold") # Add a title to it
plt.show() # Show it!

In [None]:
# In text form

for k,n in divvy_trip_data.nearest_to_L.value_counts().items():
    print(k,": ",n)