In [1]:
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [2]:
df = pd.read_parquet('2 Data/Prepped Data/nyc_data.parquet')

In [3]:
df.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,trips
0,electric_bike,2022-08-27 13:56:47.728,2022-08-27 14:02:56.651,Flatbush Ave & Ocean Ave,3704.04,3 St & Prospect Park West,3865.05,40.663658,-73.963013,40.668133,-73.97364,casual,2022-08-27,27.799999,1
1,electric_bike,2022-08-20 10:37:02.756,2022-08-20 10:45:56.631,Forsyth St\t& Grand St,5382.07,E 11 St & 1 Ave,5746.14,40.717796,-73.993164,40.729538,-73.984268,casual,2022-08-20,27.9,1
2,classic_bike,2022-08-31 18:55:03.051,2022-08-31 19:03:37.344,Perry St & Bleecker St,5922.07,Grand St & Greene St,5500.02,40.735355,-74.004829,40.721699,-74.00238,member,2022-08-31,25.6,1
3,classic_bike,2022-08-02 08:05:00.250,2022-08-02 08:16:52.063,FDR Drive & E 35 St,6230.04,Grand Army Plaza & Central Park S,6839.1,40.744221,-73.971214,40.764397,-73.973717,member,2022-08-02,26.4,1
4,electric_bike,2022-08-25 15:44:48.386,2022-08-25 15:55:39.691,E 40 St & 5 Ave,6474.11,Ave A & E 14 St,5779.11,40.752052,-73.982117,40.730312,-73.980469,member,2022-08-25,28.1,1


In [4]:
df.dtypes

rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            string
start_station_id            category
end_station_name              string
end_station_id              category
start_lat                    float32
start_lng                    float32
end_lat                      float32
end_lng                      float32
member_casual               category
date                  datetime64[ns]
avgTemp                      float32
trips                          int64
dtype: object

In [5]:
# Group by start and end stations

df_group = df.groupby(['start_station_name', 'end_station_name'])['trips'].count().reset_index()

In [6]:
df_group

Unnamed: 0,start_station_name,end_station_name,trips
0,1 Ave & E 110 St,1 Ave & E 110 St,791
1,1 Ave & E 110 St,1 Ave & E 18 St,2
2,1 Ave & E 110 St,1 Ave & E 30 St,4
3,1 Ave & E 110 St,1 Ave & E 39 St,1
4,1 Ave & E 110 St,1 Ave & E 44 St,12
...,...,...,...
1020345,York St & Marin Blvd,Van Vorst Park,18
1020346,York St & Marin Blvd,Warren St,42
1020347,York St & Marin Blvd,Washington St,16
1020348,York St & Marin Blvd,Willow Ave & 12 St,1


In [7]:
# Merge lat/lng columns from df with df_group

columns_to_merge = ['start_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']
location_df = df[columns_to_merge].drop_duplicates(subset = 'start_station_name')

In [8]:
df_m = df_group.merge(location_df, on='start_station_name', how='left', indicator = 'merge_flag')

In [9]:
df_m.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng,merge_flag
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792339,-73.93824,40.792328,-73.938301,both
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792339,-73.93824,40.792328,-73.938301,both
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792339,-73.93824,40.792328,-73.938301,both
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792339,-73.93824,40.792328,-73.938301,both
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792339,-73.93824,40.792328,-73.938301,both


In [10]:
# Change column names

df_m.rename(columns={
    'start_lng': 'start_lon',
    'end_lng': 'end_lon'
}, inplace=True)

In [11]:
df_m.shape

(1020350, 8)

In [12]:
df_m['merge_flag'].value_counts(dropna = False)

both          1020350
left_only           0
right_only          0
Name: merge_flag, dtype: int64

In [13]:
df_m.drop(columns = 'merge_flag', inplace = True)

In [14]:
df_m.columns

Index(['start_station_name', 'end_station_name', 'trips', 'start_lat',
       'start_lon', 'end_lat', 'end_lon'],
      dtype='object')

In [15]:
import gc
gc.collect()

0

## Standardize lat/long coordinates

In [16]:
# Group by start station and count distinct lat/lng combinations
start_var_coords = (
    df_m
    .dropna(subset=['start_station_name', 'start_lat', 'start_lon'])
    .groupby('start_station_name')[['start_lat', 'start_lon']]
    .nunique()
)

# Filter to show stations with more than one lat or lng value
start_var_coords = start_var_coords[
    (start_var_coords['start_lat'] > 1) | (start_var_coords['start_lon'] > 1)
]

print(start_var_coords.sort_values(['start_lat', 'start_lon'], ascending=False))

Empty DataFrame
Columns: [start_lat, start_lon]
Index: []


In [17]:
# Same as above for end stations
end_var_coords = (
    df_m
    .dropna(subset=['end_station_name', 'end_lat', 'end_lon'])
    .groupby('end_station_name')[['end_lat', 'end_lon']]
    .nunique()
)

end_var_coords = end_var_coords[
    (end_var_coords['end_lat'] > 1) | (end_var_coords['end_lon'] > 1)
]

print(end_var_coords.sort_values(['end_lat', 'end_lon'], ascending=False))

                          end_lat  end_lon
end_station_name                          
Cleveland Pl & Spring St      124      123
1 Ave & E 68 St               122      121
N 6 St & Bedford Ave          119      118
Broadway & W 58 St            118      118
Kent Ave & N 7 St             117      116
...                           ...      ...
E 6 St 2 Ave                    5        5
Shop Morgan                     4        4
Lab - NYC                       3        3
Sharon St & Olive St_new        3        3
JCBS Depot                      2        2

[1847 rows x 2 columns]


In [18]:
# Use the most frequent lat/lon pair per station
# Build a lookup table of the most frequent lat/lng per start_station_name
start_coords_mode = (
    df_m
    .dropna(subset=['start_station_name', 'start_lat', 'start_lon'])
    .groupby(['start_station_name', 'start_lat', 'start_lon'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .drop_duplicates('start_station_name')
    .drop(columns='count')
)

In [19]:
# Update df_m with standardized values
# Drop current start lat/lon and re-merge standard values

df_m = df_m.drop(columns=['start_lat', 'start_lon'])

df_m = df_m.merge(
    start_coords_mode,
    on='start_station_name',
    how='left'
)

In [20]:
# Use the most frequent lat/lon pair per station
# Build a lookup table of the most frequent lat/lng per end_station_name
end_coords_mode = (
    df_m
    .dropna(subset=['end_station_name', 'end_lat', 'end_lon'])
    .groupby(['end_station_name', 'end_lat', 'end_lon'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .drop_duplicates('end_station_name')
    .drop(columns='count')
)

In [21]:
# Update df_m with standardized values
# Drop current end lat/lon and re-merge standard values

df_m = df_m.drop(columns=['end_lat', 'end_lon'])

df_m = df_m.merge(
    end_coords_mode,
    on='end_station_name',
    how='left'
)

In [22]:
# Check start lat/lon

df_m[['start_lat', 'start_lon']].isna().sum()

start_lat    0
start_lon    0
dtype: int64

In [23]:
# Check end lat/lon

df_m[['end_lat', 'end_lon']].isna().sum()

end_lat    0
end_lon    0
dtype: int64

In [24]:
df_m.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lon,end_lat,end_lon
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792339,-73.93824,40.744877,-73.9953
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792339,-73.93824,40.744877,-73.9953
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792339,-73.93824,40.744877,-73.9953
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792339,-73.93824,40.744877,-73.9953
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792339,-73.93824,40.744877,-73.9953


In [25]:
print(df_m['trips'].max())

12041


In [26]:
# Save merged dataset

df_m.to_parquet('2 Data/Prepped Data/nyc_merged.parquet', index=False)

In [27]:
df_m.dtypes

start_station_name     string
end_station_name       string
trips                   int64
start_lat             float64
start_lon             float64
end_lat               float64
end_lon               float64
dtype: object

In [28]:
df_m['trips'] = df_m['trips'].astype(float)

In [35]:
df_m.shape

(1020350, 7)

In [39]:
df_reduced = df_m[df_m['trips'] > 500]

In [40]:
df_reduced.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lon,end_lat,end_lon
0,1 Ave & E 110 St,1 Ave & E 110 St,791.0,40.792339,-73.93824,40.744877,-73.9953
25,1 Ave & E 110 St,2 Ave & E 96 St,1355.0,40.792339,-73.93824,40.744877,-73.9953
228,1 Ave & E 110 St,E 114 St & 1 Ave,520.0,40.792339,-73.93824,40.744877,-73.9953
444,1 Ave & E 110 St,Lenox Ave & W 111 St,792.0,40.792339,-73.93824,40.744877,-73.9953
454,1 Ave & E 110 St,Lexington Ave & E 111 St,1259.0,40.792339,-73.93824,40.744877,-73.9953


In [41]:
df_reduced.shape

(7350, 7)

In [42]:
# Create KeplerGl instance

m = KeplerGl(height = 700)
m.add_data(data = df_reduced, name = 'NYC Trips')
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'NYC Trips': {'index': [0, 25, 228, 444, 454, 506, 713, 714, 715, 716, 742, 885, 916, 918, 1068…

#### I adjusted the colors of the map to mostly reflect those of the flag of New York City, which is blue, white, and orange. I would think New Yorkers would appreciate this color scheme. I used a light enough blue for the start points so that it can be distinguished from the background. The end points of the bike routes are the least featured color on the map. For those I used green, as it's a color easily distinguishable from the others used, and yet doesn't call too much attention away from things.

#### To get to that point, I changed the "Fill Color" in the respective "start" and "end" drop-down menus. These were originally set up as 5-stepped colors. But in order to focus on the more popular routes in the city, I reduced the stepped colors down to 1. 

#### To change the arc colors I changed the "Color" in the "start -> end arc" drop-down menu.

#### It's interesting that within the most common trips made, there are very few end-station points, and many start-station points. The most popular end points are near popular areas in, and actually outside the city. 

#### 11th and West 27th street is what looks like the most popular destination. This is near The High Line, which is an elevated park. Built on former railway tracks, it offers unique views of the city. Also nearby are: Chelsea Market, a popular food hall; Hudson Yards, large-scale development featuring the Vessel, a unique interactive art structure; The Museum of Modern Art (MoMA); Rockefeller Center. This makes sense, as it would be a popular for locals and tourists. 

#### Newport parkway, is actually in New Jersey, and offers views of the New York City skyline. Notable attractions nearby include Pier A Park, the Hoboken Waterfront, and Newport Centre Mall. The area is also known for its scenic walking areas and boat tours along the Hudson River. 

#### 4th and Jackson street is near Jackson Avenue Station, which is served by the 2 train at all hours. 

In [43]:
kepler_config = m.config

In [44]:
kepler_config

{'version': 'v1',
 'config': {'visState': {'filters': [{'dataId': ['NYC Trips'],
     'id': '6wnrs2906',
     'name': ['trips'],
     'type': 'range',
     'value': [1430, 12041],
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'view': 'side',
     'speed': 1,
     'enabled': True}],
   'layers': [{'id': 'jd3v456',
     'type': 'point',
     'config': {'dataId': 'NYC Trips',
      'label': 'start',
      'color': [41, 76, 181],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'lat': 'start_lat', 'lng': 'start_lon'},
      'isVisible': True,
      'visConfig': {'radius': 10,
       'fixedRadius': False,
       'opacity': 0.8,
       'outline': False,
       'thickness': 2,
       'strokeColor': None,
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},


In [45]:
import json
with open("config.json", "w") as outfile:
    json.dump(kepler_config, outfile)

In [46]:
m.save_to_html(file_name = 'Citibike Trips Aggregated.html', read_only = False, config = kepler_config)

Map saved to Citibike Trips Aggregated.html!
