In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [2]:
# Load the prepared ride-level data set from the project's data folder
df = pd.read_parquet(path='2 Data/Prepped Data/nyc_data.parquet')

In [3]:
# bike_rides_daily: sum of `trips` over all rows sharing the same `date`,
# broadcast back so every row carries its day's total.
df['bike_rides_daily'] = (
    df.groupby(by='date')['trips']
      .transform('sum')
)

In [4]:
# month: calendar month number taken from `date`, stored as an integer column
df['month'] = df['date'].dt.month.astype('int')

In [5]:
# Create the season column.
#
# Month -> season mapping:
#   12, 1, 2 -> "winter"
#   3 - 5    -> "spring"
#   6 - 8    -> "summer"
#   anything else (9 - 11) -> "fall"
#
# Done with vectorized masks instead of a Python-level loop over every row,
# which matters because `df` holds one row per ride.
month = df['month']
df['season'] = np.select(
    [
        month.isin([12, 1, 2]),   # winter wraps around the year boundary
        month.between(3, 5),      # bounds are inclusive
        month.between(6, 8),
    ],
    ['winter', 'spring', 'summer'],
    default='fall',
)

In [6]:
# Celsius -> Fahrenheit for the average temperature: F = C * 9 / 5 + 32
df['avgTemp_F'] = df['avgTemp'].mul(9).div(5).add(32)

In [7]:
# The Fahrenheit column replaces the Celsius one, so remove `avgTemp` from the frame
df.drop(labels=['avgTemp'], axis='columns', inplace=True)

In [8]:
# Give the Fahrenheit column its final name: avgTemp_F -> avg_temp
df.rename(mapper={'avgTemp_F': 'avg_temp'}, axis='columns', inplace=True)

In [9]:
# trips_per_start_station: sum of `trips` over all rows with the same start station,
# broadcast back onto each of that station's rows.
df['trips_per_start_station'] = (
    df.groupby(by='start_station_name')['trips']
      .transform('sum')
)

In [10]:
# Preview the first five rows to check the columns added so far
df.head(n=5)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,trips,bike_rides_daily,month,season,avg_temp,trips_per_start_station
0,electric_bike,2022-08-27 13:56:47.728,2022-08-27 14:02:56.651,Flatbush Ave & Ocean Ave,3704.04,3 St & Prospect Park West,3865.05,40.663658,-73.963013,40.668133,-73.97364,casual,2022-08-27,1,121496,8,summer,82.040001,29876
1,electric_bike,2022-08-20 10:37:02.756,2022-08-20 10:45:56.631,Forsyth St\t& Grand St,5382.07,E 11 St & 1 Ave,5746.14,40.717796,-73.993164,40.729538,-73.984268,casual,2022-08-20,1,132339,8,summer,82.220001,52973
2,classic_bike,2022-08-31 18:55:03.051,2022-08-31 19:03:37.344,Perry St & Bleecker St,5922.07,Grand St & Greene St,5500.02,40.735355,-74.004829,40.721699,-74.00238,member,2022-08-31,1,126827,8,summer,78.080002,38894
3,classic_bike,2022-08-02 08:05:00.250,2022-08-02 08:16:52.063,FDR Drive & E 35 St,6230.04,Grand Army Plaza & Central Park S,6839.1,40.744221,-73.971214,40.764397,-73.973717,member,2022-08-02,1,124176,8,summer,79.519997,51933
4,electric_bike,2022-08-25 15:44:48.386,2022-08-25 15:55:39.691,E 40 St & 5 Ave,6474.11,Ave A & E 14 St,5779.11,40.752052,-73.982117,40.730312,-73.980469,member,2022-08-25,1,123775,8,summer,82.580002,35239


In [11]:
# daily_trips_per_station: sum of `trips` per (start station, date) pair,
# broadcast back onto every row of that pair.
df['daily_trips_per_station'] = (
    df.groupby(by=['start_station_name', 'date'])['trips']
      .transform('sum')
)

In [12]:
# trips_per_season: sum of `trips` per (start station, season) pair,
# broadcast back onto every row of that pair.
df['trips_per_season'] = (
    df.groupby(by=['start_station_name', 'season'])['trips']
      .transform('sum')
)

In [13]:
# Preview the first five rows, now including the per-date and per-season station totals
df.head(n=5)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,...,member_casual,date,trips,bike_rides_daily,month,season,avg_temp,trips_per_start_station,daily_trips_per_station,trips_per_season
0,electric_bike,2022-08-27 13:56:47.728,2022-08-27 14:02:56.651,Flatbush Ave & Ocean Ave,3704.04,3 St & Prospect Park West,3865.05,40.663658,-73.963013,40.668133,...,casual,2022-08-27,1,121496,8,summer,82.040001,29876,179,12590
1,electric_bike,2022-08-20 10:37:02.756,2022-08-20 10:45:56.631,Forsyth St\t& Grand St,5382.07,E 11 St & 1 Ave,5746.14,40.717796,-73.993164,40.729538,...,casual,2022-08-20,1,132339,8,summer,82.220001,52973,329,20851
2,classic_bike,2022-08-31 18:55:03.051,2022-08-31 19:03:37.344,Perry St & Bleecker St,5922.07,Grand St & Greene St,5500.02,40.735355,-74.004829,40.721699,...,member,2022-08-31,1,126827,8,summer,78.080002,38894,134,10259
3,classic_bike,2022-08-02 08:05:00.250,2022-08-02 08:16:52.063,FDR Drive & E 35 St,6230.04,Grand Army Plaza & Central Park S,6839.1,40.744221,-73.971214,40.764397,...,member,2022-08-02,1,124176,8,summer,79.519997,51933,213,17595
4,electric_bike,2022-08-25 15:44:48.386,2022-08-25 15:55:39.691,E 40 St & 5 Ave,6474.11,Ave A & E 14 St,5779.11,40.752052,-73.982117,40.730312,...,member,2022-08-25,1,123775,8,summer,82.580002,35239,153,10411


In [14]:
# Number of rows per start station, most frequent station first
df['start_station_name'].value_counts(sort=True, ascending=False)

W 21 St & 6 Ave                 128823
West St & Chambers St           123045
Broadway & W 58 St              114040
6 Ave & W 33 St                 106236
1 Ave & E 68 St                 104685
                                 ...  
E 6 St 2 Ave                         6
Sharon St & Olive St_new             3
MTL-ECO5-LAB                         2
Schermerhorn St and Court St         2
JCBS Depot                           2
Name: start_station_name, Length: 1845, dtype: Int64

In [20]:
# Create reduced data set: one row per (start_station_name, date) pair.
#
# `season` and `daily_trips_per_station` are taken with 'first' because
# `daily_trips_per_station` was computed per (station, date) group and `season`
# is derived from the date, so both are constant within each group.
#
# `season` is cast to 'category' here, at construction time, so that the
# dtype does not depend on a later cell having been executed first
# (keeps Restart & Run All consistent with the displayed dtypes).
df_grouped = (
    df.groupby(['start_station_name', 'date'], as_index=False)
      .agg(
          season=('season', 'first'),
          daily_trips_per_station=('daily_trips_per_station', 'first'),
      )
      .astype({'season': 'category'})
)

In [21]:
# Preview the first five rows of the reduced (station, date) frame
df_grouped.head(n=5)

Unnamed: 0,start_station_name,date,season,daily_trips_per_station
0,1 Ave & E 110 St,2022-01-01,winter,26
1,1 Ave & E 110 St,2022-01-02,winter,38
2,1 Ave & E 110 St,2022-01-03,winter,44
3,1 Ave & E 110 St,2022-01-04,winter,31
4,1 Ave & E 110 St,2022-01-05,winter,37


In [22]:
# Inspect the dimensions (rows, columns) of the reduced frame
df_grouped.shape

(581608, 4)

In [26]:
# Inspect the column dtypes of the reduced frame.
# NOTE(review): the recorded output was captured at execution count 26, i.e. after the
# `astype('category')` cell (count 25) that appears further down in the notebook.
df_grouped.dtypes

start_station_name                 string
date                       datetime64[ns]
season                           category
daily_trips_per_station             int64
dtype: object

In [25]:
# Store the season labels (winter / spring / summer / fall) as a categorical column
df_grouped['season'] = df_grouped['season'].astype(dtype='category')

In [27]:
# Persist the reduced frame without its index
# (presumably consumed by the Streamlit dashboard, judging by the file name - confirm)
df_grouped.to_parquet(
    path='2 Data/Prepped Data/reduced_for_streamlit.parquet',
    index=False,
)