## Import libraries and data

In [1]:
import gc
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots
from streamlit_keplergl import keplergl_static

In [2]:
# Load the prepared rides table. The columns at positions 5 and 7
# (start_station_id / end_station_id in the dtypes listing below) are forced to str.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(
    r"C:\Users\Drew\New_York_CitiBike\02 Data\Prepared Data\CitiBike_NOAA_Rides.csv",
    dtype={5: 'str', 7: 'str'},
)

In [3]:
# Load the station-pair trips table; the first CSV column is used as the index.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_geo = pd.read_csv(
    r"C:\Users\Drew\New_York_CitiBike\Trips_Geo_Stations.csv",
    index_col=0,
)

In [4]:
# Inspect the dtype pandas assigned to each column of the rides table
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
date                   object
avgTemp               float64
number_of_rides         int64
_merge                 object
tripduration          float64
gender                 object
dtype: object

In [5]:
# Inspect the dtype pandas assigned to each column of the station-pair table
df_geo.dtypes

start_station_name     object
end_station_name       object
trips                   int64
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
dtype: object

## Creating 'month' and 'season' columns

In [6]:
# Build the 'month' column:
#   1. parse 'date' as datetime using the explicit '%Y-%m-%d' format
#   2. take the calendar month (1-12) of each date and store it as an integer

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['month'] = df['date'].dt.month.astype('int')

In [7]:
# create season column
#
# Vectorised month -> season lookup: Series.map does the translation inside
# pandas instead of evaluating a Python conditional once per row.
#
# NOTE(review): the boundaries are kept exactly as originally written:
#   winter = Dec-Apr, spring = May, summer = Jun-Sep, fall = Oct-Nov.
# These differ from the usual meteorological seasons - confirm they are intended.
# 'month' comes from .dt.month cast to int, so every value is expected to be 1-12
# and therefore present in the table below.
season_by_month = {
    12: "winter", 1: "winter", 2: "winter", 3: "winter", 4: "winter",
    5: "spring",
    6: "summer", 7: "summer", 8: "summer", 9: "summer",
    10: "fall", 11: "fall",
}

df['season'] = df['month'].map(season_by_month)
    

## Using plotly to produce bar chart of most popular stations

In [8]:
# Count rides per start station and keep the 20 busiest.
# A constant 'value' of 1 is added to every row so that summing it per group
# yields the number of rows (presumably one row per ride) for that station.

df['value'] = 1
df_station_bar_count = df.groupby('start_station_name', as_index=False)['value'].sum()
top20_stations = df_station_bar_count.nlargest(n=20, columns='value')

In [9]:
# Preview the first five rows of the top-20 table (largest counts first)
top20_stations.head()

Unnamed: 0,start_station_name,value
1587,W 21 St & 6 Ave,128823
1718,West St & Chambers St,123045
495,Broadway & W 58 St,114040
286,6 Ave & W 33 St,106236
8,1 Ave & E 68 St,104685


In [None]:
# Bar chart of the 20 most-used start stations (plotly)

# Three-stop blue colorscale; bars are coloured by their ride count
colorscale = [
    [0.0, '#004B87'],  # dark blue
    [0.5, '#0067B1'],  # CitiBike blue
    [1.0, '#66C5E9'],  # light blue
]

top20_bars = go.Bar(
    x=top20_stations['start_station_name'],
    y=top20_stations['value'],
    marker=dict(color=top20_stations['value'], colorscale=colorscale),
)
fig_top20 = go.Figure(top20_bars)

# Single layout update: tick rotation, titles and figure size.
# update_layout returns the figure, so the cell still renders it.
fig_top20.update_layout(
    xaxis_tickangle=20,
    title='Top 20 Most Popular CitiBike Stations in New York',
    xaxis_title='Start Stations',
    yaxis_title='Number of Times Stations Used by Riders',
    width=900,
    height=600,
)


In [11]:
# Run a full garbage-collection pass before building the next chart.
# gc is imported once in the imports cell at the top of the notebook;
# the returned count of unreachable objects is shown as the cell output.
gc.collect()

279

## Creating Pie Chart of Member Usage

In [12]:
# Total the per-row 'value' counter for each rider type (member_casual)

df_member = df.groupby('member_casual', as_index=False).agg(value=('value', 'sum'))

In [None]:
# Pie chart of rides split by rider type (plotly)

# Two CitiBike blues (dark, standard) used for the slices
colors = ['#004B87', '#0067B1']

member_pie = go.Pie(
    labels=df_member['member_casual'],
    values=df_member['value'],
    marker=dict(colors=colors),
    textinfo='percent+label',
)
fig_member = go.Figure(member_pie)

# NOTE(review): pie traces have no cartesian axes, so the two axis titles below
# probably have no visible effect - confirm and consider dropping them.
fig_member.update_layout(
    title='CitiBike Member Usage in New York',
    xaxis_title='Type of Rider',
    yaxis_title='Number of Trips by Riders',
    width=900,
    height=600,
)

In [14]:
# Run a full garbage-collection pass before building the next chart.
# gc is imported once in the imports cell at the top of the notebook;
# the returned count of unreachable objects is shown as the cell output.
gc.collect()

85

## Creating Bar Chart of Rides Based on Season

In [15]:
# Total the per-row 'value' counter for each season

df_season = df.groupby('season', as_index=False)['value'].sum()

In [None]:
# Bar chart of ride counts per season (plotly)

# Three-stop blue colorscale; bars are coloured by their ride count
colorscale = [
    [0.0, '#004B87'],  # dark blue
    [0.5, '#0067B1'],  # CitiBike blue
    [1.0, '#66C5E9'],  # light blue
]

season_bars = go.Bar(
    x=df_season['season'],
    y=df_season['value'],
    marker=dict(color=df_season['value'], colorscale=colorscale),
)
fig_season = go.Figure(season_bars)

# Titles and figure size; update_layout returns the figure, so the cell renders it
fig_season.update_layout(
    title='Seasonal Bike Usage',
    xaxis_title='Season',
    yaxis_title='Number of Rides by Season',
    width=900,
    height=600,
)

In [17]:
# Run a full garbage-collection pass before building the next chart.
# gc is imported once in the imports cell at the top of the notebook;
# the returned count of unreachable objects is shown as the cell output.
gc.collect()

17

## Creating Pie Chart of Type of Bike Used

In [18]:
# Total the per-row 'value' counter for each bike type (rideable_type)

df_bike_type = df.groupby('rideable_type', as_index=False).agg(value=('value', 'sum'))

In [None]:
# Pie chart of rides split by bike type (plotly)

# Two CitiBike blues (dark, standard) used for the slices
colors = ['#004B87', '#0067B1']

bike_type_pie = go.Pie(
    labels=df_bike_type['rideable_type'],
    values=df_bike_type['value'],
    marker=dict(colors=colors),
    textinfo='percent+label',
)
fig_bike_type = go.Figure(bike_type_pie)

# NOTE(review): pie traces have no cartesian axes, so the two axis titles below
# probably have no visible effect - confirm and consider dropping them.
fig_bike_type.update_layout(
    title='Comparison of Bike Type Used by Riders in New York',
    xaxis_title='Type of Bike',
    yaxis_title='Number of Trips by Riders',
    width=900,
    height=600,
)

In [20]:
# Run a full garbage-collection pass before building the next chart.
# gc is imported once in the imports cell at the top of the notebook;
# the returned count of unreachable objects is shown as the cell output.
gc.collect()

208

## Creating Dual-Line Chart of Bike Trips

In [21]:
# Keep only the three columns the dual-axis line chart needs
line_columns = ['date', 'number_of_rides', 'avgTemp']
df_line = df[line_columns]

In [22]:
# The full table is too large to plot, so draw a fixed-seed random sample of rows.
# NOTE(review): number_of_rides and avgTemp look like per-day values repeated on
# every ride row - if so, one row per date would be enough for the line chart. Confirm.

df_sample = df_line.sample(n=2_000_000, random_state=42)

In [23]:
# Order the sample chronologically so the line traces run left to right
df_sample = df_sample.sort_values('date')

In [None]:
# Dual-axis line chart: daily rides on the primary y-axis, temperature on the secondary
fig_trip_temp = make_subplots(specs=[[{"secondary_y": True}]])

# One entry per trace: (source column, legend name, line colour, use secondary y-axis)
trace_specs = [
    ('number_of_rides', 'Daily Bike Rides', 'blue', False),
    ('avgTemp', 'Daily Temperature', 'red', True),
]

for column, trace_name, line_colour, on_secondary in trace_specs:
    fig_trip_temp.add_trace(
        go.Scatter(
            x=df_sample['date'],
            y=df_sample[column],
            name=trace_name,
            mode='lines',  # lines only, no markers
            line=dict(color=line_colour),
        ),
        secondary_y=on_secondary,
    )

fig_trip_temp.update_layout(
    title='Daily Bike Trips vs Average Temperature in New York',
    xaxis_title="Date",
    yaxis_title="Number of daily bike rides",
    yaxis2_title="Average temperatures (°F)",
    width=900,
    height=600,
)

fig_trip_temp.show()


In [25]:
# Run a full garbage-collection pass before reducing the table.
# gc is imported once in the imports cell at the top of the notebook;
# the returned count of unreachable objects is shown as the cell output.
gc.collect()

16

## Reducing rows and columns

In [26]:
# Drop the identifier, timestamp and merge-bookkeeping columns listed below
unused_columns = [
    'ride_id',
    'started_at',
    'ended_at',
    'start_station_id',
    'end_station_id',
    '_merge',
    'gender',
]
df_1 = df.drop(columns=unused_columns)

In [27]:
# Confirm which columns remain after the drop
df_1.columns

Index(['rideable_type', 'start_station_name', 'end_station_name', 'start_lat',
       'start_lng', 'end_lat', 'end_lng', 'member_casual', 'date', 'avgTemp',
       'number_of_rides', 'tripduration', 'month', 'season', 'value'],
      dtype='object')

## Create Random Split

In [28]:
# Seeded boolean mask with one uniform draw per row of df_1:
# True where the draw is <= 0.92, False for the remainder.
np.random.seed(42)
red = np.random.rand(df_1.shape[0]) <= 0.92

In [29]:
# Keep the rows where the mask is False (the smaller part of the split)
small = df_1.loc[~red]

In [30]:
# Check the (rows, columns) size of the reduced table
small.shape

(2380560, 15)