# Exercise 2.6 Task

## 01 - 02. Import Libraries and Data

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [None]:
# Load the source CSV, using its first column as the DataFrame index
df = pd.read_csv('newyork_data_temps_bikeridesdaily.csv', index_col = 0)

In [None]:
# Inspect the dtype of every column
df.dtypes

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Expose the date as a regular datetime column instead of the index.
#
# When the index is named 'date' it is promoted to a column. The cell is safe
# to re-run: once 'date' is already a column (and no longer the index),
# nothing is dropped or reset, so the real dates are never thrown away.
if df.index.name == 'date':
    # A stale 'date' column would collide with the promoted index, so remove
    # it first (errors='ignore' makes this a no-op when it is absent).
    df = df.drop(columns=['date'], errors='ignore').reset_index()
elif 'date' not in df.columns:
    # Fail with a clear message rather than a bare KeyError further down
    raise KeyError("expected 'date' as the index name or as a column")

# Parse the YYYY-MM-DD values into datetime dtype
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')


In [None]:
# Check the (rows, columns) size after restoring the date column
df.shape

In [None]:
# Summary statistics for the frame's columns
df.describe()

In [None]:
# Count the missing (null) values in each column

df.isnull().sum()

In [None]:
# Preview the first 10 rows again, now with 'date' as a column
df.head(10)

In [None]:
# Derive the calendar month number of each row's date as an integer column

month_numbers = df['date'].dt.month
df['month'] = month_numbers.astype('int')

In [None]:
# Label every row with a season derived from its month number:
#   winter = Dec-Apr, spring = May, summer = Jun-Sep, fall = everything else
# NOTE(review): March/April count as winter and September as summer here --
# confirm this grouping is intended.

def _season_for(month):
    """Return the season label used by this notebook for a month number."""
    if month == 12 or 1 <= month <= 4:
        return "winter"
    if 4 < month <= 5:
        return "spring"
    if 6 <= month <= 9:
        return "summer"
    return "fall"


df['season'] = [_season_for(month) for month in df['month']]

In [None]:
# Confirm the new month and season columns were added
df.shape

In [None]:
# List the column names
df.columns

## 03. Create Plotly Chart

In [None]:
# Rank start stations by number of rows and keep the 20 busiest

# Helper column of ones: summing it per station counts that station's rows
df['value'] = 1
rows_by_station = df.groupby('start_station_name', as_index=False)
df_groupby_bar = rows_by_station['value'].sum()
top20 = df_groupby_bar.nlargest(n=20, columns='value')

In [None]:
# Bar chart of the top 20 stations, each bar shaded by its trip count

station_bars = go.Bar(
    x=top20['start_station_name'],
    y=top20['value'],
    marker=dict(color=top20['value'], colorscale='Blues'),
)
fig = go.Figure(data=station_bars)
fig.show()

In [None]:
# Give the bar chart a title, axis labels and a fixed size

bar_layout = dict(
    title='Top 20 Most Popular Bike Stations in New York',
    xaxis_title='Start Stations',
    yaxis_title='Sum of Trips',
    width=900,
    height=600,
)
fig.update_layout(**bar_layout)

In [None]:
# Save the top 20 stations as a CSV file in the working directory.
# No index=False is passed, so the DataFrame index is written as well.

top20.to_csv('newyork_top20.csv')

## 04. Dual-Axis Chart

In [None]:
# Ensure date is datetime and sort chronologically; df itself stays sorted
# for the cells that follow.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Plot one point per day instead of one per row. The frame appears to hold
# one row per trip (ride_id / started_at columns), so feeding it to plotly
# directly repeats every day's point once per trip and bloats the figure.
# NOTE(review): assumes bike_rides_daily and avgTemp carry the same value on
# every row of a given date -- confirm before relying on the first row.
daily = df.drop_duplicates(subset='date')

# Dual Axis: trips on the primary y-axis, temperature on the secondary one
fig2 = make_subplots(specs=[[{"secondary_y": True}]])

fig2.add_trace(
    go.Scatter(
        x=daily['date'],
        y=daily['bike_rides_daily'],
        name='Daily Trips',
        mode='lines',
        line_shape='spline'
    ),
    secondary_y=False
)

fig2.add_trace(
    go.Scatter(
        x=daily['date'],
        y=daily['avgTemp'],
        name='Daily Temperature',
        mode='lines',
        line_shape='spline'
    ),
    secondary_y=True
)

fig2.update_layout(
    title='Daily Trips vs Temperature',
    xaxis_title='Date',
    yaxis_title='Trips',
    yaxis2_title='Temperature (°F)',
    template='plotly_white'
)

fig2.show()

In [None]:
# Run a garbage-collection pass now, after building the large figures
import gc # Python's garbage-collector interface
gc.collect()

In [None]:
# List the columns to decide which ones to drop next
df.columns

In [None]:
# Build a slimmer frame: drop() returns a new DataFrame without these columns

unneeded_columns = [
    'ride_id',
    'started_at',
    'ended_at',
    'start_station_id',
    'end_station_id',
    'merge_flag',
    'started_at_dt',
]
df_1 = df.drop(columns=unneeded_columns)

In [None]:
# Verify which columns remain in the reduced frame
df_1.columns

## 05. Create a Random Split

In [None]:
# Seeded Boolean mask over df_1's rows: each entry is True when its uniform
# [0, 1) draw is <= 0.92, i.e. for roughly 92% of the rows
np.random.seed(32)
row_count = len(df_1)
red = np.random.random_sample(row_count) <= 0.92

In [None]:
# Keep only the rows where the mask is False (the remaining ~8% of df_1)
small = df_1[~red]

In [None]:
# Check how many rows ended up in the sample
small.shape

In [None]:
# Export the sampled rows; index=False leaves the DataFrame index out of the file
small.to_csv('reduced_ny_data_to_plot_7.csv',index = False)

In [None]:
# Export the full reduced frame.
# NOTE(review): unlike the sampled export, no index=False is passed, so the
# index is written as the first CSV column -- confirm readers expect that.
df_1.to_csv('reduced_ny_data_to_plot.csv')