In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
def _show_corr_heatmap(frame, title):
    """Draw an annotated correlation heatmap for the columns of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to correlate; ``frame.corr()`` is what gets plotted.
    title : str
        Title placed above the heatmap.
    """
    _, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(frame.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
    ax.set_title(title)
    plt.show()


def correlation_matrices(los_angeles):
    """Show two correlation heatmaps for the given DataFrame.

    The first figure covers every numeric column. The second is restricted
    to the hand-picked peak-hour volume and K/D factor columns listed in
    ``highly_correlated``.

    Parameters
    ----------
    los_angeles : pandas.DataFrame
        Must contain every column named in ``highly_correlated``; a missing
        column raises ``KeyError`` from the column selection.

    Returns
    -------
    None
        The figures are displayed via ``plt.show()``.
    """
    # correlation matrix over all numeric columns
    numeric_frame = los_angeles.select_dtypes(include=[np.number])
    _show_corr_heatmap(numeric_frame, "Correlation Matrix - Los Angeles")

    # predictive values matrix
    highly_correlated = ['AM_WAY_PHV','PM_WAY_PHV','AM_K_FACTOR_AMT','AM_D_FACTOR_AMT','AM_KD_FACTOR','PM_K_FACTOR_AMT','PM_D_FACTOR_AMT','PM_KD_FACTOR']
    # Distinct title: both figures previously shared the same heading and
    # could not be told apart once rendered.
    _show_corr_heatmap(
        los_angeles[highly_correlated],
        "Correlation Matrix - Los Angeles (Highly Correlated Features)",
    )

In [37]:
def scatter_plots(los_angeles):
    """Plot AM peak-hour volume against the AM hour, twice.

    The first figure is a plain scatter plot; the second repeats the same
    x/y pair through ``sns.regplot`` so a red trend line is overlaid.

    Parameters
    ----------
    los_angeles : pandas.DataFrame
        Must provide the ``AM_HOUR`` and ``AM_WAY_PHV`` columns.
    """
    # Both figures plot the same pair of columns from the same frame.
    xy = dict(data=los_angeles, x='AM_HOUR', y='AM_WAY_PHV')

    # scatterplots
    _, scatter_ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(alpha=0.5, edgecolor=None, ax=scatter_ax, **xy)
    scatter_ax.set_xlabel("Morning Hour (AM_HOUR)")
    scatter_ax.set_ylabel("Volume of Cars")
    scatter_ax.set_title("Traffic Volume by Morning Hour in Los Angeles")
    scatter_ax.grid(True)
    plt.show()

    # scatterplot with trend-line
    _, trend_ax = plt.subplots(figsize=(10, 6))
    sns.regplot(
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
        ax=trend_ax,
        **xy,
    )
    trend_ax.set_xlabel("Morning Hour (AM_HOUR)")
    trend_ax.set_ylabel("AM Peak Hour Volume (AM_WAY_PHV)")
    trend_ax.set_title("Traffic Volume by Morning Hour in Los Angeles (with Trendline)")
    trend_ax.grid(True)
    plt.show()

In [None]:
YEARS = [2016,2017,2018,2019,2020,2021,2022]


def _encode_labels(column, codes):
    """Return ``column`` with known labels swapped for their integer codes.

    Values that have no entry in ``codes`` (including missing values) are
    passed through unchanged, which is what the dict-based
    ``Series.replace`` call used previously did for them.
    """
    return column.map(lambda value: codes.get(value, value))


def create_peak_hours(years=None, data_dir='./data/peak-hours'):
    """Load the yearly peak-hour workbooks into one DataFrame.

    For each year, the sheet ``'<year> Peak Hour Report'`` of
    ``<data_dir>/<year>-peak-hours.xlsx`` is read and tagged with a ``YEAR``
    column. Years whose file is missing are reported on stdout and skipped.
    Day and month abbreviations are then converted to integer codes
    (MON = 0 ... SUN = 6, JAN = 0 ... DEC = 11).

    Parameters
    ----------
    years : iterable of int, optional
        Years to load. Defaults to the module-level ``YEARS``.
    data_dir : str, optional
        Directory holding the workbooks. Defaults to ``'./data/peak-hours'``.

    Returns
    -------
    pandas.DataFrame
        All loaded years concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If none of the requested workbooks could be found, so there is
        nothing to concatenate.
    KeyError
        If the loaded data has no ``AM_DAY`` or ``AM_MONTH`` column.
    """
    if years is None:
        years = YEARS

    yearly_frames = []
    for year in years:
        file_path = f'{data_dir}/{year}-peak-hours.xlsx'
        try:
            df = pd.read_excel(file_path, sheet_name=f'{year} Peak Hour Report')
        except FileNotFoundError:
            print(f"File not found for year {year}, skipping...")
            continue
        df['YEAR'] = year  # Add year column for reference
        yearly_frames.append(df)

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with a message that names the actual problem instead.
    if not yearly_frames:
        raise ValueError(
            f"No peak-hour workbooks found in {data_dir!r} for years {list(years)}"
        )

    # Concatenate all years into a single DataFrame
    peak_hours_df = pd.concat(yearly_frames, ignore_index=True)

    day_mapping = {
        'MON': 0, 'TUE': 1, 'WED': 2,
        'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6
    }

    month_mapping = {
        'JAN': 0, 'FEB': 1, 'MAR': 2,
        'APR': 3, 'MAY': 4, 'JUN': 5, 'JUL': 6,
        'AUG': 7,'SEP': 8,'OCT': 9,'NOV': 10,'DEC': 11
    }

    peak_hours_df['AM_DAY'] = _encode_labels(peak_hours_df['AM_DAY'], day_mapping)
    peak_hours_df['AM_MONTH'] = _encode_labels(peak_hours_df['AM_MONTH'], month_mapping)

    # Encode the PM counterparts the same way so both halves of the report
    # use one numeric convention; skipped when a column is absent.
    for column, codes in (('PM_DAY', day_mapping), ('PM_MONTH', month_mapping)):
        if column in peak_hours_df.columns:
            peak_hours_df[column] = _encode_labels(peak_hours_df[column], codes)

    return peak_hours_df

In [45]:
# Load every available year, then discard the columns this analysis does not use.
peak_hours = create_peak_hours().drop(
    columns=['RTE_SFX', 'PM_SFX', 'PM_PFX', 'PRE', 'CS']
)

# Quick inspection expressions. Only the last expression of a cell is
# rendered, so the two lines below are evaluated but not displayed.
peak_hours[['AM_DAY','PM_DAY','AM_HOUR','PM_HOUR','AM_MONTH','PM_MONTH','YEAR']]
peak_hours['AM_MONTH'].unique()

# Rendered output of the cell: the remaining column names.
peak_hours.columns

  peak_hours_df['AM_DAY'] = peak_hours_df['AM_DAY'].replace(day_mapping)
  peak_hours_df['AM_MONTH'] = peak_hours_df['AM_MONTH'].replace(month_mapping)


Index(['DI', 'RTE', 'CO', 'PM', 'LEG', 'YR', 'AM_DIR', 'AM_WAY_PHV',
       'AM_K_FACTOR_AMT', 'AM_D_FACTOR_AMT', 'AM_KD_FACTOR', 'AM_HOUR',
       'AM_DAY', 'AM_MONTH', 'PM_DIR', 'PM_WAY_PHV', 'PM_K_FACTOR_AMT',
       'PM_D_FACTOR_AMT', 'PM_KD_FACTOR', 'PM_HOUR', 'PM_DAY', 'PM_MONTH',
       'YEAR'],
      dtype='object')

In [41]:
""" STATIC """

COUNTIES = ['LA','ORA','SD','SB']

AM_TIME_BASED_FEATURES = ['AM_HOUR', 'AM_DAY', 'AM_MONTH','YEAR','PM']
PM_TIME_BASED_FEATURES = ['PM_HOUR', 'PM_DAY', 'PM_MONTH','YEAR','PM']

DIRECTIONS = ['N','E','S','W']

# MON - 0 , SUN - 6
DAYS = [0,1,2,3,4,5,6]

# JAN - 0 , DEC - 11
MONTHS = [0,1,2,3,4,5,6,7,8,9,10,11]
YEARS = [2016,2017,2018,2019,2020,2021,2022]


def _mean_phv_records(frame, prefix, years, counties, directions, months, days):
    """Average ``<prefix>_WAY_PHV`` per (year, county, direction, month, day).

    Does in one ``groupby`` what a five-level nested loop of boolean filters
    would do. Only rows whose key values all appear in the supplied lists
    are considered, and combinations with no rows are omitted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs ``YEAR``, ``CO`` and the ``<prefix>_DIR``, ``<prefix>_MONTH``,
        ``<prefix>_DAY`` and ``<prefix>_WAY_PHV`` columns.
    prefix : str
        Column prefix selecting the period, e.g. ``'AM'``.
    years, counties, directions, months, days : list
        Allowed values per key. Their list order defines the order of the
        returned records (years outermost, days innermost).

    Returns
    -------
    list of dict
        Records with keys ``Year``, ``Month``, ``County``, ``Direction``,
        ``Day`` and ``Mean_PHV`` (mean rounded to 2 decimals).
    """
    # (output label, source column, allowed values) from outermost to innermost key
    levels = [
        ('Year', 'YEAR', years),
        ('County', 'CO', counties),
        ('Direction', f'{prefix}_DIR', directions),
        ('Month', f'{prefix}_MONTH', months),
        ('Day', f'{prefix}_DAY', days),
    ]
    labels = [label for label, _, _ in levels]

    # Replace each key value by its position in the allowed list. Values
    # outside the list become NaN and are dropped by groupby, and sorting on
    # positions reproduces the list order rather than alphabetical order.
    ranks = pd.DataFrame(index=frame.index)
    for label, source, allowed in levels:
        positions = {value: rank for rank, value in enumerate(allowed)}
        ranks[label] = frame[source].map(positions)
    ranks['Mean_PHV'] = frame[f'{prefix}_WAY_PHV']

    group_means = ranks.groupby(labels, sort=True)['Mean_PHV'].mean().round(2)

    allowed_by_label = {label: allowed for label, _, allowed in levels}
    records = []
    for key, mean_phv in group_means.items():
        # Translate positions back to the original key values.
        picked = {
            label: allowed_by_label[label][int(rank)]
            for label, rank in zip(labels, key)
        }
        records.append({'Year': picked['Year'], 'Month': picked['Month'], 'County': picked['County'], 'Direction': picked['Direction'], 'Day': picked['Day'], 'Mean_PHV': mean_phv})
    return records


# MORNING TIME
morning_results = _mean_phv_records(
    peak_hours, 'AM', YEARS, COUNTIES, DIRECTIONS, MONTHS, DAYS
)
# Declared for a PM counterpart; nothing in this cell fills it.
afternoon_results = []

# Convert results into a DataFrame for EDA
morning_eda_df = pd.DataFrame(morning_results)


# Pivot table for better visualization
# pivot_eda = morning_eda_df.pivot(index='Day', columns=['Year', 'Month','County', 'Direction'], values='Mean_PHV')
# pivot_eda

Year,2016,2016,2016,2016,2016,2016,2016,2016,2016,2016,...,2022,2022,2022,2022,2022,2022,2022,2022,2022,2022
Month,1,2,3,4,5,6,7,9,10,11,...,6,8,9,10,1,6,7,8,9,10
County,LA,LA,LA,LA,LA,LA,LA,LA,LA,LA,...,SB,SB,SB,SB,SB,SB,SB,SB,SB,SB
Direction,N,N,N,N,N,N,N,N,N,N,...,S,S,S,S,W,W,W,W,W,W
Day,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
0,2106.0,,4530.0,3036.5,,,,,,,...,,,,,,,,,,
1,4262.0,,,2646.0,,9007.0,,6118.0,3150.0,,...,,293.0,1630.0,1285.5,662.0,,,,,
2,,7932.0,7651.0,,8656.5,,7583.0,,7788.0,,...,,,,2936.0,,,,552.0,,1032.0
3,,9802.0,,,,,11041.0,1388.5,,,...,,,5416.0,924.0,,,,,,
4,,,,,,,,,2407.0,,...,,,,,,,,,,
5,,,,,,,1440.0,,8571.0,1664.0,...,,,,,,966.0,524.0,,,
6,,,8711.0,,269.0,,,,,,...,1316.0,,,,,,,,1352.0,


In [44]:
# Display the morning summary built in the aggregation cell: one row per
# Year / Month / County / Direction / Day group with its Mean_PHV.
morning_eda_df

# Candidate per-direction feature lists, currently disabled and not
# referenced by any code visible in this notebook section.
# AM_DIRECTION_FEATURES = ['AM_DIR','AM_WAY_PHV','AM_K_FACTOR_AMT','AM_D_FACTOR_AMT','AM_KD_FACTOR']
# PM_DIRECTION_FEATURES = ['PM_DIR','PM_WAY_PHV','PM_K_FACTOR_AMT','PM_D_FACTOR_AMT','PM_KD_FACTOR']

Unnamed: 0,Year,Month,County,Direction,Day,Mean_PHV
0,2016,1,LA,N,0,2106.0
1,2016,1,LA,N,1,4262.0
2,2016,2,LA,N,2,7932.0
3,2016,2,LA,N,3,9802.0
4,2016,3,LA,N,0,4530.0
...,...,...,...,...,...,...
1455,2022,6,SB,W,5,966.0
1456,2022,7,SB,W,5,524.0
1457,2022,8,SB,W,2,552.0
1458,2022,9,SB,W,6,1352.0
