In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
def correlation_matrices(los_angeles):
    la_numeric_cols = los_angeles.select_dtypes(include=[np.number]).columns

    # correlation matrix 
    corr_matrix = los_angeles[la_numeric_cols].corr()
    plt.figure(figsize=(12, 12))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
    plt.title("Correlation Matrix - Los Angeles")
    plt.show()

    # predictive values matrix 
    highly_correlated = ['AM_WAY_PHV','PM_WAY_PHV','AM_K_FACTOR_AMT','AM_D_FACTOR_AMT','AM_KD_FACTOR','PM_K_FACTOR_AMT','PM_D_FACTOR_AMT','PM_KD_FACTOR']
    corr_matrix = los_angeles[highly_correlated].corr()
    plt.figure(figsize=(12,12))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
    plt.title("Correlation Matrix - Los Angeles")
    plt.show()

In [14]:
def scatter_plots(los_angeles):
    # scatterplots
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=los_angeles, x='AM_HOUR', y='AM_WAY_PHV', alpha=0.5, edgecolor=None)
    plt.xlabel("Morning Hour (AM_HOUR)")
    plt.ylabel("Volume of Cars")
    plt.title("Traffic Volume by Morning Hour in Los Angeles")
    plt.grid(True)
    plt.show()

    # scatterplot with trend-line
    plt.figure(figsize=(10, 6))
    sns.regplot(data=los_angeles, x='AM_HOUR', y='AM_WAY_PHV', scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
    plt.xlabel("Morning Hour (AM_HOUR)")
    plt.ylabel("AM Peak Hour Volume (AM_WAY_PHV)")
    plt.title("Traffic Volume by Morning Hour in Los Angeles (with Trendline)")
    plt.grid(True)
    plt.show()

In [15]:
YEARS = [2016,2017,2018,2019,2020,2021,2022]

def create_peak_hours():
    all_peak_hours = []
    for year in YEARS:
        file_path = f'./data/peak-hours/{year}-peak-hours.xlsx'
        try:
            df = pd.read_excel(file_path, sheet_name=f'{year} Peak Hour Report')
            df['YEAR'] = year  # Add year column for reference
            # print(df.shape)
            all_peak_hours.append(df)
        except FileNotFoundError:
            print(f"File not found for year {year}, skipping...")

    # Concatenate all years into a single DataFrame
    peak_hours_df = pd.concat(all_peak_hours, ignore_index=True)

    day_mapping = {
        'MON': 0, 'TUE': 1, 'WED': 2, 
        'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6
    }

    month_mapping = {
        'JAN': 0, 'FEB': 1, 'MAR': 2, 
        'APR': 3, 'MAY': 4, 'JUN': 5, 'JUL': 6,
        'AUG': 7,'SEP': 8,'OCT': 9,'NOV': 10,'DEC': 11
    }

    peak_hours_df['AM_DAY'] = peak_hours_df['AM_DAY'].replace(day_mapping)   
    peak_hours_df['AM_MONTH'] = peak_hours_df['AM_MONTH'].replace(month_mapping)
    # peak_hours_df.rename()

    return peak_hours_df

In [16]:
peak_hours = create_peak_hours()
peak_hours.drop(columns=['RTE_SFX','PM_SFX','PM_PFX','PRE','CS'],inplace=True)
peak_hours[['AM_DAY','PM_DAY','AM_HOUR','PM_HOUR','AM_MONTH','PM_MONTH','YEAR']]
peak_hours['AM_MONTH'].unique()
peak_hours.columns

File not found for year 2017, skipping...
File not found for year 2018, skipping...


  peak_hours_df['AM_DAY'] = peak_hours_df['AM_DAY'].replace(day_mapping)
  peak_hours_df['AM_MONTH'] = peak_hours_df['AM_MONTH'].replace(month_mapping)


Index(['DI', 'RTE', 'CO', 'PM', 'LEG', 'YR', 'AM_DIR', 'AM_WAY_PHV',
       'AM_K_FACTOR_AMT', 'AM_D_FACTOR_AMT', 'AM_KD_FACTOR', 'AM_HOUR',
       'AM_DAY', 'AM_MONTH', 'PM_DIR', 'PM_WAY_PHV', 'PM_K_FACTOR_AMT',
       'PM_D_FACTOR_AMT', 'PM_KD_FACTOR', 'PM_HOUR', 'PM_DAY', 'PM_MONTH',
       'YEAR'],
      dtype='object')

In [17]:
""" STATIC """

COUNTIES = ['LA','ORA','SD','SB']

AM_TIME_BASED_FEATURES = ['AM_HOUR', 'AM_DAY', 'AM_MONTH','YEAR','PM']
PM_TIME_BASED_FEATURES = ['PM_HOUR', 'PM_DAY', 'PM_MONTH','YEAR','PM']

DIRECTIONS = ['N','E','S','W']

# MON - 0 , SUN - 6
DAYS = [0,1,2,3,4,5,6]

# JAN - 0 , DEC - 11
MONTHS = [0,1,2,3,4,5,6,7,8,9,10,11]
YEARS = [2016,2017,2018,2019,2020,2021,2022]

morning_results = []
afternoon_results = []

AM_TIME_BASED_FEATURES = ['AM_HOUR', 'AM_DAY', 'AM_MONTH','YEAR','PM']

# MORNING TIME
for year in YEARS:
    df_year = peak_hours[peak_hours['YEAR'] == year]

    for county in COUNTIES: 
        df_county = df_year[df_year['CO'] == county]  # Filter by county
        
        for direction in DIRECTIONS:
            df_dir = df_county[df_county['AM_DIR'] == direction]  # Filter by direction
            
            for month in MONTHS:
                df_month = df_dir[df_dir['AM_MONTH'] == month]

                for day in DAYS:
                    df_day = df_month[df_month['AM_DAY'] == day]  # Filter by day
                    
                    if df_day.shape[0] == 0:
                        continue  # Skip empty results
                    
                    for index, row in df_day.iterrows():
                        morning_results.append({
                            'Year': year, 
                            'Month': month, 
                            'County': county, 
                            'Direction': direction, 
                            'Hour': row['AM_HOUR'],
                            'Day': row['AM_DAY'], 
                            'PHV': row['AM_WAY_PHV']
                        })


# Convert results into a DataFrame for EDA
morning_eda_df = pd.DataFrame(morning_results)

In [18]:
morning_eda_df
morning_eda_df.columns

# AM_DIRECTION_FEATURES = ['AM_DIR','AM_WAY_PHV','AM_K_FACTOR_AMT','AM_D_FACTOR_AMT','AM_KD_FACTOR']
# PM_DIRECTION_FEATURES = ['PM_DIR','PM_WAY_PHV','PM_K_FACTOR_AMT','PM_D_FACTOR_AMT','PM_KD_FACTOR']

Index(['Year', 'Month', 'County', 'Direction', 'Hour', 'Day', 'PHV'], dtype='object')

In [21]:
# Make sure your DataFrame exists
df = morning_eda_df.copy()

# Create a datetime object (defaults hour to 0 if missing, but we have Hour!)
df['DateTime'] = pd.to_datetime(dict(year=df.Year, month=df.Month+1, day=df.Day, hour=df.Hour))

# Time-based features
df['Weekday'] = df['DateTime'].dt.weekday  # Monday=0, Sunday=6
df['IsWeekend'] = df['Weekday'] >= 5
df['Quarter'] = df['DateTime'].dt.quarter
df['IsRushHour'] = df['Hour'].between(7, 9)  # Customize this

ValueError: cannot assemble the datetimes: time data "20160200" doesn't match format "%Y%m%d", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [19]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR  # Regression SVM

# Copy data
df = morning_eda_df.copy()

# Encode categorical features
le_county = LabelEncoder()
le_direction = LabelEncoder()
df['County'] = le_county.fit_transform(df['County'])
df['Direction'] = le_direction.fit_transform(df['Direction'])

# Features and target
X = df.drop('PHV', axis=1)
y = df['PHV']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

svm = SVR(kernel='rbf')  # you can also try 'linear', 'poly'
svm.fit(X_train, y_train)

# Predict and evaluate
y_pred = svm.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))

MSE: 13092137.23839206
R^2 Score: 0.00259744068021428
