# In [None]:
# notebooks/EDA.ipynb

# Import required libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's "whitegrid" theme once so every figure below shares one look.
sns.set(style="whitegrid")

# Read the raw traffic CSV into the working DataFrame.
# The relative path presumably assumes the kernel's working directory is
# notebooks/ -- confirm before running from elsewhere.
data_path = "../data/raw/traffic_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Dataset overview: structure, sample rows, summary statistics, missing values.
#
# Every result is printed explicitly. A bare expression such as `df.head()` is
# only rendered by Jupyter when it is the LAST statement of a cell, and never
# by a plain script, so without print() the headings below would be followed
# by no output at all.
print("Dataset Info:")
df.info()  # info() writes its report to stdout itself and returns None

# Display first few rows
print("\nFirst 5 Rows of the Dataset:")
print(df.head())

# Summary statistics (describe() covers numeric columns by default)
print("\nSummary Statistics:")
print(df.describe())

# Per-column count of missing values, showing only columns that have any
print("\nMissing Values:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Basic Data Cleaning
# Fill missing 'traffic_volume' values with the column median (placeholder
# strategy). The result is assigned back to the column rather than calling
# fillna(inplace=True) on `df['traffic_volume']`: that form is chained
# assignment on an intermediate object, which pandas 2.x deprecates and which
# leaves `df` unchanged when Copy-on-Write is enabled.
if 'traffic_volume' in df.columns:
    df['traffic_volume'] = df['traffic_volume'].fillna(df['traffic_volume'].median())

# Exploratory Visualizations

# 1. Traffic Volume Distribution
# Histogram (30 bins) with a KDE overlay. Guarded like the cleaning step and
# the other plots, so a dataset without 'traffic_volume' skips the figure
# instead of aborting the notebook with a KeyError.
if 'traffic_volume' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df['traffic_volume'], bins=30, kde=True)
    plt.title("Traffic Volume Distribution")
    plt.xlabel("Traffic Volume")
    plt.ylabel("Frequency")
    plt.show()

# 2. Traffic Volume by Day of the Week
# Box plot of traffic volume per day. The plot needs BOTH columns, so both are
# checked; testing only 'day_of_week' would still fail inside seaborn when
# 'traffic_volume' is absent.
if {'day_of_week', 'traffic_volume'}.issubset(df.columns):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='day_of_week', y='traffic_volume', data=df)
    plt.title("Traffic Volume by Day of the Week")
    plt.xlabel("Day of the Week")
    plt.ylabel("Traffic Volume")
    plt.show()

# 3. Traffic Volume by Hour of Day
# Line plot of the mean traffic volume for each hour value.
if {'hour', 'traffic_volume'}.issubset(df.columns):
    # Aggregate only the column being plotted. Calling .mean() on the whole
    # grouped frame averages every column and raises on non-numeric ones
    # (e.g. text or date columns) in pandas 2.0+.
    hourly_mean = df.groupby('hour')['traffic_volume'].mean().reset_index()

    plt.figure(figsize=(12, 6))
    sns.lineplot(x='hour', y='traffic_volume', data=hourly_mean)
    plt.title("Average Traffic Volume by Hour of Day")
    plt.xlabel("Hour of Day")
    plt.ylabel("Average Traffic Volume")
    plt.show()

# 4. Correlation Heatmap
# Pairwise correlation of the numeric (and boolean) columns. The selection is
# explicit because DataFrame.corr() no longer drops non-numeric columns
# silently in pandas 2.0+ and raises when it meets e.g. a text column.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# With no numeric columns the matrix is empty and heatmap() cannot draw it.
if not correlation_matrix.empty:
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title("Correlation Heatmap")
    plt.show()

# Insights and Observations
# Based on the visualizations above, jot down any initial observations, for example:
# - Peak traffic hours during weekdays and weekends
# - Correlation between weather conditions (if available) and traffic volume
# - Other potential areas of interest for optimization

# Save cleaned data for modeling if needed
output_path = Path("../data/processed/traffic_data_cleaned.csv")
# to_csv() does not create missing directories, so make sure the target folder
# exists first (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# End of EDA
print("EDA Complete. Cleaned data saved to 'data/processed/traffic_data_cleaned.csv'")
