## Dataset Reduction

In [None]:
import os
import pandas as pd

# Build the reduced dataset only once: if the filtered CSV is already on disk,
# report that and leave it untouched.
if os.path.exists('filtered_data.csv'):
    print("filtered_data.csv already exists, skipping file creation.")
else:
    # Read the full accident file
    df_huge = pd.read_csv('Accident_Information.csv', low_memory=False)

    # Keep only the rows whose Year lies in 2014-2017 (both ends inclusive)
    year_in_range = df_huge['Year'].between(2014, 2017)
    df = df_huge.loc[year_in_range].copy()

    # Write the reduced frame to a new CSV without the index column
    df.to_csv('filtered_data.csv', index=False)


## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

#above, we import all necessary libraries

In [None]:
# Load the reduced dataset from filtered_data.csv
df = pd.read_csv('filtered_data.csv', low_memory=False)

# A filtered dataset with about 550k rows is used here because the original
# had 2,000,000+ rows and consumed far too much memory and time to work with.

# Display the first few rows as a preview of the data
df.head()

## 2. Explore the Dataset

In [None]:
def _print_divider():
    """Print a blank line, a rule made of underscores, and another blank line."""
    print("")
    print("______________________________________________________________")
    print("")


# Brief summary of the dataset (df.info() prints its report directly)
df.info()
_print_divider()

# Number of null values in each column
print(df.isnull().sum())
_print_divider()

# All column labels present in the dataset
print(df.columns)


## 3. Preprocess the Data

In [None]:
# Label-encode 'Day_of_Week' and 'Road_Type', storing the codes as int32 for consistency.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so if 'Day_of_Week'
# holds day names the codes are alphabetical rather than chronological -- inspect
# le_day.classes_ before reading the codes as positions in the week.
le_day = LabelEncoder()
le_road = LabelEncoder()

df['Day_of_Week'] = le_day.fit_transform(df['Day_of_Week']).astype(np.int32)
df['Road_Type'] = le_road.fit_transform(df['Road_Type']).astype(np.int32)

# Features used for clustering
features = ['Day_of_Week', 'Number_of_Casualties']

# Report missing values in the selected features before they are filled
print(df[features].isnull().sum())

# Build the feature matrix once: missing values are replaced with 0
# (fillna returns a new frame, so df itself is not modified here).
X = df[features].fillna(0)

# Standardise the features; the fitted scaler is kept so the transformation
# can be inspected or inverted later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Cluster labels are deliberately not assigned in this cell: K-Means is fitted in
# section 4, after the elbow plot has been used to choose the number of clusters.
# Fitting it here as well would repeat the same fit (same data, same random_state).

In [None]:
# Sanity check of the prepared feature frame: its shape, its descriptive
# statistics, and the per-column null counts, printed in that order.
for summary in (X.shape, X.describe(), X.isnull().sum()):
    print(summary)

## 4. Apply K-Means Clustering

In [None]:
# Elbow method for choosing k: fit K-Means for k = 2..9 on the scaled features and
# record each model's inertia, to see how cluster compactness changes as k grows.
k_values = range(2, 10)
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

# Inertia against number of clusters
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# The "elbow" appears at 3 clusters, where the decrease in inertia starts to slow down.

In [None]:
# Final model: K-Means with 3 clusters, fitted on the scaled features
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)

# Store each row's cluster label on the DataFrame
df['Cluster'] = kmeans.labels_

# Number of rows assigned to each cluster (shown as the cell's output)
df['Cluster'].value_counts()

## 5. Visualize Clusters

In [None]:
# Grouped bar chart of the clusters: the first feature (day of week) on the x-axis,
# the second feature (casualties) as bar height, and one bar colour per cluster.
barplot_options = dict(
    data=df,
    x=features[0],
    y=features[1],
    hue='Cluster',
    palette='viridis',
    errorbar=None,  # no error bars drawn
)
ax = sns.barplot(**barplot_options)

# Label the chart through the Axes object returned by seaborn
ax.set_title(f'Average {features[1]} by {features[0]} and Cluster')
ax.set_xlabel(features[0])
ax.set_ylabel(f'Average {features[1]}')
ax.legend(title='Cluster')
ax.figure.tight_layout()
plt.show()


In [None]:
# Count the rows in every (Day_of_Week, Cluster) combination
print(df.groupby(['Day_of_Week', 'Cluster']).size())

# The clustering algorithm naturally separated the data into
# patterns, where Cluster 2 dominates the early part of the week,
# Cluster 0 dominates the second half, and Cluster 1 appears consistently
# but scattered across all days.
# NOTE(review): 'Day_of_Week' holds LabelEncoder integer codes at this point, so
# "early" and "second half" refer to code order -- confirm against le_day.classes_
# that this matches calendar order before relying on the interpretation.

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto 2 principal components for visualization.
# X_scaled is built from the two columns listed in `features`, so no dimension is
# dropped here: the same data is re-expressed along the principal axes.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame for plotting. The labels are attached positionally (row i of X_pca
# corresponds to row i of df), so the result does not depend on df's index labels.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['Cluster'] = df['Cluster'].to_numpy()

# Sample for faster plotting: draw at most MAX_PLOT_POINTS rows, with a fixed seed so
# the figure is reproducible. Frames smaller than the cap are plotted in full.
MAX_PLOT_POINTS = 20_000
pca_df_sample = pca_df.sample(n=min(len(pca_df), MAX_PLOT_POINTS), random_state=42)


# Scatter plot of clusters in PCA space
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df_sample, palette='viridis')
plt.title('Clusters Visualized Using PCA Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster', loc='upper right')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Reuse the 3-cluster K-Means model fitted in section 4 (`kmeans`) instead of fitting
# an identical model again: same data and random_state would give the same labels,
# and reusing it keeps the colours here consistent with df['Cluster'].

# Plot every point in PCA space, coloured by its cluster label
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, cmap='viridis', s=10, alpha=0.5, label='Data')

# Plot centroids: the cluster centres live in scaled-feature space, so they are passed
# through the same fitted PCA transform as the data before being drawn.
centroids = kmeans.cluster_centers_
centroids_2d = pca.transform(centroids)
plt.scatter(centroids_2d[:, 0], centroids_2d[:, 1], c='red', s=100, marker='X', label='Centroids')

plt.title("K-Means Clusters with Centroids (PCA 2D)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()
