<a href="https://colab.research.google.com/github/DineshY1011/US_Accident/blob/main/Milestone_2/Week_4/Day_20/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [None]:
import pandas as pd
import numpy as np

Load Dataset

In [None]:
# Path of the sampled US Accidents CSV.
# NOTE(review): the path points under /content/drive, but no cell in this
# notebook mounts Drive — confirm it is mounted before running.
file_path = "/content/drive/MyDrive/US_Dataset/US_Accidents_sampled_1M.csv"

# low_memory=False makes pandas process the file as a whole instead of in
# internal chunks, so each column gets a single inferred dtype.
df = pd.read_csv(
    file_path,
    low_memory=False,
)
print(f"Dataset loaded successfully with shape: {df.shape}")

Dataset loaded successfully with shape: (1000000, 46)


Drop Unnecessary Columns

In [None]:
# Columns treated as irrelevant to severity prediction and therefore removed.
drop_cols = [
    "ID",
    "Source",
    "End_Lat",
    "End_Lng",
    "Description",
    "Street",
    "Country",
    "Timezone",
    "Airport_Code",
    "Weather_Timestamp",
    "Nautical_Twilight",
    "Astronomical_Twilight",
]

# errors="ignore" skips any listed column that is not present in the frame,
# so the cell stays safe to run against a file with a different column set.
df = df.drop(columns=drop_cols, errors="ignore")
print(f"Dropped unnecessary columns. Remaining shape: {df.shape}")

Dropped unnecessary columns. Remaining shape: (1000000, 34)


Remove Duplicate Rows

In [None]:
# Count rows on either side of the de-duplication so the number of fully
# identical rows that were dropped can be reported.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Removed {before - after} duplicate rows.")

Removed 2111 duplicate rows.


Handle Missing Values

In [None]:
# The target cannot be imputed, so rows without a Severity label are dropped.
if "Severity" in df.columns:
    df = df.dropna(subset=["Severity"])

# Numeric gaps are filled with each column's own median.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical/boolean gaps are filled with each column's most frequent value.
cat_cols = df.select_dtypes(include=["object", "bool"]).columns
for col in cat_cols:
    # Nothing to fill: skip the (comparatively expensive) mode computation.
    if not df[col].isna().any():
        continue
    # Compute the mode once per column; it is empty only when every value in
    # the column is missing, in which case the "Unknown" placeholder is used.
    modes = df[col].mode()
    fill_value = modes.iloc[0] if not modes.empty else "Unknown"
    df[col] = df[col].fillna(fill_value)

print("Missing values handled successfully.")

Missing values handled successfully.


Fix Data Types

In [None]:
# Convert boolean-like columns to proper bool type
bool_cols = [
    "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit",
    "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming",
    "Traffic_Signal", "Turning_Loop"
]

for col in bool_cols:
    if col in df.columns:
        # NOTE(review): astype(bool) applies Python truthiness, so this assumes
        # the values are already True/False objects; a string such as "False"
        # would become True — confirm against the loaded data.
        df[col] = df[col].astype(bool)


def _parse_datetime_column(raw):
    """Parse a column of timestamps, tolerating more than one string format.

    A single ``pd.to_datetime(..., errors="coerce")`` call can infer one format
    from the first value and turn every string that does not match it into
    NaT. To avoid silently losing those values, the ones that failed are
    parsed again on their own (so the format is inferred from them instead),
    repeating until a pass recovers nothing new.

    Parameters
    ----------
    raw : pandas.Series
        Column holding timestamp strings (or already-parsed datetimes).

    Returns
    -------
    pandas.Series
        Datetime column with NaT where a value was missing or unparseable.
    """
    parsed = pd.to_datetime(raw, errors="coerce")
    while True:
        # Values that were present in the input but did not parse.
        pending = parsed.isna() & raw.notna()
        if not pending.any():
            break
        retry = pd.to_datetime(raw[pending], errors="coerce")
        if retry.isna().all():
            # No progress: whatever is left is genuinely unparseable.
            break
        # Positional assignment: `retry` has exactly one entry per pending row.
        parsed.loc[pending] = retry.to_numpy()
    return parsed


# Convert date/time columns to datetime
datetime_cols = ["Start_Time", "End_Time"]
for col in datetime_cols:
    if col in df.columns:
        raw_values = df[col]
        df[col] = _parse_datetime_column(raw_values)
        # Surface any value that was present but could not be parsed instead
        # of letting it turn into NaT unnoticed.
        unparsed = int((df[col].isna() & raw_values.notna()).sum())
        if unparsed:
            print(f"Warning: {unparsed} non-null values in {col} could not be parsed and are now NaT.")

print("Data types fixed successfully.")

Data types fixed successfully.


Handle Outliers

In [None]:
# Range checks applied to each column that is present. A row is kept only if
# it passes every check; rows whose value is missing fail the comparison and
# are dropped as well.
# NOTE(review): the distance rule also removes rows with a distance of exactly
# 0 — confirm that zero is really an invalid value for this dataset, because
# these filters account for most of the rows lost between loading and saving.
range_filters = {
    # Remove unrealistic negative or zero distances
    "Distance(mi)": lambda values: values > 0,
    # Remove temperature and humidity values outside plausible bounds
    "Temperature(F)": lambda values: (values > -50) & (values < 130),
    "Humidity(%)": lambda values: (values >= 0) & (values <= 100),
}

for col, is_valid in range_filters.items():
    if col not in df.columns:
        continue
    rows_before = len(df)
    df = df[is_valid(df[col])]
    # Report the cost of each filter so large data losses are visible.
    print(f"{col}: removed {rows_before - len(df)} rows outside the accepted range.")

# Detach the result from the frame it was sliced from, so later cells can
# assign columns on `df` without triggering SettingWithCopyWarning.
df = df.copy()

print("Outliers handled successfully.")

Outliers handled successfully.


Standardize Text Columns

In [None]:
# Convert all text columns to a consistent format: surrounding whitespace is
# stripped and values are title-cased, except for code-like columns, which are
# upper-cased because title-casing would mangle them (e.g. "CA" -> "Ca").
# NOTE(review): assumes "State" holds upper-case abbreviations — confirm, and
# add any other code-like column (e.g. direction abbreviations) to this list.
uppercase_cols = ["State"]

text_cols = df.select_dtypes(include="object").columns
for col in text_cols:
    stripped = df[col].astype(str).str.strip()
    if col in uppercase_cols:
        cleaned = stripped.str.upper()
    else:
        cleaned = stripped.str.title()
    # astype(str) turns missing values into the literal text "nan"; restore
    # them as missing so they are not mistaken for a real category.
    df[col] = cleaned.where(df[col].notna())

Save Cleaned Dataset

In [None]:
# Write the cleaned frame to a CSV (relative path, so it lands in the current
# working directory) without the index column, then report the final size.
output_path = "cleaned_us_accidents.csv"
df.to_csv(
    output_path,
    index=False,
)
print(f"Cleaned dataset saved successfully as: {output_path}")
print(f"Final cleaned shape: {df.shape}")

Cleaned dataset saved successfully as: cleaned_us_accidents.csv
Final cleaned shape: (570562, 34)
