### Assignment-02

1. What is the distribution of accident severity levels?

In [None]:
import os

import numpy as np
import pandas as pd
import plotly.express as px

# --- Setup: Load Dataset Once ---
# NOTE: Point DATA_PATH at your copy of 'US_Accidents_March23.csv', either by
# editing the default below or by setting the US_ACCIDENTS_CSV environment
# variable before starting the kernel.
DATA_PATH = os.environ.get(
    "US_ACCIDENTS_CSV",
    "C:/Users/win10/Desktop/US_Accidents_March23.csv",
)

try:
    df = pd.read_csv(DATA_PATH)
    loaded_from_file = True
except FileNotFoundError:
    # Fall back to a tiny hand-made frame so the remaining cells can still run.
    print("Error: Dataset file not found. Using a small sample DataFrame for demonstration.")
    loaded_from_file = False
    data = {'Severity': [1, 2, 2, 3, 4, 2, 1, 2],
            'Start_Time': pd.to_datetime(['2023-01-01 08:00:00', '2023-01-01 17:00:00', '2023-01-02 01:00:00', '2023-01-02 12:00:00', '2023-01-03 07:30:00', '2023-01-04 08:30:00', '2023-01-05 18:00:00', '2023-01-06 17:30:00']),
            'Weather_Condition': ['Clear', 'Rain', 'Clear', 'Clear', 'Clear', 'Heavy Rain', 'Clear', 'Clear'],
            'Sunrise_Sunset': ['Day', 'Day', 'Night', 'Day', 'Day', 'Day', 'Day', 'Day'],
            'Roundabout': [False, True, False, False, False, True, False, False]}
    df = pd.DataFrame(data)

# Derive the time features once, for whichever frame was produced above.
# pd.to_datetime leaves an already-datetime column unchanged, so the sample
# frame passes through this step untouched.
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['Hour'] = df['Start_Time'].dt.hour
df['DayOfWeek'] = df['Start_Time'].dt.dayofweek  # 0 = Monday ... 6 = Sunday

if loaded_from_file:
    print("Dataset loaded and time features extracted.")

# --- Question 1 Code ---
# Tally accidents per severity level, ordered by level rather than by frequency.
severity_counts = df['Severity'].value_counts().sort_index()
print("Severity Counts:", severity_counts, sep="\n")

# Visualization: one bar per severity level, each level coloured separately.
severity_levels = severity_counts.index
fig_severity = px.bar(
    x=severity_levels,
    y=severity_counts.values,
    color=severity_levels.astype(str),
    color_discrete_sequence=px.colors.qualitative.Bold,
    labels={"x": "Accident Severity", "y": "Count"},
    title="Accident Severity Distribution",
)
# Place a tick at every integer step along the x axis.
fig_severity.update_layout(xaxis={"tickmode": "linear", "dtick": 1})
fig_severity.show()

2. How is the accident frequency distributed by hour of the day?

In [None]:
# --- Question 2 Code ---
print("\n### 2. Accident Frequency by Hour of the Day")

# Accidents per value of 'Hour', in ascending hour order.
hourly_counts = df['Hour'].value_counts().sort_index()
print("Hourly Accident Counts:", hourly_counts, sep="\n")

# Visualization: line chart of the hourly counts.
fig_hourly = px.line(
    x=hourly_counts.index,
    y=hourly_counts.values,
    labels={"x": "Hour of Day", "y": "Number of Accidents"},
    title="Accident Frequency by Hour of Day",
)
# Place a tick at every integer step along the x axis.
fig_hourly.update_layout(xaxis={"tickmode": "linear", "dtick": 1})
fig_hourly.show()

3. Which days of the week have the highest number of accidents?

In [None]:
# --- Question 3 Code ---
print("\n### 3. Accident Frequency by Day of the Week")

# Day names, indexed by the integer codes stored in 'DayOfWeek'.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Count per weekday code in ascending code order, then swap each code for
# its day name so the printed table is readable.
dow_code_counts = df['DayOfWeek'].value_counts().sort_index()
day_of_week_counts = dow_code_counts.set_axis(
    [days[code] for code in dow_code_counts.index]
)
print("Accident counts by Day of Week:", day_of_week_counts, sep="\n")

# idxmax returns the label (day name) of the largest count.
most_accident_day = day_of_week_counts.idxmax()
print(f"\nThe day with the highest number of accidents is: **{most_accident_day}**")

4. What are the most common weather conditions during accidents?

In [None]:
# --- Question 4 Code ---
print("\n### 4. Most Common Weather Conditions During Accidents")
if 'Weather_Condition' not in df.columns:
    # Nothing to summarise without the column; report it and move on.
    print("Weather_Condition column not found.")
else:
    # The five most frequent weather labels.
    weather_counts = df['Weather_Condition'].value_counts().nlargest(5)
    print("Top 5 Weather Conditions:", weather_counts, sep="\n")

    # Visualization: share of each of the top five conditions.
    fig_weather = px.pie(
        values=weather_counts.values,
        names=weather_counts.index,
        title="Top 5 Weather Conditions During Accidents",
    )
    fig_weather.show()

5. How can we identify columns with missing data and the percentage of values missing in each?

In [None]:
# --- Question 5 Code ---
print("\n### 5. Identifying Columns with Missing Data and Percentage")

# Null cells per column as a percentage of the row count, to two decimals.
missing_percent = df.isna().sum().div(len(df)).mul(100).round(2)

# Keep only the columns that actually have gaps, largest share first.
has_gaps = missing_percent.gt(0)
missing_info = missing_percent.loc[has_gaps].sort_values(ascending=False)

print("Columns with missing data and their percentage (only showing > 0%):", missing_info, sep="\n")

columns_with_missing = missing_info.index.tolist()
print(f"\nTotal columns with missing data: {len(columns_with_missing)}")

6. How to impute missing numerical values with the median?

In [None]:
# --- Question 6 Code ---
print("\n### 6. Imputing Missing Numerical Values with Median")

# Numeric columns that currently contain at least one missing value.
# The numeric selection is evaluated once; only the small per-column boolean
# mask is kept, not a second copy of the numeric data.
numeric_has_missing = df.select_dtypes(include=np.number).isnull().any()
num_cols_to_impute = numeric_has_missing.index[numeric_has_missing].tolist()
imputed_cols_6 = []

for col in num_cols_to_impute:
    median_val = df[col].median()  # NaN values are skipped by default
    if pd.isna(median_val):
        # The column has no observed values, so there is no median to fill
        # with; leave it alone rather than report it as imputed.
        continue
    df[col] = df[col].fillna(median_val)
    imputed_cols_6.append(col)

print(f"Numerical imputation complete. Columns imputed: {imputed_cols_6}")

# Example check. The fallback sample frame has no 'Temperature(F)' column,
# so confirm the column exists before indexing it.
if 'Temperature(F)' in df.columns:
    print(f"Check: Missing count in 'Temperature(F)': {df['Temperature(F)'].isnull().sum()}")
else:
    print("Temperature(F) column not found.")

7. How to impute missing categorical values with the mode?

In [None]:
# --- Question 7 Code ---
print("\n### 7. Imputing Missing Categorical Values with Mode")

# Categorical (object/string) columns that currently contain at least one
# missing value. The selection is evaluated once; only the per-column boolean
# mask is kept.
categorical_has_missing = df.select_dtypes(include=['object']).isnull().any()
cat_cols_to_impute = categorical_has_missing.index[categorical_has_missing].tolist()
imputed_cols_7 = []

for col in cat_cols_to_impute:
    modes = df[col].mode()  # missing values are excluded by default
    if modes.empty:
        # Every value is missing, so there is no mode to fill with; skip the
        # column instead of failing on an empty result.
        continue
    # Take the first mode in case of a tie.
    mode_value = modes.iloc[0]
    df[col] = df[col].fillna(mode_value)
    imputed_cols_7.append(col)

print(f"Categorical imputation complete. Columns imputed: {imputed_cols_7}")

# Example check, guarded the same way Question 4 guards this column.
if 'Weather_Condition' in df.columns:
    print(f"Check: Missing count in 'Weather_Condition': {df['Weather_Condition'].isnull().sum()}")
else:
    print("Weather_Condition column not found.")

8. How to label encode a categorical column?

In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Question 8 Code ---
print("\n### 8. Label Encoding a Categorical Column")

# Example 1: plain pandas. Compare against "Day" and store the result as 0/1.
if "Sunrise_Sunset" in df.columns:
    is_day_mask = df["Sunrise_Sunset"].eq("Day")
    df["IsDay"] = is_day_mask.astype(int)
    print("1. Pandas Method: Created 'IsDay' column (0 for Night, 1 for Day) from 'Sunrise_Sunset'.")
    print("Sample of 'Sunrise_Sunset' and 'IsDay':")
    print(df[['Sunrise_Sunset', 'IsDay']].head())

# Example 2: scikit-learn's LabelEncoder (suited to features with many categories).
# The weekday is used purely as a demonstration column.
column_to_label_encode = 'DayOfWeek_Name'
# Build the string version first; `days` is the weekday-name list defined in
# the Question 3 cell, so that cell must have been run.
df[column_to_label_encode] = df['DayOfWeek'].map(lambda code: days[code])

le = LabelEncoder()
df['DayOfWeek_Encoded'] = le.fit_transform(df[column_to_label_encode])

class_names = list(le.classes_)
class_codes = list(range(len(class_names)))
encoded_sample = df[[column_to_label_encode, 'DayOfWeek_Encoded']].head()

print(f"\n2. Scikit-learn Method: Applied LabelEncoder to '{column_to_label_encode}'.")
print("Mapping (a sample):", class_names, "->", class_codes)
print("Sample of Original and Encoded:")
print(encoded_sample)

9. How to one-hot encode a categorical column?

In [None]:
# --- Question 9 Code ---
print("\n### 9. One-Hot Encoding a Categorical Column")

# Encode 'DayOfWeek_Name', the string column created in the Question 8 cell.
column_to_onehot_encode = 'DayOfWeek_Name'

# get_dummies emits one indicator column per distinct value, prefixed 'DOW'.
df_encoded_day = pd.get_dummies(df[column_to_onehot_encode], prefix='DOW', dummy_na=False)

# For display only: the first rows of the source column next to the first
# rows of its indicator columns.
source_head = df[[column_to_onehot_encode]].head()
dummies_head = df_encoded_day.head()
df_display = pd.concat([source_head, dummies_head], axis="columns")

print("One-Hot Encoded DataFrame (sample of first 5 rows):")
print(df_display)

new_columns = list(df_encoded_day.columns)
print(f"\nCreated {len(new_columns)} new columns, e.g., {new_columns[:3]}...")