In [5]:
"""
Assignment-02: US RoadSafe Analytics
Finalized Code with Error Corrections
"""

# Install necessary libraries if not already installed
# !pip install pandas plotly scikit-learn matplotlib seaborn scipy

import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency, pearsonr
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Data Loading ---
# Row budget shared by the CSV read and by the synthetic fallback below.
sample_size = 40000

try:
    # Only the first `sample_size` rows of the CSV are read.
    df = pd.read_csv("C:/Users/win10/Desktop/US_Accidents_March23.csv", nrows=sample_size)
except FileNotFoundError:
    print("Error: 'US_Accidents_March23.csv' not found. Using dummy data for demonstration.")
    # Fallback: build a random frame exposing the columns the analysis reads,
    # so the rest of the script still runs without the CSV.
    binary_flag_columns = [
        'Bump', 'Crossing', 'Junction', 'Traffic_Signal', 'Give_Way', 'No_Exit',
        'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Turning_Loop',
    ]
    data = {
        # Integer severities drawn from 1..4 (upper bound of randint is exclusive).
        'Severity': np.random.randint(1, 5, sample_size),
        'Weather_Condition': np.random.choice(['Clear', 'Rain', 'Fog', 'Snow'], sample_size),
        # Random second offsets within one 365-day span starting 2023-01-01.
        'Start_Time': pd.to_datetime(
            pd.to_datetime('2023-01-01')
            + pd.to_timedelta(np.random.randint(0, 86400 * 365, sample_size), unit='s')
        ),
        'Hour': np.random.randint(0, 24, sample_size),
        'Temperature(F)': np.random.uniform(-20, 120, sample_size),
        'Visibility(mi)': np.random.uniform(0, 100, sample_size),
        # One independent 0/1 column per road feature, in the listed order.
        **{flag: np.random.choice([0, 1], sample_size) for flag in binary_flag_columns},
    }
    df = pd.DataFrame(data)
    data_source = "Dummy"
    print("Using dummy data for demonstration.")
else:
    print("Data loaded successfully.")
    data_source = "Real"

# --- 1. Formulate Analytical Questions & Hypotheses (No Change) ---
# ... (Hypotheses remain the same) ...

# --- 2. Data Filtering and Grouping ---

print("\n--- Data Filtering and Grouping ---")

# --- For Question 1: Effect of Weather Conditions ---
# Mean severity per weather condition, keeping the ten highest means.
weather_groups = (
    df.groupby("Weather_Condition")["Severity"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
print("Average Severity by Top 10 Weather Conditions:\n", weather_groups)

# Severity samples for the two conditions compared by the t-test
# (exact label matches only; missing severities are dropped).
clear_severity = df.loc[df["Weather_Condition"] == "Clear", "Severity"].dropna()
rain_severity = df.loc[df["Weather_Condition"] == "Rain", "Severity"].dropna()

# --- For Question 2: Accident Frequency by Hour of Day ---
# Make sure an 'Hour' column exists before grouping on it.
if 'Start_Time' in df.columns and data_source == "Real":
    # Derive the hour from the real timestamps.
    # format='ISO8601' (pandas >= 2.0) lets timestamps with and without
    # fractional seconds parse together. Without an explicit format, pandas
    # infers one layout from the first value, and errors='coerce' then turns
    # every row written differently into NaT, silently removing those
    # accidents from the hourly counts below.
    df['Hour'] = pd.to_datetime(
        df['Start_Time'], format='ISO8601', errors='coerce'
    ).dt.hour
elif 'Hour' not in df.columns and data_source == "Dummy":
    # Safety net for the dummy frame: fabricate hours if none were generated.
    df['Hour'] = np.random.randint(0, 24, len(df))

# Number of accidents per hour of day; reused by the hourly line chart.
hourly_counts = df.groupby('Hour').size()

# NOTE(review): Series.between is inclusive on both ends, so these windows
# cover hours 7, 8, 9 and 0, 1, 2, 3 respectively (three vs. four hours).
rush_hours_accidents = df[df['Hour'].between(7, 9)].shape[0]
night_hours_accidents = df[df['Hour'].between(0, 3)].shape[0]
print(f"\nAccidents 7-9am: {rush_hours_accidents}, 12-3am: {night_hours_accidents}")

# --- For Question 3: Correlation Between Temperature and Accident Severity ---
# Keep only rows where BOTH values are present so the two series passed to
# the correlation test stay aligned row-for-row.
temp_severity_df = df.loc[:, ["Temperature(F)", "Severity"]].dropna(how="any")
temp_data_corr, severity_data_corr = (
    temp_severity_df["Temperature(F)"],
    temp_severity_df["Severity"],
)

# --- For Question 4: Impact of Low Visibility ---
# The last bin edge is the observed maximum when it exceeds 20 mi; otherwise
# a fixed 100 is used so the edges stay strictly increasing.
max_visibility = df["Visibility(mi)"].max()
top_edge = max_visibility if max_visibility > 20 else 100
visibility_bins = [0, 1, 2, 5, 10, 20, top_edge]
visibility_labels = ["<1mi", "1-2mi", "2-5mi", "5-10mi", "10-20mi", ">20mi"]

# include_lowest=True puts a visibility of exactly 0 into the first bin.
df['Visibility_Range'] = pd.cut(
    df["Visibility(mi)"],
    bins=visibility_bins,
    labels=visibility_labels,
    include_lowest=True,
)

# "Low visibility" = the two lowest ranges (up to and including 2 miles).
df['Low_Visibility'] = df['Visibility_Range'].isin(visibility_labels[:2])
print("\nLow Visibility counts:\n", df['Low_Visibility'].value_counts())

# Accidents per visibility range, ordered by range; reused by the bar chart.
visibility_counts = df['Visibility_Range'].value_counts().sort_index()

# --- For Question 5: Effect of Road Features ---
# Candidate road-feature columns for the per-feature tests and chart.
road_features = [
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop',
]
# Keep (in the order above) only the candidates that exist in the frame.
available_columns = set(df.columns)
existing_features = [name for name in road_features if name in available_columns]

# --- 3. Statistical Hypothesis Testing ---

print("\n--- Statistical Hypothesis Testing Results ---")

# --- Question 1: Effect of Weather Conditions on Accident Severity (t-test) ---
# Two-sample t-test on severity, 'Clear' vs 'Rain', at the 0.05 level.
print("\n--- Question 1: Effect of Weather Conditions on Accident Severity ---")
if min(len(clear_severity), len(rain_severity)) <= 1:
    # At least two observations are needed in each group.
    print("Insufficient data for t-test between 'Clear' and 'Rain'.")
else:
    p_weather = ttest_ind(clear_severity, rain_severity, nan_policy='omit').pvalue
    print("Null Hypothesis (H0): Mean severity is the same for Clear and Rain conditions.")
    print(f"P-value: {p_weather:.4f}")
    # A NaN p-value compares False here and lands on "Fail to Reject".
    weather_is_significant = p_weather < 0.05
    print(
        f"Conclusion: Reject H0. Significant difference found (p={p_weather:.4f}). Weather impacts severity."
        if weather_is_significant
        else f"Conclusion: Fail to Reject H0. No significant difference (p={p_weather:.4f})."
    )

# --- Question 2: Accident Frequency by Hour of Day (Direct Comparison) ---
# No statistical test here: the two raw window counts are simply compared.
print("\n--- Question 2: Accident Frequency by Hour of Day ---")
print(f"Accidents 7-9am: {rush_hours_accidents}, 12-3am: {night_hours_accidents}")
print(
    "Conclusion: More accidents during morning rush hours (supports H1 qualitatively)."
    if rush_hours_accidents > night_hours_accidents
    else "Conclusion: More accidents not observed during morning rush hours (does not support H1 qualitatively)."
)

# --- Question 3: Correlation Between Temperature and Accident Severity (Pearson Correlation) ---
# Pearson correlation on the NaN-synchronised temperature/severity pairs.
print("\n--- Question 3: Correlation Between Temperature and Accident Severity ---")
if min(len(temp_data_corr), len(severity_data_corr)) <= 1:
    # Fewer than two paired observations: the correlation cannot be computed.
    print("Insufficient synchronized data for Pearson correlation between Temperature and Severity.")
else:
    corr, p_corr = pearsonr(temp_data_corr, severity_data_corr)
    print("Null Hypothesis (H0): There is no linear correlation between temperature and accident severity.")
    print(f"Pearson correlation: {corr:.3f} (p={p_corr:.4e})")
    if p_corr < 0.05:
        # |r| below 0.3 is reported as a weak relationship.
        strength = 'Weak' if abs(corr) < 0.3 else 'Moderate/Strong'
        print(f"Conclusion: Reject H0. Significant correlation found (p={p_corr:.4e}). {strength} relationship.")
    else:
        print(f"Conclusion: Fail to Reject H0. No significant correlation (p={p_corr:.4e}).")

# --- Question 4: Impact of Low Visibility on Accident Frequency/Severity (Chi-square test) ---
# Chi-square test of independence between the low-visibility flag and severity.
print("\n--- Question 4: Impact of Low Visibility on Accident Frequency/Severity ---")
# 'Low_Visibility' is False for rows whose visibility reading is missing,
# because a NaN range never matches the low-visibility labels. Counting those
# rows as "not low visibility" would bias the table, so only rows with a
# known visibility take part in the test.
visibility_known = df["Visibility(mi)"].notna()
contingency_table = pd.crosstab(
    df.loc[visibility_known, 'Low_Visibility'],
    df.loc[visibility_known, 'Severity'],
)
# The test needs a non-empty table with at least two rows and two columns.
if contingency_table.sum().sum() > 0 and min(contingency_table.shape) > 1:
    _, p_vis, _, _ = chi2_contingency(contingency_table)
    print("Null Hypothesis (H0): Low visibility is independent of accident severity.")
    print(f"P-value: {p_vis:.4f}")
    if p_vis < 0.05:
        print(f"Conclusion: Reject H0. Significant association between low visibility and accident severity (p={p_vis:.4f}).")
    else:
        print(f"Conclusion: Fail to Reject H0. No significant association found (p={p_vis:.4f}).")
else:
    print("Insufficient data for Chi-square test on Low Visibility and Severity.")

# --- Question 5: Effect of Road Features on Accident Severity (t-test for each feature) ---
# One two-sample t-test per available road feature: severity where the
# feature equals 1 versus severity where it equals 0.
print("\n--- Question 5: Effect of Road Features on Accident Severity ---")
if not existing_features:
    print("No road feature columns found in data for this insight.")

for feat in existing_features:
    with_feat = df.loc[df[feat] == 1, "Severity"].dropna()
    without_feat = df.loc[df[feat] == 0, "Severity"].dropna()

    # Require more than ten observations on each side before testing.
    if len(with_feat) <= 10 or len(without_feat) <= 10:
        print(f"\nFeature: {feat} - Insufficient data to test hypothesis.")
        continue

    p_feat = ttest_ind(with_feat, without_feat, nan_policy='omit').pvalue
    print(f"\nFeature: {feat}")
    print(f"Null Hypothesis (H0): Mean severity is the same with and without {feat}.")
    print(f"P-value: {p_feat:.4f}")
    # A NaN p-value compares False here and lands on "Fail to Reject".
    print(
        f"Conclusion: Reject H0. Road feature significantly affects severity (p={p_feat:.4f})."
        if p_feat < 0.05
        else f"Conclusion: Fail to Reject H0. No significant impact on severity (p={p_feat:.4f})."
    )

# --- 4. Visualize Results ---

print("\n--- Visualizations (Plotly figures are generated but not displayed in this environment) ---")

# --- Visualization for Question 1: Effect of Weather Conditions ---
# Bar chart of the grouped mean severities, one bar per weather condition.
weather_axis_labels = {"x": "Weather Condition", "y": "Average Severity"}
fig_weather_severity = px.bar(
    weather_groups,
    x=weather_groups.index,
    y=weather_groups.values,
    title="Average Accident Severity by Top 10 Weather Conditions",
    labels=weather_axis_labels,
)
# The figure is only built here; call fig_weather_severity.show() to render it.
print("Generated fig_weather_severity.")

# --- Visualization for Question 2: Accident Frequency by Hour of Day ---
# Line chart of accident counts per hour; skipped when there are no counts.
if hourly_counts.size > 0:
    hourly_axis_labels = {"x": "Hour of Day", "y": "Number of Accidents"}
    fig_hourly_counts = px.line(
        hourly_counts,
        x=hourly_counts.index,
        y=hourly_counts.values,
        title="Accident Frequency by Hour of Day",
        labels=hourly_axis_labels,
    )
    # The figure is only built here; call fig_hourly_counts.show() to render it.
    print("Generated fig_hourly_counts.")

# --- Visualization for Question 3: Correlation Between Temperature and Accident Severity ---
# Scatter of temperature against severity, restricted to rows where both
# values are present; points are coloured by severity.
has_temp_and_severity = df["Temperature(F)"].notna() & df["Severity"].notna()
fig_temp_severity = px.scatter(
    df[has_temp_and_severity],
    x="Temperature(F)",
    y="Severity",
    color="Severity",
    title="Scatterplot of Temperature vs Severity",
    labels={"Temperature(F)": "Temperature (F)", "Severity": "Accident Severity"},
)
# The figure is only built here; call fig_temp_severity.show() to render it.
print("Generated fig_temp_severity.")

# --- Visualization for Question 4: Impact of Low Visibility ---
# Bar chart of accident counts per visibility range; skipped when the range
# column is absent or no counts are available.
if 'Visibility_Range' not in df.columns or visibility_counts.empty:
    print("Visibility_Range column not found or is empty for visualization.")
else:
    fig_visibility = px.bar(
        visibility_counts,
        # Range labels are passed as plain strings.
        x=visibility_counts.index.astype(str),
        y=visibility_counts.values,
        title="Accident Counts by Visibility Range",
        labels={"x": "Visibility Range (miles)", "y": "Number of Accidents"},
    )
    # The figure is only built here; call fig_visibility.show() to render it.
    print("Generated fig_visibility.")

# --- Visualization for Question 5: Effect of Road Features ---
# Bar chart of the five road features present in the most accident rows.
if not existing_features:
    print("No road feature columns found for visualization.")
else:
    feature_counts = {}
    for feat in existing_features:
        column = df[feat]
        # Boolean columns are used as the mask directly; any other dtype is
        # matched against 1/True first.
        flagged = column if column.dtype == 'bool' else column.isin([1, True])
        feature_counts[feat] = int(flagged.sum())

    feature_series = (
        pd.Series(feature_counts)
        .sort_values(ascending=False)
        .nlargest(5)
    )

    fig_features = px.bar(
        feature_series,
        x=feature_series.index,
        y=feature_series.values,
        labels={"x": "Road Feature", "y": "Accident Count"},
        title="Top 5 Road Surface / Feature Conditions",
        # Bars are coloured by their own count on a continuous scale.
        color=feature_series.values,
        color_continuous_scale='Turbo',
    )
    # The figure is only built here; call fig_features.show() to render it.
    print("Generated fig_features.")

print("\n--- Code Execution Complete ---")

Data loaded successfully.

--- Data Filtering and Grouping ---
Average Severity by Top 10 Weather Conditions:
 Weather_Condition
Light Freezing Rain             3.000000
Light Freezing Fog              3.000000
Shallow Fog                     2.750000
Snow                            2.714286
Light Drizzle                   2.666667
Light Thunderstorms and Rain    2.666667
Thunderstorms and Rain          2.500000
Heavy Rain                      2.500000
Partly Cloudy                   2.476863
Light Rain                      2.442181
Name: Severity, dtype: float64

Accidents 7-9am: 4529, 12-3am: 2805

Low Visibility counts:
 Low_Visibility
False    39042
True       958
Name: count, dtype: int64

--- Statistical Hypothesis Testing Results ---

--- Question 1: Effect of Weather Conditions on Accident Severity ---
Null Hypothesis (H0): Mean severity is the same for Clear and Rain conditions.
P-value: 0.1248
Conclusion: Fail to Reject H0. No significant difference (p=0.1248).

--- Question 