In [1]:
import pandas as pd
import streamlit as st
import plotly.express as px

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [40]:
ds=pd.read_csv("/content/drive/My Drive/Infosys Internship/US_Accidents_March23.csv")

In [10]:
def plt_accident_details(ds,location_col='State',sample_size=50000):
    counts=ds[location_col].value_counts().head(5)
    top5_locations=counts.index.tolist()
    ds['is_top5']=ds[location_col].isin(top5_locations)

    if len(ds)>sample_size:
        ds_sample=ds.sample(sample_size,random_state=42)
    else:
        ds_sample=ds

    fig=px.scatter(
        ds_sample,
        x='Start_Lng',
        y='Start_Lat',
        color='is_top5',
        labels={'Start_Lng':'Longitude','Start_Lat':'Latitude','is_top5':'Top5 Accidents-prone'},
        title=f'Accident Locations Highlighting Top 5 {location_col}s',
        opacity=0.6,
        hover_data=[location_col]
        )
    st.plotly_chart(fig)
st.title('Accident Data Visualization')

location_option=st.selectbox('Select location Type',['State','City'])

plt_accident_details(ds,location_col=location_option)



In [15]:
ds.dtypes

Unnamed: 0,0
ID,object
Source,object
Severity,int64
Start_Time,object
End_Time,object
Start_Lat,float64
Start_Lng,float64
End_Lat,float64
End_Lng,float64
Distance(mi),float64


In [42]:
# Hypothesis 1: What time of day has the most accidents?
ds['Start_Time'] = pd.to_datetime(ds['Start_Time'], errors="coerce")

# Extract hour from datetime
ds['Hour'] = ds['Start_Time'].dt.hour

# Count accidents by hour (this is what you want!)
hourly_accidents = ds['Hour'].value_counts().sort_index()

# Find the peak hour
peak_hour = hourly_accidents.idxmax()
peak_count = hourly_accidents.max()
print(f"\nPeak hour: {peak_hour}:00")
print(f"Number of accidents at peak: {peak_count:,}")


Peak hour: 7.0:00
Number of accidents at peak: 546,789


In [44]:
# Hypothesis 2: Are accidents more severe during rain or fog?


# Filter for rain and fog accidents
rain_accidents = ds[ds['Weather_Condition'].str.contains('rain|Rain|RAIN', na=False, case=False)]
fog_accidents = ds[ds['Weather_Condition'].str.contains('fog|Fog|FOG|haze|Haze', na=False, case=False)]

print(f"🌧️  Rain-related accidents: {len(rain_accidents):,}")
print(f"🌫️  Fog-related accidents: {len(fog_accidents):,}")

# Calculate average severity
if len(rain_accidents) > 0 and len(fog_accidents) > 0:
    rain_severity = rain_accidents['Severity'].mean()
    fog_severity = fog_accidents['Severity'].mean()

    print(f"\n📊 AVERAGE SEVERITY:")
    print(f"Rain accidents: {rain_severity:.2f}")
    print(f"Fog accidents: {fog_severity:.2f}")

    # Compare and conclude
    print(f"\n🎯 CONCLUSION:")
    if rain_severity > fog_severity:
        print(f"✅ Accidents are MORE SEVERE during RAIN")
        difference = rain_severity - fog_severity
        print(f"   (Difference: +{difference:.2f} severity points)")
    elif fog_severity > rain_severity:
        print(f"✅ Accidents are MORE SEVERE during FOG")
        difference = fog_severity - rain_severity
        print(f"   (Difference: +{difference:.2f} severity points)")
    else:
        print("ℹ️  Accidents are equally severe during rain and fog")
else:
    print("❌ Not enough data for comparison")

🌧️  Rain-related accidents: 509,086
🌫️  Fog-related accidents: 186,474

📊 AVERAGE SEVERITY:
Rain accidents: 2.25
Fog accidents: 2.18

🎯 CONCLUSION:
✅ Accidents are MORE SEVERE during RAIN
   (Difference: +0.08 severity points)


In [45]:
# Hypothesis 3: Is there a correlation between visibility and severity?

# Check if the required columns exist and have data
print("Checking data availability:")
print(f"Visibility column present: {'Visibility(mi)' in ds.columns}")
print(f"Severity column present: {'Severity' in ds.columns}")

if 'Visibility(mi)' in ds.columns and 'Severity' in ds.columns:
    # Clean the data - remove missing values and outliers
    visibility_data = ds[['Visibility(mi)', 'Severity']].copy()

    print(f"\nOriginal data points: {len(visibility_data):,}")

    # Remove rows with missing values
    visibility_data = visibility_data.dropna()
    print(f"After removing missing values: {len(visibility_data):,}")

    # Remove unrealistic visibility values (negative or extremely high)
    visibility_data = visibility_data[
        (visibility_data['Visibility(mi)'] >= 0) &
        (visibility_data['Visibility(mi)'] <= 50)  # Reasonable visibility range
    ]
    print(f"After removing outliers: {len(visibility_data):,}")

    # Calculate correlation coefficient
    correlation = visibility_data['Visibility(mi)'].corr(visibility_data['Severity'])

    print(f"\n📊 CORRELATION COEFFICIENT: {correlation:.4f}")

    # Interpret the correlation strength
    print(f"\n💪 CORRELATION STRENGTH:")
    if abs(correlation) < 0.1:
        strength = "Very Weak"
    elif abs(correlation) < 0.3:
        strength = "Weak"
    elif abs(correlation) < 0.5:
        strength = "Moderate"
    elif abs(correlation) < 0.7:
        strength = "Strong"
    else:
        strength = "Very Strong"

    print(f"Strength: {strength}")

    # Interpret the direction
    print(f"\n🧭 CORRELATION DIRECTION:")
    if correlation > 0:
        direction = "Positive"
        interpretation = "Higher visibility → Higher severity"
    elif correlation < 0:
        direction = "Negative"
        interpretation = "Lower visibility → Higher severity"
    else:
        direction = "No correlation"
        interpretation = "No relationship"

    print(f"Direction: {direction}")
    print(f"Meaning: {interpretation}")

    # Final conclusion
    print(f"\n🎯 CONCLUSION:")
    if abs(correlation) > 0.1:  # Meaningful correlation
        if correlation < 0:
            print("✅ YES: There is a correlation - Lower visibility is associated with HIGHER severity accidents")
        else:
            print("✅ YES: There is a correlation - Higher visibility is associated with HIGHER severity accidents")
    else:
        print("❌ NO: There is no meaningful correlation between visibility and accident severity")

else:
    print("❌ Required columns not found in dataset")

Checking data availability:
Visibility column present: True
Severity column present: True

Original data points: 7,728,394
After removing missing values: 7,551,296
After removing outliers: 7,550,267

📊 CORRELATION COEFFICIENT: -0.0043

💪 CORRELATION STRENGTH:
Strength: Very Weak

🧭 CORRELATION DIRECTION:
Direction: Negative
Meaning: Lower visibility → Higher severity

🎯 CONCLUSION:
❌ NO: There is no meaningful correlation between visibility and accident severity
