In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

dataset_path = r"D:\Infosys SpringBoard Virtual Internship 6.0\US_Accidents_March23.csv"

try:
    accidents_df = pd.read_csv(dataset_path)
    print(f"Dataset loaded successfully with shape: {accidents_df.shape}")
except FileNotFoundError:
    print(f"Dataset file not found at the path: {dataset_path}")


Dataset loaded successfully with shape: (7728394, 46)


In [2]:
display(accidents_df.head())
print(f"\nDataset has {accidents_df.shape[0]} rows and {accidents_df.shape[1]} columns")


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day



Dataset has 7728394 rows and 46 columns


In [3]:
print("\nColumn Data Types:")
print(accidents_df.dtypes)

print("\nDataset Info:")
accidents_df.info()



Column Data Types:
ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition         object
Amenity                

In [4]:
missing_values = accidents_df.isnull().sum()
missing_percent = (missing_values / len(accidents_df)) * 100

missing_summary = pd.DataFrame({
    'Missing Values': missing_values,
    'Percent Missing': missing_percent
})

missing_summary = missing_summary[missing_summary['Missing Values'] > 0].sort_values(
    by='Missing Values', ascending=False)

print("\nColumns with Missing Data:")
display(missing_summary)



Columns with Missing Data:


Unnamed: 0,Missing Values,Percent Missing
End_Lat,3402762,44.029355
End_Lng,3402762,44.029355
Precipitation(in),2203586,28.512858
Wind_Chill(F),1999019,25.865904
Wind_Speed(mph),571233,7.391355
Visibility(mi),177098,2.291524
Wind_Direction,175206,2.267043
Humidity(%),174144,2.253301
Weather_Condition,173459,2.244438
Temperature(F),163853,2.120143


In [5]:
print("\nAccident Severity Distribution (counts):")
print(accidents_df['Severity'].value_counts().sort_index())

print("\nAccident Severity Distribution (percent):")
print((accidents_df['Severity'].value_counts(normalize=True) * 100).sort_index())



Accident Severity Distribution (counts):
Severity
1      67366
2    6156981
3    1299337
4     204710
Name: count, dtype: int64

Accident Severity Distribution (percent):
Severity
1     0.871669
2    79.667017
3    16.812510
4     2.648804
Name: proportion, dtype: float64


In [6]:
print("\nTop 20 Weather Conditions during accidents:")
print(accidents_df['Weather_Condition'].value_counts().head(20))



Top 20 Weather Conditions during accidents:
Weather_Condition
Fair                       2560802
Mostly Cloudy              1016195
Cloudy                      817082
Clear                       808743
Partly Cloudy               698972
Overcast                    382866
Light Rain                  352957
Scattered Clouds            204829
Light Snow                  128680
Fog                          99238
Rain                         84331
Haze                         76223
Fair / Windy                 35671
Heavy Rain                   32309
Light Drizzle                22684
Thunder in the Vicinity      17611
Cloudy / Windy               17035
T-Storm                      16810
Mostly Cloudy / Windy        16508
Snow                         15537
Name: count, dtype: int64


In [9]:
# Convert Start_Time to datetime
accidents_df['Start_Time'] = pd.to_datetime(accidents_df['Start_Time'], errors='coerce')

# How many failed to parse? (useful to know)
print("Rows with invalid Start_Time (NaT):", accidents_df['Start_Time'].isna().sum())

# Extract hour
accidents_df['Hour'] = accidents_df['Start_Time'].dt.hour

print("\nSample Start_Time and extracted Hour:")
display(accidents_df[['Start_Time', 'Hour']].head())


Rows with invalid Start_Time (NaT): 743166

Sample Start_Time and extracted Hour:


Unnamed: 0,Start_Time,Hour
0,2016-02-08 05:46:00,5.0
1,2016-02-08 06:07:59,6.0
2,2016-02-08 06:49:27,6.0
3,2016-02-08 07:23:34,7.0
4,2016-02-08 07:39:07,7.0


In [8]:
hourly_distribution = accidents_df['Hour'].value_counts().sort_index()
print("Accident Frequency by Hour of the Day:\n", hourly_distribution)


Accident Frequency by Hour of the Day:
 Hour
0.0      98452
1.0      85743
2.0      82394
3.0      74229
4.0     149077
5.0     209579
6.0     375179
7.0     546789
8.0     541643
9.0     334067
10.0    313625
11.0    322215
12.0    316904
13.0    352361
14.0    394697
15.0    463389
16.0    520177
17.0    516626
18.0    390621
19.0    267045
20.0    201883
21.0    169500
22.0    148605
23.0    110428
Name: count, dtype: int64
