In [66]:
# Import dependencies
import os
import pandas as pd
import numpy
import psycopg2
from sqlalchemy import create_engine
from config import username, password
import time

In [34]:
# Read the raw accidents CSV into a DataFrame and preview the first five rows
file_path = "resources/US_Accidents_June20.csv"
accident_df = pd.read_csv(filepath_or_buffer=file_path)
accident_df.head(5)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [35]:
# Count the missing values of every column once, then report them one per line
null_counts = accident_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column ID has 0 null values
Column Source has 0 null values
Column TMC has 1034799 null values
Column Severity has 0 null values
Column Start_Time has 0 null values
Column End_Time has 0 null values
Column Start_Lat has 0 null values
Column Start_Lng has 0 null values
Column End_Lat has 2478818 null values
Column End_Lng has 2478818 null values
Column Distance(mi) has 0 null values
Column Description has 1 null values
Column Number has 2262864 null values
Column Street has 0 null values
Column Side has 0 null values
Column City has 112 null values
Column County has 0 null values
Column State has 0 null values
Column Zipcode has 1069 null values
Column Country has 0 null values
Column Timezone has 3880 null values
Column Airport_Code has 6758 null values
Column Weather_Timestamp has 43323 null values
Column Temperature(F) has 65732 null values
Column Wind_Chill(F) has 1868249 null values
Column Humidity(%) has 69687 null values
Column Pressure(in) has 55882 null values
Column Visibility

In [36]:
# Discard rows that are missing any of these fields
required_columns = ['Zipcode', 'Temperature(F)', 'Weather_Condition']
accident_dropna_df = accident_df.dropna(subset=required_columns)

In [37]:
# Columns removed from the working frame
columns_to_drop = [
    'ID', 'Source', 'TMC', 'End_Time', 'Airport_Code', 'End_Lat', 'End_Lng',
    'Distance(mi)', 'Wind_Chill(F)', 'Wind_Direction', 'Humidity(%)',
    'Pressure(in)', 'Description', 'Number', 'Street', 'Side', 'City',
    'County', 'Timezone', 'Weather_Timestamp', 'Sunrise_Sunset',
    'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]
new_accident_df = accident_dropna_df.drop(columns=columns_to_drop)
new_accident_df.head()

Unnamed: 0,Severity,Start_Time,Start_Lat,Start_Lng,State,Zipcode,Country,Temperature(F),Visibility(mi),Wind_Speed(mph),...,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,3,2016-02-08 05:46:00,39.865147,-84.058723,OH,45424,US,36.9,10.0,,...,False,False,False,False,False,False,False,False,False,False
1,2,2016-02-08 06:07:59,39.928059,-82.831184,OH,43068-3402,US,37.9,10.0,,...,False,False,False,False,False,False,False,False,False,False
2,2,2016-02-08 06:49:27,39.063148,-84.032608,OH,45176,US,36.0,10.0,3.5,...,False,False,False,False,False,False,False,False,True,False
3,3,2016-02-08 07:23:34,39.747753,-84.205582,OH,45417,US,35.1,9.0,4.6,...,False,False,False,False,False,False,False,False,False,False
4,2,2016-02-08 07:39:07,39.627781,-84.188354,OH,45459,US,36.0,6.0,3.5,...,False,False,False,False,False,False,False,False,True,False


In [38]:
# Show the dtype of each remaining column
new_accident_df.dtypes

Severity               int64
Start_Time            object
Start_Lat            float64
Start_Lng            float64
State                 object
Zipcode               object
Country               object
Temperature(F)       float64
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Amenity                 bool
Bump                    bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Roundabout              bool
Station                 bool
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Turning_Loop            bool
dtype: object

In [39]:
# Parse Start_Time into datetime values; entries that cannot be parsed become NaT
start_time_raw = new_accident_df['Start_Time']
new_accident_df['Start_Time'] = pd.to_datetime(start_time_raw, errors='coerce')

In [40]:
# Derive calendar fields from Start_Time:
#   Year         - four-digit year
#   Month        - full month name (e.g. "February")
#   Time         - 12-hour clock time with AM/PM (e.g. "05:46 AM")
#   Part_of_Week - abbreviated day name (e.g. "Mon")
start_dt = new_accident_df['Start_Time'].dt
new_accident_df['Year'] = start_dt.year
new_accident_df['Month'] = start_dt.strftime('%B')
new_accident_df['Time'] = start_dt.strftime("%I:%M %p")
new_accident_df['Part_of_Week'] = start_dt.strftime('%a')

In [41]:
# Replace every remaining missing value, in all columns, with 0
# NOTE(review): after this step a missing reading is indistinguishable from a true 0 — confirm that is intended.
new_accident_df = new_accident_df.fillna(value=0)

In [42]:
# Collapse the abbreviated day names into two categories: Weekday / Weekend.
# A single whole-value replacement driven by a mapping replaces seven
# copy-pasted substring replacements. Values that are not keys of the mapping
# (including already-converted "Weekday"/"Weekend") are left untouched, so the
# cell is safe to re-run.
day_to_part_of_week = {
    'Mon': 'Weekday',
    'Tue': 'Weekday',
    'Wed': 'Weekday',
    'Thu': 'Weekday',
    'Fri': 'Weekday',
    'Sat': 'Weekend',
    'Sun': 'Weekend',
}
new_accident_df['Part_of_Week'] = new_accident_df['Part_of_Week'].replace(day_to_part_of_week)

In [43]:
# Display the working frame (pandas truncates the rendering of large frames)
new_accident_df

Unnamed: 0,Severity,Start_Time,Start_Lat,Start_Lng,State,Zipcode,Country,Temperature(F),Visibility(mi),Wind_Speed(mph),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Year,Month,Time,Part_of_Week
0,3,2016-02-08 05:46:00,39.865147,-84.058723,OH,45424,US,36.9,10.0,0.0,...,False,False,False,False,False,False,2016,February,05:46 AM,Weekday
1,2,2016-02-08 06:07:59,39.928059,-82.831184,OH,43068-3402,US,37.9,10.0,0.0,...,False,False,False,False,False,False,2016,February,06:07 AM,Weekday
2,2,2016-02-08 06:49:27,39.063148,-84.032608,OH,45176,US,36.0,10.0,3.5,...,False,False,False,False,True,False,2016,February,06:49 AM,Weekday
3,3,2016-02-08 07:23:34,39.747753,-84.205582,OH,45417,US,35.1,9.0,4.6,...,False,False,False,False,False,False,2016,February,07:23 AM,Weekday
4,2,2016-02-08 07:39:07,39.627781,-84.188354,OH,45459,US,36.0,6.0,3.5,...,False,False,False,False,True,False,2016,February,07:39 AM,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,2,2019-08-23 18:03:25,34.002480,-117.379360,CA,92501,US,86.0,10.0,13.0,...,False,False,False,False,False,False,2019,August,06:03 PM,Weekday
3513613,2,2019-08-23 19:11:30,32.766960,-117.148060,CA,92108,US,70.0,10.0,6.0,...,False,False,False,False,False,False,2019,August,07:11 PM,Weekday
3513614,2,2019-08-23 19:00:21,33.775450,-117.847790,CA,92866,US,73.0,10.0,10.0,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday
3513615,2,2019-08-23 19:00:21,33.992460,-118.403020,CA,90230,US,71.0,10.0,8.0,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday


In [46]:
# Build the cleaned frame without the Start_Time, Amenity and Station columns
cleaned_df = new_accident_df.drop(columns=['Start_Time', 'Amenity', 'Station'])

In [47]:
# Display the cleaned frame
cleaned_df

Unnamed: 0,Severity,Start_Lat,Start_Lng,State,Zipcode,Country,Temperature(F),Visibility(mi),Wind_Speed(mph),Precipitation(in),...,Railway,Roundabout,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Year,Month,Time,Part_of_Week
0,3,39.865147,-84.058723,OH,45424,US,36.9,10.0,0.0,0.02,...,False,False,False,False,False,False,2016,February,05:46 AM,Weekday
1,2,39.928059,-82.831184,OH,43068-3402,US,37.9,10.0,0.0,0.00,...,False,False,False,False,False,False,2016,February,06:07 AM,Weekday
2,2,39.063148,-84.032608,OH,45176,US,36.0,10.0,3.5,0.00,...,False,False,False,False,True,False,2016,February,06:49 AM,Weekday
3,3,39.747753,-84.205582,OH,45417,US,35.1,9.0,4.6,0.00,...,False,False,False,False,False,False,2016,February,07:23 AM,Weekday
4,2,39.627781,-84.188354,OH,45459,US,36.0,6.0,3.5,0.00,...,False,False,False,False,True,False,2016,February,07:39 AM,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,2,34.002480,-117.379360,CA,92501,US,86.0,10.0,13.0,0.00,...,False,False,False,False,False,False,2019,August,06:03 PM,Weekday
3513613,2,32.766960,-117.148060,CA,92108,US,70.0,10.0,6.0,0.00,...,False,False,False,False,False,False,2019,August,07:11 PM,Weekday
3513614,2,33.775450,-117.847790,CA,92866,US,73.0,10.0,10.0,0.00,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday
3513615,2,33.992460,-118.403020,CA,90230,US,71.0,10.0,8.0,0.00,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday


In [53]:
# Give the columns their final display names
# NOTE(review): 'Traffic_Calming' is renamed to plain 'Traffic' — confirm that label is intended.
readable_names = {
    'Start_Lat': 'Latitude',
    'Start_Lng': 'Longitude',
    'Wind_Speed(mph)': 'Wind Speed(mph)',
    'Weather_Condition': 'Weather Condition',
    'Bump': 'Speed Bump',
    'Crossing': 'Cross Walk',
    'Give_Way': 'Yield Sign',
    'Junction': 'Intersection',
    'No_Exit': 'No Exit',
    'Stop': 'Stop Sign',
    'Traffic_Calming': 'Traffic',
    'Traffic_Signal': 'Traffic Light',
    'Turning_Loop': 'Turning Loop',
    'Part_of_Week': 'Part of Week',
}
cleaned_df = cleaned_df.rename(columns=readable_names)

In [54]:
# Display the frame again to check the renamed columns
cleaned_df

Unnamed: 0,Severity,Latitude,Longitude,State,Zipcode,Country,Temperature(F),Visibility(mi),Wind Speed(mph),Precipitation(in),...,Railway,Roundabout,Stop Sign,Traffic,Traffic Light,Turning Loop,Year,Month,Time,Part of Week
0,3,39.865147,-84.058723,OH,45424,US,36.9,10.0,0.0,0.02,...,False,False,False,False,False,False,2016,February,05:46 AM,Weekday
1,2,39.928059,-82.831184,OH,43068-3402,US,37.9,10.0,0.0,0.00,...,False,False,False,False,False,False,2016,February,06:07 AM,Weekday
2,2,39.063148,-84.032608,OH,45176,US,36.0,10.0,3.5,0.00,...,False,False,False,False,True,False,2016,February,06:49 AM,Weekday
3,3,39.747753,-84.205582,OH,45417,US,35.1,9.0,4.6,0.00,...,False,False,False,False,False,False,2016,February,07:23 AM,Weekday
4,2,39.627781,-84.188354,OH,45459,US,36.0,6.0,3.5,0.00,...,False,False,False,False,True,False,2016,February,07:39 AM,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,2,34.002480,-117.379360,CA,92501,US,86.0,10.0,13.0,0.00,...,False,False,False,False,False,False,2019,August,06:03 PM,Weekday
3513613,2,32.766960,-117.148060,CA,92108,US,70.0,10.0,6.0,0.00,...,False,False,False,False,False,False,2019,August,07:11 PM,Weekday
3513614,2,33.775450,-117.847790,CA,92866,US,73.0,10.0,10.0,0.00,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday
3513615,2,33.992460,-118.403020,CA,90230,US,71.0,10.0,8.0,0.00,...,False,False,False,False,False,False,2019,August,07:00 PM,Weekday


In [55]:
# Write the cleaned data to CSV without the index column.
# The directory name is lowercase so it matches the directory the raw data was
# read from; "Resources/" is a different (possibly nonexistent) directory on a
# case-sensitive filesystem and would make the write fail there.
output_data_file = "resources/Cleaned_Accidents_Data.csv"
cleaned_df.to_csv(output_data_file, index=False)