In [1]:
# Import dependencies
import pandas as pd
import numpy as np

# Data Extraction

In [2]:
# Extract Kaggle Data
#
# Malformed rows are skipped with a warning instead of aborting the load.
# `on_bad_lines='warn'` is the replacement for the removed `error_bad_lines=False`
# argument (requires pandas >= 1.3).
#
# NOTE(review): the file is a .tar.gz archive but is decompressed as a plain gzip
# stream. That is presumably why the first column ends up named
# 'US_Accidents_Dec19.csv' (the archive member name) and why one all-null row
# appears in the null check below -- confirm, and consider extracting the CSV
# member with `tarfile` instead.
kaggle_metadata = pd.read_csv(
    './US_Accidents_Dec19.tar.gz',
    compression='gzip',
    on_bad_lines='warn',
)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Silence pandas' SettingWithCopyWarning for the rest of the notebook by
# disabling the chained-assignment check.
pd.set_option('mode.chained_assignment', None)

In [4]:
# Number of rows extracted from the raw file
kaggle_metadata.shape[0]

2974336

In [5]:
# Extract only the columns relevant for analysis.
# 'US_Accidents_Dec19.csv' is the name the accident-ID column received on load.
analysis_columns = [
    'US_Accidents_Dec19.csv', 'Severity', 'Start_Time', 'End_Time',
    'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Street', 'Side', 'City',
    'County', 'State', 'Zipcode', 'Timezone',
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Weather_Condition', 'Amenity', 'Crossing', 'Junction', 'Railway',
    'Station', 'Stop', 'Traffic_Signal', 'Civil_Twilight',
]

# .copy() makes df_subset an independent frame. The clean-up cells that follow
# modify it in place; without the copy pandas treats it as a derived selection
# of kaggle_metadata and flags those writes as chained assignment.
df_subset = kaggle_metadata[analysis_columns].copy()

# Transform

In [6]:
# Count missing values per column
df_subset.isna().sum()

US_Accidents_Dec19.csv          1
Severity                        1
Start_Time                      1
End_Time                        1
Start_Lat                       1
Start_Lng                       1
Distance(mi)                    1
Street                          1
Side                            1
City                           84
County                          1
State                           1
Zipcode                       881
Timezone                     3164
Temperature(F)              56064
Humidity(%)                 59174
Pressure(in)                48143
Visibility(mi)              65692
Wind_Direction              45102
Wind_Speed(mph)            440841
Precipitation(in)         1998359
Weather_Condition           65933
Amenity                         1
Crossing                        1
Junction                        1
Railway                         1
Station                         1
Stop                            1
Traffic_Signal                  1
Civil_Twilight

In [7]:
# Fill NA with zero values for the Precipitation column.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that form is a chained in-place
# update, which newer pandas warns about and which does not modify df_subset at
# all when Copy-on-Write is enabled.
df_subset["Precipitation(in)"] = df_subset["Precipitation(in)"].fillna(0)

In [8]:
# Discard every row that still has a missing value in any remaining column
df_subset = df_subset.dropna()

In [9]:
# Row count after the NA clean-up
len(df_subset.index)

2506618

In [10]:
# Order the rows from highest to lowest Severity. The intent is that a later
# keep-first de-duplication retains the most severe record of each group.
sorted_df = df_subset.sort_values(by='Severity', ascending=False)

In [11]:
# Count the rows that would survive de-duplication on the key columns
# (i.e. the number of distinct key combinations, not the number of duplicates)
dedupe_keys = ['Severity', 'Start_Time', 'Start_Lat', 'Start_Lng']
int((~sorted_df.duplicated(subset=dedupe_keys)).sum())

2494218

In [12]:
# Keep the first row of every (Severity, Start_Time, Start_Lat, Start_Lng) group
# NOTE(review): Severity is part of the key, so rows that differ only in Severity
# are NOT treated as duplicates of each other -- confirm this is intended.
sorted_df = sorted_df.drop_duplicates(
    subset=['Severity', 'Start_Time', 'Start_Lat', 'Start_Lng']
)

In [13]:
# Rows left once both the Null-value rows and the duplicates are gone
sorted_df.shape[0]

2494218

In [14]:
# Check the datatype of every column to see which ones need correcting
# (Start_Time and End_Time are still plain strings at this point)
sorted_df.dtypes

US_Accidents_Dec19.csv     object
Severity                  float64
Start_Time                 object
End_Time                   object
Start_Lat                 float64
Start_Lng                 float64
Distance(mi)              float64
Street                     object
Side                       object
City                       object
County                     object
State                      object
Zipcode                    object
Timezone                   object
Temperature(F)            float64
Humidity(%)               float64
Pressure(in)              float64
Visibility(mi)            float64
Wind_Direction             object
Wind_Speed(mph)           float64
Precipitation(in)         float64
Weather_Condition          object
Amenity                    object
Crossing                   object
Junction                   object
Railway                    object
Station                    object
Stop                       object
Traffic_Signal             object
Civil_Twilight

#### Split the Start and End date-time columns into time-of-day, time-in-seconds, and day-of-week columns

In [15]:
# Break each "Start_Time" string apart at the space: piece 0 is the date,
# piece 1 is the clock time
newstart = sorted_df["Start_Time"].str.split(pat=" ", expand=True)

# Store the clock-time piece as its own column
sorted_df = sorted_df.assign(Start_Time_of_Day=newstart[1])

In [16]:
# Break each "End_Time" string apart at the space: piece 0 is the date,
# piece 1 is the clock time
newend = sorted_df["End_Time"].str.split(pat=" ", expand=True)

# Store the clock-time piece of the End time as its own column
sorted_df = sorted_df.assign(End_Time_of_Day=newend[1])

In [17]:
# Express the Start time of day as a number of seconds
start_offsets = pd.to_timedelta(sorted_df['Start_Time_of_Day'])
sorted_df['Start_seconds'] = start_offsets.dt.seconds

# Express the End time of day as a number of seconds
end_offsets = pd.to_timedelta(sorted_df['End_Time_of_Day'])
sorted_df['End_seconds'] = end_offsets.dt.seconds

In [18]:
# Parse the timestamp strings into datetime values, replacing both columns
for stamp_col in ('Start_Time', 'End_Time'):
    sorted_df[stamp_col] = pd.to_datetime(sorted_df[stamp_col])

In [19]:
# Weekday number of the accident's start (Monday = 0 ... Sunday = 6)
sorted_df['Day_of_Week'] = sorted_df['Start_Time'].dt.dayofweek

In [20]:
# Preview the first five rows, including the newly derived time columns
sorted_df.head()

Unnamed: 0,US_Accidents_Dec19.csv,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Street,Side,City,...,Railway,Station,Stop,Traffic_Signal,Civil_Twilight,Start_Time_of_Day,End_Time_of_Day,Start_seconds,End_seconds,Day_of_Week
2782693,A-2782717,4.0,2018-05-22 15:54:12,2018-05-22 21:54:12,32.142385,-81.176194,0.402,GA Highway 21,L,Port Wentworth,...,False,False,False,False,Day,15:54:12,21:54:12,57252,78852,1
2584250,A-2584274,4.0,2019-10-01 20:52:52,2019-10-01 21:22:24,48.9644,-122.441456,0.475,E Badger Rd,L,Lynden,...,False,False,False,False,Night,20:52:52,21:22:24,75172,76944,1
2455305,A-2455329,4.0,2017-05-20 14:59:21,2017-05-20 20:59:21,47.70142,-122.34466,0.189,N 100th St,R,Seattle,...,False,False,False,True,Day,14:59:21,20:59:21,53961,75561,5
2888677,A-2888701,4.0,2017-08-05 14:10:19,2017-08-05 20:10:19,42.1202,-73.890961,1.251,State Route 9G,L,Germantown,...,False,False,False,False,Day,14:10:19,20:10:19,51019,72619,5
2455307,A-2455331,4.0,2017-05-20 17:07:32,2017-05-20 23:07:32,48.0718,-122.11138,1.272,WA-9,R,Marysville,...,False,False,False,False,Day,17:07:32,23:07:32,61652,83252,5


In [21]:
# Re-check the datatypes after the timestamp conversion
sorted_df.dtypes

US_Accidents_Dec19.csv            object
Severity                         float64
Start_Time                datetime64[ns]
End_Time                  datetime64[ns]
Start_Lat                        float64
Start_Lng                        float64
Distance(mi)                     float64
Street                            object
Side                              object
City                              object
County                            object
State                             object
Zipcode                           object
Timezone                          object
Temperature(F)                   float64
Humidity(%)                      float64
Pressure(in)                     float64
Visibility(mi)                   float64
Wind_Direction                    object
Wind_Speed(mph)                  float64
Precipitation(in)                float64
Weather_Condition                 object
Amenity                           object
Crossing                          object
Junction        

In [22]:
# Spot-check a single accident record by its ID
sorted_df.loc[sorted_df['US_Accidents_Dec19.csv'].eq('A-2782717')]

Unnamed: 0,US_Accidents_Dec19.csv,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Street,Side,City,...,Railway,Station,Stop,Traffic_Signal,Civil_Twilight,Start_Time_of_Day,End_Time_of_Day,Start_seconds,End_seconds,Day_of_Week
2782693,A-2782717,4.0,2018-05-22 15:54:12,2018-05-22 21:54:12,32.142385,-81.176194,0.402,GA Highway 21,L,Port Wentworth,...,False,False,False,False,Day,15:54:12,21:54:12,57252,78852,1


In [23]:
# Write the first 100 rows to disk as a sample file
sample_rows = sorted_df.head(100)
sample_rows.to_csv('Sample_data.csv')