In [1]:
# Import dependencies
import pandas as pd

# Data Extraction

In [3]:
# Extract Kaggle Data

# NOTE(review): absolute, user-specific path; consider a configurable data directory.
file_dir = 'C:/Users/ruchi/Desktop/Berkley Extension Learning Docs/Final Project'

# NOTE(review): the archive is a .tar.gz but is read with compression='gzip' only.
# Presumably the tar header leaks into the parsed text (the first column is later
# selected as 'US_Accidents_Dec19.csv') - confirm before changing how it is read.
archive_path = f'{file_dir}/US_Accidents_Dec19.tar.gz'

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its replacement
# is `on_bad_lines`. 'warn' keeps the old behaviour (malformed rows are skipped and
# reported). Older pandas rejects the new keyword with a TypeError, so fall back.
try:
    kaggle_metadata = pd.read_csv(archive_path, compression='gzip',
                                  on_bad_lines='warn', low_memory=False)
except TypeError:
    kaggle_metadata = pd.read_csv(archive_path, compression='gzip',
                                  error_bad_lines=False, low_memory=False)


In [4]:
### Silence pandas' SettingWithCopyWarning for the rest of the notebook ###
pd.set_option('mode.chained_assignment', None)

In [5]:
# Number of rows read from the archive
kaggle_metadata.shape[0]

2974336

In [6]:
# Keep only the columns that the analysis uses
relevant_columns = [
    'US_Accidents_Dec19.csv', 'Severity', 'Start_Time', 'End_Time',
    'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Street', 'Side', 'City',
    'County', 'State', 'Zipcode', 'Timezone',
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Weather_Condition', 'Amenity', 'Crossing', 'Junction', 'Railway',
    'Station', 'Stop', 'Traffic_Signal', 'Civil_Twilight',
]
df_subset = kaggle_metadata[relevant_columns]

# Transform

In [7]:
# Count missing values per column
df_subset.isna().sum()

US_Accidents_Dec19.csv          1
Severity                        1
Start_Time                      1
End_Time                        1
Start_Lat                       1
Start_Lng                       1
Distance(mi)                    1
Street                          1
Side                            1
City                           84
County                          1
State                           1
Zipcode                       881
Timezone                     3164
Temperature(F)              56064
Humidity(%)                 59174
Pressure(in)                48143
Visibility(mi)              65692
Wind_Direction              45102
Wind_Speed(mph)            440841
Precipitation(in)         1998359
Weather_Condition           65933
Amenity                         1
Crossing                        1
Junction                        1
Railway                         1
Station                         1
Stop                            1
Traffic_Signal                  1
Civil_Twilight

In [8]:
# Replace missing precipitation values with 0.
# The filled column is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place call is not guaranteed to write through
# to df_subset (it does not under pandas Copy-on-Write).
df_subset["Precipitation(in)"] = df_subset["Precipitation(in)"].fillna(0)

In [9]:
# Discard every row that still has a missing value in any column
df_subset = df_subset.dropna()

In [10]:
# Rows remaining after the NA handling
df_subset.shape[0]

2506618

In [11]:
# Order rows from highest to lowest Severity so that a later "keep first"
# de-duplication retains the most severe record.
# NOTE(review): the drop_duplicates call further down includes 'Severity' in its
# key, so rows that differ only in severity are not treated as duplicates -
# confirm whether 'Severity' should be part of that key.
sorted_df = df_subset.sort_values(by='Severity', ascending=False)

In [12]:
# Count the rows that are unique on the de-duplication key
# (the gap to the full row count is the number of duplicates)
dedup_key = ['Severity', 'Start_Time', 'Start_Lat', 'Start_Lng']
sorted_df[dedup_key].drop_duplicates().shape[0]

2494218

In [13]:
# Drop repeated accidents, keeping the first row seen for each key
sorted_df = sorted_df.drop_duplicates(
    subset=['Severity', 'Start_Time', 'Start_Lat', 'Start_Lng'],
    keep='first',
)

In [14]:
# Reduce ZIP+4 codes ("12345-6789") to the part before the hyphen.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave "-.*" unmatched and the
# zip codes unchanged.
sorted_df['Zipcode'] = sorted_df['Zipcode'].str.replace(r"-.*", "", regex=True)

In [15]:
# Rows left once NA rows and duplicates have been removed
sorted_df.shape[0]

2494218

In [16]:
# Inspect column dtypes to see which columns need conversion
# (the captured output shows Start_Time and End_Time still stored as object)
sorted_df.dtypes

US_Accidents_Dec19.csv     object
Severity                  float64
Start_Time                 object
End_Time                   object
Start_Lat                 float64
Start_Lng                 float64
Distance(mi)              float64
Street                     object
Side                       object
City                       object
County                     object
State                      object
Zipcode                    object
Timezone                   object
Temperature(F)            float64
Humidity(%)               float64
Pressure(in)              float64
Visibility(mi)            float64
Wind_Direction             object
Wind_Speed(mph)           float64
Precipitation(in)         float64
Weather_Condition          object
Amenity                    object
Crossing                   object
Junction                   object
Railway                    object
Station                    object
Stop                       object
Traffic_Signal             object
Civil_Twilight

In [17]:
# Parse both timestamp columns from text into datetime values
for time_column in ('Start_Time', 'End_Time'):
    sorted_df[time_column] = pd.to_datetime(sorted_df[time_column])

In [18]:
#########################   Create Highway Column   ####################################

# Street-name fragments treated as highway indicators. They are joined into one
# regex alternation and matched case-insensitively anywhere in the street name.
# NOTE(review): '-' and 'US' match very broadly under case=False (any hyphenated
# name, and any name containing "us", e.g. "Bush St") - confirm this is intended.
searchfor = ['highway', 'Tollway', 'expy', 'fwy', 'hwy', 'Interstate',
             'Tpke', 'Pkwy', 'Parkway', '-', 'US', 'Route',
             'FM', 'Byp', 'Trwy', 'Beltway', 'Skyway', 'Skwy', ]

# na=False makes a missing street name count as "no match" instead of producing
# a NaN entry in the boolean mask.
is_highway = sorted_df['Street'].str.contains('|'.join(searchfor), case=False, na=False)

# Flag matches with 'Y' and everything else with 'N' in one assignment. This
# replaces a chained fillna(inplace=True) on the column, which is not guaranteed
# to write back to sorted_df, and gives the same result when the cell is re-run.
sorted_df['Highway'] = is_highway.map({True: 'Y', False: 'N'})


In [19]:
# Start coordinates of the accidents that were not flagged as highway
sorted_df.loc[sorted_df['Highway'] == 'N', ['Start_Lat', 'Start_Lng']]

Unnamed: 0,Start_Lat,Start_Lng
2584250,48.964400,-122.441456
2455305,47.701420,-122.344660
2888675,42.552910,-84.797750
2888674,42.552100,-84.789336
2455314,36.734570,-120.199785
...,...,...
283448,29.735319,-95.460625
140799,27.878984,-82.658920
679691,32.667721,-97.204468
1970482,33.480442,-111.889946


In [20]:
# Build a "lat, lng" text value from the start coordinates
lat_text = sorted_df['Start_Lat'].map(str)
lng_text = sorted_df['Start_Lng'].map(str)
sorted_df['Coordinates'] = lat_text + ', ' + lng_text

In [21]:
# Rename the ID column and convert the row index labels to strings
# (index=str maps every label through str(); it does not reset the index)
column_renames = {'US_Accidents_Dec19.csv': 'Accident_ID'}
sorted_df = sorted_df.rename(columns=column_renames, index=str)

In [22]:
# Split the cleaned data into the two frames that feed the SQL tables

# table1: one row per accident, with its measurements and flags
accident_columns = [
    'Accident_ID', 'Severity', 'Start_Time', 'End_Time',
    'Start_Lat', 'Start_Lng', 'Coordinates', 'Distance(mi)', 'Side',
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Weather_Condition', 'Amenity', 'Crossing', 'Junction', 'Railway',
    'Station', 'Stop', 'Traffic_Signal', 'Civil_Twilight',
]
table1_df = sorted_df[accident_columns]

# table2: location attributes for each Coordinates value
location_columns = [
    'Coordinates', 'Street', 'City', 'County', 'State', 'Zipcode',
    'Timezone', 'Highway',
]
table2_df = sorted_df[location_columns]

In [23]:
# Collapse location rows that are identical in every column
table2_df = table2_df.drop_duplicates(keep='first')

In [24]:
# Use each table's key column as its index
table1_df = table1_df.set_index('Accident_ID')
table2_df = table2_df.set_index('Coordinates')

In [None]:
# Write table2_df's column dtypes to 'tab2.csv' in the working directory
table2_df.dtypes.to_csv('tab2.csv')

#### Split Date and Time Column into Date, Time, Time in Seconds, and Day of week columns

In [None]:
# Time-of-day part of Start_Time as an 'HH:MM:SS' string.
# Start_Time has already been converted to datetime by this point, so the .str
# accessor cannot be used on it; format the value through .dt instead.
# pd.to_datetime is applied first so the cell also works if the column is still text.
sorted_df["Start_Time_of_Day"] = pd.to_datetime(sorted_df["Start_Time"]).dt.strftime("%H:%M:%S")

In [None]:
# Time-of-day part of End_Time as an 'HH:MM:SS' string.
# End_Time has already been converted to datetime by this point, so the .str
# accessor cannot be used on it; format the value through .dt instead.
# pd.to_datetime is applied first so the cell also works if the column is still text.
sorted_df["End_Time_of_Day"] = pd.to_datetime(sorted_df["End_Time"]).dt.strftime("%H:%M:%S")

In [None]:
# Express each time of day as a number of seconds

# Start time
start_offset = pd.to_timedelta(sorted_df['Start_Time_of_Day'])
sorted_df['Start_seconds'] = start_offset.dt.seconds

# End time
end_offset = pd.to_timedelta(sorted_df['End_Time_of_Day'])
sorted_df['End_seconds'] = end_offset.dt.seconds

In [None]:
# Make sure both timestamp columns are datetime before the .dt accessor is used
start_parsed = pd.to_datetime(sorted_df['Start_Time'])
end_parsed = pd.to_datetime(sorted_df['End_Time'])
sorted_df['Start_Time'] = start_parsed
sorted_df['End_Time'] = end_parsed

In [None]:
# Day of the week on which the accident started (Monday=0 ... Sunday=6)
sorted_df['Day_of_Week'] = sorted_df['Start_Time'].dt.dayofweek

In [None]:
# Preview the first five rows
sorted_df.head()

In [None]:
# Re-check column dtypes after adding the derived time columns
sorted_df.dtypes

In [None]:
# Look up a single accident by its ID. The ID column was renamed from
# 'US_Accidents_Dec19.csv' to 'Accident_ID' in an earlier cell, so the old
# label would raise a KeyError here.
sorted_df[sorted_df['Accident_ID'] == 'A-2782717']

In [None]:
# Save the first 100 rows as a sample file
sorted_df.iloc[:100].to_csv('Sample_data.csv')