**Import libraries and data**

In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

#suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#load the US accidents csv into a DataFrame
accident_data = pd.read_csv(
    '../input/us-accidents/US_Accidents_Dec20_Updated.csv'
)

# EDA

In [None]:
#print the DataFrame.info() summary to look at the dtype of every column
accident_data.info()

In [None]:
#print number and percentage of null entries per variable
print('Null values per variable')
total_rows = len(accident_data)
for column in accident_data.columns:
    #count the nulls once and reuse the count for the percentage
    null_count = pd.isnull(accident_data[column]).sum()
    print('{}: {} ({}%)'.format(column, null_count, (null_count/total_rows)*100))

In [None]:
#look at distribution of data via the DataFrame.describe() summary statistics
accident_data.describe()

In [None]:
#look at formatting of entries in the first rows
accident_data.head()

In [None]:
#look at the last rows to see the ID format towards the end of the file
accident_data.tail()

In [None]:
#checking whether missing end lat/long could be due to the end point being the same as the start point:
#show the rows whose end coordinates are identical to their start coordinates
same_lat = accident_data['Start_Lat'] == accident_data['End_Lat']
same_lng = accident_data['Start_Lng'] == accident_data['End_Lng']
accident_data[same_lat & same_lng]

In [None]:
#check whether missing end coordinates sit in the same rows:
#show rows where exactly one of End_Lat / End_Lng is missing
end_lat_missing = accident_data['End_Lat'].isna()
end_lng_missing = accident_data['End_Lng'].isna()
accident_data[end_lat_missing ^ end_lng_missing]

In [None]:
#print the unique entries of every column to spot inconsistent values
#Side has ' '
#Wind direction has repeats with different spellings
#Weather condition has some repeats (e.g., 'heavy rain shower' and 'heavy rain showers')
for col in accident_data.columns:
    unique_entries = accident_data[col].unique()
    print(f'{col}: {unique_entries}')

In [None]:
#investigating the rows whose Side is a single space; the value will be replaced with the mode
blank_side = accident_data['Side'] == ' '
accident_data.loc[blank_side]

In [None]:
#create dataframe of variables collected at airport for rows where airport_code is null (i.e., no airport information)
airport_weather_columns = [
    'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
    'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
    'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition',
]
airport_code_missing = accident_data['Airport_Code'].isna()
weather_info = accident_data.loc[airport_code_missing, airport_weather_columns]

In [None]:
#check whether records that are missing an airport_code are also missing weather information
#isna().mean() is the share of null entries; isna().count() would count every row and always report 100%
print('Percentage null when Airport_Code is null:')
for weather in weather_info.columns:
    print('{}: {}%'.format(weather, weather_info[weather].isna().mean()*100))

In [None]:
#drop the temporary weather_info frame and run a garbage-collection pass
del weather_info
gc.collect()

In [None]:
#show rows with a recorded precipitation of exactly zero:
#null precipitation values are not necessarily due to no rain
zero_precipitation = accident_data['Precipitation(in)'] == 0
accident_data[zero_precipitation]

In [None]:
#looking for cases where Humidity is zero and Precipitation is null (i.e., precipitation should be set to zero)
#the two conditions are combined with a vectorized & instead of a Python-level loop over every row
humidity_zero = accident_data['Humidity(%)'] == 0
precipitation_missing = accident_data['Precipitation(in)'].isna()
accident_data.loc[humidity_zero & precipitation_missing, ['Humidity(%)','Precipitation(in)']]

In [None]:
#looking for cases where wind speed is zero and wind direction is null (i.e., no wind to have a wind direction)
#the two conditions are combined with a vectorized & instead of a Python-level loop over every row
wind_speed_zero = accident_data['Wind_Speed(mph)'] == 0
wind_direction_missing = accident_data['Wind_Direction'].isna()
accident_data.loc[wind_speed_zero & wind_direction_missing, ['Wind_Speed(mph)','Wind_Direction']]

In [None]:
#looking to see whether wind_chill is null when wind_speed is zero
#for these records, null wind_chill entries should be replaced with temperature
#the two conditions are combined with a vectorized & instead of a Python-level loop over every row
wind_speed_zero = accident_data['Wind_Speed(mph)'] == 0
wind_chill_missing = accident_data['Wind_Chill(F)'].isna()
accident_data.loc[wind_speed_zero & wind_chill_missing, ['Wind_Speed(mph)','Wind_Chill(F)']]

In [None]:
#looking to see whether a wind chill is recorded when wind_speed is zero
#~ negates the null mask, so only rows with a wind chill value are kept
wind_speed_zero = accident_data['Wind_Speed(mph)'] == 0
wind_chill_missing = accident_data['Wind_Chill(F)'].isna()
accident_data.loc[wind_speed_zero & ~wind_chill_missing, ['Wind_Speed(mph)','Wind_Chill(F)']]

In [None]:
#looking for null values in both wind speed and wind chill
#the two null masks are combined with a vectorized & instead of a Python-level loop over every row
wind_speed_missing = accident_data['Wind_Speed(mph)'].isna()
wind_chill_missing = accident_data['Wind_Chill(F)'].isna()
accident_data.loc[wind_speed_missing & wind_chill_missing, ['Wind_Speed(mph)','Wind_Chill(F)']]

In [None]:
#parse the start, end and weather timestamps into datetime columns
import datetime
timestamp_columns = (
    ('Start_Time', 'datetime_start_parsed'),
    ('End_Time', 'datetime_end_parsed'),
    ('Weather_Timestamp', 'datetime_weathertime_parsed'),
)
for raw_name, parsed_name in timestamp_columns:
    accident_data[parsed_name] = pd.to_datetime(accident_data[raw_name], format="%Y-%m-%d %H:%M:%S")

In [None]:
#list the column names, for reference when sorting them into numerical and categorical
accident_data.columns

In [None]:
#column names grouped by kind: numerical, categorical, and datetime
#description is excluded

numerical = [
    'Start_Lat',
    'Start_Lng',
    'End_Lat',
    'End_Lng',
    'Distance(mi)',
    'Number',
    'Temperature(F)',
    'Wind_Chill(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
]

categorical = [
    'Severity', 'Street', 'Side', 'City', 'Zipcode', 'County', 'State', 'Country',
    'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition',
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]

#NOTE(review): this list is named like the datetime module and rebinds that name
datetime = [
    'datetime_start_parsed',
    'datetime_end_parsed',
    'datetime_weathertime_parsed',
]

In [None]:
#since dataset is over 1 GB, take a random sample of 1% of the rows for analysis
sample_size = int(len(accident_data)/100)
accident_sample = accident_data.sample(sample_size)
sampled_share = (len(accident_sample)/len(accident_data))*100
print('Percentage of data sampled: {}%'.format(sampled_share))

In [None]:
#bar chart of the value counts of every categorical column in the sample
for cat_col in categorical:
    counts = accident_sample[cat_col].value_counts()
    plt.figure(figsize=(28, 6))
    sns.barplot(edgecolor='black', x=counts.index, y=counts)
    plt.xlabel(cat_col)
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#re-plotting only the 20 most frequent values for the columns whose full plots are difficult to read
crowded_columns = ['Street','City', 'Zipcode', 'County', 'Airport_Code', 'Weather_Condition']
for crowded_col in crowded_columns:
    #number of distinct accident IDs per value, keeping the 20 largest
    cat_grouped = accident_sample.groupby(crowded_col)['ID'].nunique().nlargest(20)
    plt.figure(figsize=(28, 6))
    sns.barplot(edgecolor='black', x=cat_grouped.index, y=cat_grouped)
    plt.xlabel(crowded_col)
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#drop the last grouped counts and run a garbage-collection pass
del cat_grouped
gc.collect()

In [None]:
#histogram of every numerical column in the sample
for num_col in numerical:
    values = accident_sample[num_col]
    plt.figure(figsize=(28, 6))
    plt.hist(values, edgecolor='black')
    plt.xticks()
    plt.xlabel(num_col)
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the day-of-month component of every parsed datetime column
for dt_col in datetime:
    day_values = accident_sample[dt_col].dt.day
    plt.figure(figsize=(28, 6))
    plt.hist(day_values, edgecolor='black')
    plt.xticks()
    plt.xlabel('{} day'.format(dt_col))
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the month component of every parsed datetime column
for dt_col in datetime:
    month_values = accident_sample[dt_col].dt.month
    plt.figure(figsize=(28, 6))
    plt.hist(month_values, edgecolor='black')
    plt.xticks()
    plt.xlabel('{} month'.format(dt_col))
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the year component of every parsed datetime column
#the same list of years is used as bin edges and as tick positions
year_edges = [2016, 2017, 2018, 2019, 2020, 2021]
for dt_col in datetime:
    year_values = accident_sample[dt_col].dt.year
    plt.figure(figsize=(28, 6))
    plt.hist(year_values, edgecolor='black', align='left', rwidth=0.5, bins=year_edges)
    plt.xlabel('{} year'.format(dt_col))
    plt.xticks(ticks=year_edges)
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the hour component of every parsed datetime column
for dt_col in datetime:
    hour_values = accident_sample[dt_col].dt.hour
    plt.figure(figsize=(28, 6))
    plt.hist(hour_values, edgecolor='black')
    plt.xticks()
    plt.xlabel('{} hour'.format(dt_col))
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the minute component of every parsed datetime column
for dt_col in datetime:
    minute_values = accident_sample[dt_col].dt.minute
    plt.figure(figsize=(28, 6))
    plt.hist(minute_values, edgecolor='black')
    plt.xticks()
    plt.xlabel('{} minute'.format(dt_col))
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#histogram of the second component of every parsed datetime column
for dt_col in datetime:
    second_values = accident_sample[dt_col].dt.second
    plt.figure(figsize=(28, 6))
    plt.hist(second_values, edgecolor='black')
    plt.xticks()
    plt.xlabel('{} second'.format(dt_col))
    plt.ylabel('number of accidents')
    plt.show()

In [None]:
#drop the 1% sample used for the plots and run a garbage-collection pass
del accident_sample
gc.collect()

**Correlation heatmap**

In [None]:
#creating a dataframe for creating a correlation heatmap
numerical_heatmap = [
       'Start_Lat', 'Start_Lng',
       'End_Lat', 'End_Lng', 'Distance(mi)','Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)','Wind_Speed(mph)', 'Precipitation(in)'
]

#explicit copy: the date/time columns below are added to an independent frame,
#not to a slice of accident_data (which triggers pandas' chained-assignment warning)
accident_numerical_datetime = accident_data[numerical_heatmap].copy()

#separating dates and times into separate columns, named '<source column> <part>'
for col in ['datetime_start_parsed', 'datetime_end_parsed']:
    for part in ('day', 'month', 'year', 'hour', 'minute', 'second'):
        accident_numerical_datetime['{} {}'.format(col, part)] = getattr(accident_data[col].dt, part)

In [None]:
#annotated heat map of the pairwise correlations, to find extreme positive and negative correlations
correlations = accident_numerical_datetime.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlations, annot=True)
plt.title(
    'Correlation Heatmap for numerical Variables',
    fontdict={'fontsize':12},
    pad=12,
);

In [None]:
#temperature and wind chill have a 0.99 correlation
#since this dataset is combined from two sources, it looks like each had different methods for recording temperature and wind chill
#scatter of a fresh 1% sample; no palette is passed because there is no hue variable for it to colour
accident_sample = accident_data.sample(int(len(accident_data)/100))
sns.scatterplot(x='Wind_Chill(F)', y='Temperature(F)', data=accident_sample)
plt.show()

In [None]:
#drop the sample used for the scatter plot and run a garbage-collection pass
del accident_sample
gc.collect()

In [None]:
#drop the heatmap frame and run a garbage-collection pass
del accident_numerical_datetime
gc.collect()

# GeoPandas Maps

In [None]:
#import geopandas and geoplot libraries
import geopandas
import geoplot as gplt
import geoplot.crs as gcrs

In [None]:
#create dataframe of the variables to look at on a map, plus the start lat/lng
map_columns = ['Temperature(F)', 'Wind_Chill(F)', 'Severity', 'Start_Lng', 'Start_Lat']
severity_locations = accident_data[map_columns]

In [None]:
#create a GeoDataFrame whose geometry is the start point (x = longitude, y = latitude) of every accident
start_points = geopandas.points_from_xy(
    severity_locations['Start_Lng'],
    severity_locations['Start_Lat'],
)
gdf_severity = geopandas.GeoDataFrame(severity_locations, geometry=start_points)

In [None]:
#generate map of a random 10% of the points, with hue and scale based on severity
gdfs_sample = gdf_severity.sample(int(len(gdf_severity)/10))
print(f'{len(gdfs_sample)} accidents')

contiguous_usa = geopandas.read_file(gplt.datasets.get_path('contiguous_usa'))
ax = gplt.polyplot(contiguous_usa, projection=gcrs.AlbersEqualArea(), figsize=(20, 20))
severity = gdfs_sample['Severity']
gplt.pointplot(gdfs_sample, ax=ax, hue=severity, scale=severity, legend=True, legend_var='hue')
plt.show()

In [None]:
#generate map of a random 10% of the points, with hue and scale based on temperature
gdfs_sample = gdf_severity.sample(int(len(gdf_severity)/10))
print(f'{len(gdfs_sample)} accidents')

contiguous_usa = geopandas.read_file(gplt.datasets.get_path('contiguous_usa'))
ax = gplt.polyplot(contiguous_usa, projection=gcrs.AlbersEqualArea(), figsize=(20, 20))
temperature = gdfs_sample['Temperature(F)']
gplt.pointplot(gdfs_sample, ax=ax, hue=temperature, scale=temperature, legend=True, legend_var='hue')
plt.show()

In [None]:
#generate a map of a random 10% of the points, with hue and scale based on wind chill
gdfs_sample = gdf_severity.sample(int(len(gdf_severity)/10))
print(f'{len(gdfs_sample)} accidents')

contiguous_usa = geopandas.read_file(gplt.datasets.get_path('contiguous_usa'))
ax = gplt.polyplot(contiguous_usa, projection=gcrs.AlbersEqualArea(), figsize=(20, 20))
wind_chill = gdfs_sample['Wind_Chill(F)']
gplt.pointplot(gdfs_sample, ax=ax, hue=wind_chill, scale=wind_chill, legend=True, legend_var='hue')
plt.show()

In [None]:
#drop the GeoDataFrame and run a garbage-collection pass
del gdf_severity
gc.collect()

In [None]:
#drop the last map sample and run a garbage-collection pass
del gdfs_sample
gc.collect()

In [None]:
#drop the base-map frame and run a garbage-collection pass
del contiguous_usa
gc.collect()

# Data Cleaning and Feature Engineering

In [None]:
#replace the single-space Side entries with 'R' (the most common entry)
accident_data['Side'] = accident_data['Side'].replace({' ': 'R'})

In [None]:
#view the unique Weather_Condition values, as strings in sorted order, to determine changes to be made
weather_condition = np.sort(accident_data['Weather_Condition'].unique().astype('str'))
weather_condition

In [None]:
#drop the sorted array of conditions and run a garbage-collection pass
del weather_condition
gc.collect()

In [None]:
#merge Weather_Condition entries that describe the same condition
#(exact-value replacement; no replacement target is itself a key, so a single pass is enough)
weather_condition_aliases = {
    'Thunder': 'Thunderstorm',
    'T-Storm': 'Thunderstorm',
    'T-Storm / Windy': 'Thunderstorm / Windy',
    'Thunder / Windy': 'Thunderstorm / Windy',
    'Heavy Rain Shower': 'Heavy Rain',
    'Heavy Rain Showers': 'Heavy Rain',
    'Light Rain Shower': 'Light Rain',
    'Light Rain Showers': 'Light Rain',
    'Rain Shower': 'Rain',
    'Rain Showers': 'Rain',
}
accident_data['Weather_Condition'] = accident_data['Weather_Condition'].replace(weather_condition_aliases)

In [None]:
#view unique values of Wind_Direction to spot inconsistent notation
accident_data.Wind_Direction.unique()

In [None]:
#replace the spelled-out Wind_Direction entries with the abbreviated notation used by the other entries
wind_direction_aliases = {
    'West': 'W',
    'Variable': 'VAR',
    'South': 'S',
    'Calm': 'CALM',
    'East': 'E',
    'North': 'N',
}
accident_data['Wind_Direction'] = accident_data['Wind_Direction'].replace(wind_direction_aliases)

In [None]:
#make all zipcodes 5 digit by keeping the first five characters
#the .str accessor leaves missing zipcodes as NaN, so no per-row null filtering is needed
accident_data['Zipcode'] = accident_data['Zipcode'].str[:5]

In [None]:
#remove ID, the raw start/end/weather time columns (already parsed) and Number, which has 65% null values
columns_to_drop = ['ID', 'Start_Time', 'End_Time', 'Weather_Timestamp', 'Number']
accident_data = accident_data.drop(columns=columns_to_drop)

In [None]:
#remove Number from the numerical list
#guarded so that re-running the cell does not raise ValueError once Number is already gone
if 'Number' in numerical:
    numerical.remove('Number')

In [None]:
#mean precipitation of the rows whose Weather_Condition is 'N/A Precipitation'
#(N/A Precipitation must mean virtually no rain)
na_precipitation_rows = accident_data['Weather_Condition'] == 'N/A Precipitation'
accident_data.loc[na_precipitation_rows, 'Precipitation(in)'].mean()

In [None]:
#replace N/A Precipitation with NaN to be imputed with most frequent
#np.nan is used because the np.NaN alias was removed in NumPy 2.0
accident_data.Weather_Condition = accident_data.Weather_Condition.replace('N/A Precipitation', np.nan)

In [None]:
#row labels (and start lat/lng) of the records with missing city, zipcode, airport code and timezone
city_missing = accident_data['City'].isna()
citynullindex = accident_data.index[city_missing].to_numpy()
citynulllatlng = accident_data.loc[citynullindex, ['Start_Lat','Start_Lng']]

zipcode_missing = accident_data['Zipcode'].isna()
zipnullindex = accident_data.index[zipcode_missing].to_numpy()
zipnulllatlng = accident_data.loc[zipnullindex, ['Start_Lat','Start_Lng']]

airport_code_missing = accident_data['Airport_Code'].isna()
acnullindex = accident_data.index[airport_code_missing].to_numpy()

timezone_missing = accident_data['Timezone'].isna()
tznullindex = accident_data.index[timezone_missing].to_numpy()
tznulllatlng = accident_data.loc[tznullindex, ['Start_Lat','Start_Lng']]

In [None]:
#initialize Nominatim for finding missing cities and zipcodes based on lat/lng
#NOTE(review): Nominatim is a shared public service — presumably the lookup loops need throttling; confirm against its usage policy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="geoapiExercises")

In [None]:
#reverse-geocode the start coordinates of one row and return the 'address' dict of the result
#an empty dict is returned when the geocoder finds nothing, so callers can use .get() safely
def _reverse_address(latlng, row):
    location = geolocator.reverse(latlng.loc[row,'Start_Lat'].astype('str')+","+latlng.loc[row,'Start_Lng'].astype('str'))
    if location is None:
        return {}
    return location.raw['address']

#fill in missing cities (empty string when the geocoder returns no city)
for i in citynullindex:
    address = _reverse_address(citynulllatlng, i)
    city = address.get('city', '')
    accident_data.loc[i,'City'] = city

#fill in missing zipcodes
for i in zipnullindex:
    address = _reverse_address(zipnulllatlng, i)
    zipcode = address.get('postcode')
    #if cannot locate zipcode with geolocator, fill in zipcode mode for the state
    if zipcode is None:
        same_state = accident_data['State'] == accident_data.loc[i,'State']
        zipcode = accident_data.loc[same_state,'Zipcode'].mode(dropna=True)[0]
    accident_data.loc[i,'Zipcode'] = zipcode

In [None]:
#replace missing airport_code data with most common airport_code for each state
null_states = accident_data.loc[acnullindex,'State']
ac_states = pd.DataFrame(null_states.unique(), columns=['State'])
#most frequent non-null airport code of every state that has at least one missing code
ac_states['Mode'] = [
    accident_data.loc[accident_data.State == state,'Airport_Code'].mode(dropna=True)[0]
    for state in ac_states['State']
]
#look the mode up for all affected rows in one vectorized step instead of assigning row by row
accident_data.loc[acnullindex,'Airport_Code'] = null_states.map(ac_states.set_index('State')['Mode'])

In [None]:
#install timezonefinder (the leading ! is an IPython shell escape, so this line only runs in a notebook/IPython session)
!pip install timezonefinder

In [None]:
#fill in missing time zones by looking up the start coordinates of each affected row
from timezonefinder import TimezoneFinder
tz_finder = TimezoneFinder()
for i in tznullindex:
    start_lng = tznulllatlng.loc[i,'Start_Lng']
    start_lat = tznulllatlng.loc[i,'Start_Lat']
    accident_data.loc[i,'Timezone'] = tz_finder.timezone_at(lng=start_lng, lat=start_lat)

In [None]:
#copy values from start lat/lng to end lat/lng for null values
#(Series.fillna with a Series aligns on the row index, so every missing entry takes the value from its own row)
accident_data['End_Lat'] = accident_data['End_Lat'].fillna(accident_data['Start_Lat'])
accident_data['End_Lng'] = accident_data['End_Lng'].fillna(accident_data['Start_Lng'])

#replace missing wind chill data with the temperature of the same record
accident_data['Wind_Chill(F)'] = accident_data['Wind_Chill(F)'].fillna(accident_data['Temperature(F)'])

#replace missing temperature data with the wind chill of the same record
#(rows where both are missing stay null here; both columns are in median_impute)
accident_data['Temperature(F)'] = accident_data['Temperature(F)'].fillna(accident_data['Wind_Chill(F)'])

#replace missing data for when the weather was collected with the end time of the accident
accident_data['datetime_weathertime_parsed'] = accident_data['datetime_weathertime_parsed'].fillna(accident_data['datetime_end_parsed'])

In [None]:
#variables to impute with median or mode strategy
median_impute = [
    'Temperature(F)',
    'Wind_Chill(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
]
mode_impute = [
    'Wind_Direction',
    'Weather_Condition',
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

In [None]:
#mode impute (SimpleImputer for loop is too slow)
#hard-coded fill value per column (presumably each column's mode — verify against value_counts)
mode_fill_values = {
    'Wind_Direction': 'CALM',
    'Weather_Condition': 'Fair',
    'Sunrise_Sunset': 'Day',
    'Civil_Twilight': 'Day',
    'Nautical_Twilight': 'Day',
    'Astronomical_Twilight': 'Day',
}
#each column is filled using its own null entries, so no column depends on another column's missing rows
for column, fill_value in mode_fill_values.items():
    accident_data[column] = accident_data[column].fillna(fill_value)

In [None]:
#impute variables with median values
import numpy as np
from sklearn.impute import SimpleImputer

#fit the medians and fill the gaps in a single fit_transform call
#the returned array is written back positionally, so the result does not rely on index alignment
imputer1 = SimpleImputer(missing_values=np.nan, strategy='median')
accident_data[median_impute] = imputer1.fit_transform(accident_data[median_impute])

In [None]:
#print number and percentage of null entries per variable
print('Null values per variable')
total_rows = len(accident_data)
for column in accident_data.columns:
    #count the nulls once and reuse the count for the percentage
    null_count = pd.isnull(accident_data[column]).sum()
    print('{}: {} ({}%)'.format(column, null_count, (null_count/total_rows)*100))

In [None]:
#separate the date/time columns into one numeric column per component, named '<source column> <part>',
#and register every new column in the numerical list
for col in datetime:
    for part in ('day', 'month', 'year', 'hour', 'minute', 'second'):
        accident_data['{} {}'.format(col, part)] = getattr(accident_data[col].dt, part)
    numerical.extend('{} {}'.format(col, part) for part in ('day', 'month', 'year', 'hour', 'minute', 'second'))

#remove datetime columns from dataframe
accident_data = accident_data.drop(columns=datetime)

# PCA

In [None]:
#random 1% sample of the numerical columns
pca_sample_size = int(len(accident_data)/100)
accident_sample = accident_data[numerical].sample(pca_sample_size)

In [None]:
#standardize sample: subtract the column mean and divide by the column standard deviation
sample_mean = accident_sample.mean(axis=0)
sample_std = accident_sample.std(axis=0)
accident_data_for_PCA_standardized = (accident_sample - sample_mean) / sample_std

from sklearn.decomposition import PCA

# Fit six principal components and project the standardized sample onto them
pca = PCA(n_components=6)
accident_data_pca = pca.fit_transform(accident_data_for_PCA_standardized)

# Wrap the projection in a dataframe with columns PC1..PCn
component_names = ['PC{}'.format(k) for k in range(1, accident_data_pca.shape[1] + 1)]
accident_data_pca = pd.DataFrame(accident_data_pca, columns=component_names)

accident_data_pca.head()

In [None]:
#loadings table: one row per original feature, one column per principal component
loadings = pd.DataFrame(
    data=pca.components_.T,  # transposed so that features become rows
    index=numerical,
    columns=component_names,
)
loadings

#PC1: year
#PC2: hour
#PC3: month
#PC4: day
#PC5: latitude
#PC6: longitude

In [None]:
def plot_variance(pca, width=8, dpi=100):
    """Plot the explained and the cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : object exposing ``n_components_`` and ``explained_variance_ratio_``
    width : figure width, passed to ``fig.set(figwidth=...)``
    dpi : figure resolution, passed to ``fig.set(dpi=...)``

    Returns
    -------
    The pair of axes created by ``plt.subplots(1, 2)``: a bar chart of the
    per-component ratio and a line plot of the cumulative ratio.
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (a leading 0 is prepended so the line starts at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure using the caller-supplied size and resolution
    fig.set(figwidth=width, dpi=dpi)
    return axs

#plot the explained variance of the fitted PCA (the trailing semicolon presumably suppresses the notebook's echo of the returned axes)
plot_variance(pca);

# Feature Engineering

In [None]:
#create variables for the absolute change in lng and lat between the start and end of each accident
accident_data['abs_lng_change'] = (accident_data['End_Lng'] - accident_data['Start_Lng']).abs()
accident_data['abs_lat_change'] = (accident_data['End_Lat'] - accident_data['Start_Lat']).abs()

#add new variables to numerical variables list
numerical.extend(['abs_lng_change', 'abs_lat_change'])