# Visualising the collision dataset

Using geospatial analysis to look for interesting trends in the dataset.
This is certainly a work in progress; feel free to comment or offer any pointers (particularly with the folium Choropleth maps... I still haven't figured those out!) :) 

Enjoy.

In [None]:
!pip install chart_studio


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlite3

import os

import geopandas as gpd # not used?
from geopandas import GeoDataFrame
from shapely.geometry import Point

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


import folium
from folium import Choropleth
from folium.plugins import HeatMap

from mpl_toolkits.basemap import Basemap # plotting maps

#plotly graphing
# plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import fiona
from shapely.geometry import MultiPoint, Point, Polygon,shape
from shapely.geometry.polygon import Polygon


In [None]:
pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

In [None]:
us_countys = pd.read_csv('../input/covid19-us-county-jhu-data-demographics/us_county.csv')

In [None]:
county_demographics_ca = us_countys[us_countys.state == "California"]
county_demographics_ca.head()

In [None]:
# Create a SQL connection to our SQLite database
con = sqlite3.connect("/kaggle/input/california-traffic-collision-data-from-switrs/switrs.sqlite")


## The Dataset
There are three main tables:

* collisions: Contains information about the collision, where it happened, what vehicles were involved.
* parties: Contains information about the groups of people involved in the collision including age, sex, and sobriety.
* victims: Contains information about the injuries of specific people involved in the collision.

There is also a table called case_ids which @Alexgude used to build the other tables. It tells you which of the four original datasets each row came from.

https://tims.berkeley.edu/help/SWITRS.php The data dictionary

In [None]:
case_ids = pd.read_sql_query("SELECT * from case_ids LIMIT 5", con)
print('Case IDS\n')
print(case_ids.columns.values)
case_ids.head()

In [None]:
collision_count = pd.read_sql_query("SELECT COUNT() from collisions LIMIT 1", con)
print('Count of collions\n')
collision_count.head()

There are just over 9 million collision records

In [None]:
collisions = pd.read_sql_query("SELECT * from collisions LIMIT 5", con)
print('Collisions\n')
print(collisions.columns.values)
collisions.head()

In [None]:
parties = pd.read_sql_query("SELECT * from parties LIMIT 5", con)
print('Parties\n')
print(parties.columns.values)
parties.head()

In [None]:
victims = pd.read_sql_query("SELECT * from victims LIMIT 5", con)
print('Victims\n')
print(victims.columns.values)
victims.head()

## Geospatial overview

In [None]:
def embed_map(m, file_name):
    """Save a map to disk and return an inline frame that displays it.

    Parameters
    ----------
    m : object
        Map to embed; only its ``save(file_name)`` method is used here.
    file_name : str
        Path the map is written to and that the returned frame points at.

    Returns
    -------
    IPython.display.IFrame
        Frame sourced from ``file_name``, 100% wide and 500px tall.
    """
    from IPython.display import IFrame

    # Write the map out first so the frame has a file to load.
    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='500px')
    return frame

In [None]:
# Fetch one row per jurisdiction code, keeping only collisions that have
# both coordinates and a jurisdiction recorded.
# latitude and longitude are not aggregated, so with GROUP BY each result
# row carries the coordinates of a single collision from that jurisdiction
# (SQLite does not specify which one). That pair is used as the
# jurisdiction's sample location.
query = (
    "SELECT latitude, longitude, jurisdiction "
    "FROM collisions "
    "WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND jurisdiction IS NOT NULL "
    "GROUP BY jurisdiction"
)

    
# Construct a DataFrame from the results
juristictions = pd.read_sql_query(query, con)

In [None]:
#The sample of 10,000 points shows that the data including land and water collions.
"""query = (
    "SELECT latitude, longitude "
    "FROM collisions "
    "WHERE case_id IN (SELECT case_id FROM collisions WHERE latitude IS NOT NULL AND longitude IS NOT NULL ORDER BY RANDOM() LIMIT 10000)"
)

    
    # Construct a Dataframe from the results
incident_locations_sample = pd.read_sql_query(query, con)"""

In [None]:
juristictions.head()


In [None]:
# Save the data as csv files  - from sstewart0
juristictions.to_csv('juristictions.csv',index=False)

In [None]:
# We will change dtype when we need it, multiple types are memory inefficient
#juristictions = pd.read_csv('juristictions.csv',dtype=str)
juristictions = pd.read_csv('juristictions.csv')
juristictions.head()

Before plotting anything, we also want some understanding of where the incidents in each jurisdiction are, so we have included a county dataset.

In [None]:
county_boundaries = gpd.read_file("../input/california-counties/CA_Counties_TIGER2016.shp")
county_boundaries.set_index('NAME', inplace=True)
county_boundaries['GEOID'] = [s.lstrip("0") for s in county_boundaries['GEOID']]
county_boundaries = county_boundaries.rename(columns={"GEOID": "fips"})
county_boundaries["fips"] = county_boundaries["fips"].astype(str).astype(int)
county_boundaries.head()

In [None]:
county_boundaries = county_boundaries.join(county_demographics_ca.set_index('fips'), on='fips')

In [None]:
county_boundaries.head()

In [None]:
county_boundaries = county_boundaries.to_crs(epsg=4326)

In [None]:
county_boundaries['COUNTYFP_KEY'] = county_boundaries['COUNTYFP']
counties = county_boundaries[["COUNTYFP", "geometry","COUNTYFP_KEY","fips"]].set_index("COUNTYFP")
#counties = county_boundaries[["COUNTYFP", "geometry"]]


In [None]:
counties.head()

In [None]:
## county_boundaries.plot(edgecolor='black')

In [None]:
# Anchor point for each county label.
# Centroids are computed in a projected CRS (EPSG:3310, California Albers)
# and then converted back to the frame's own CRS: taking a centroid directly
# on latitude/longitude degrees treats them as planar coordinates, which
# geopandas warns is likely to be inaccurate.
county_boundaries["center"] = (
    county_boundaries["geometry"]
    .to_crs(epsg=3310)
    .centroid
    .to_crs(county_boundaries.crs)
)

# Copy of the county frame whose active geometry is the label point rather
# than the county polygon.
za_points = county_boundaries.copy()
za_points.set_geometry("center", inplace = True)

In [None]:
ax = county_boundaries.plot(figsize = (15, 12), color = "whitesmoke", edgecolor = "lightgrey", linewidth = 0.5)
texts = []

for x, y, label in zip(za_points.geometry.x, za_points.geometry.y, county_boundaries["NAMELSAD"]):
    texts.append(plt.text(x, y, label, fontsize = 8))



In [None]:
# Find the county that contains each jurisdiction's sample location.

# Pull the county columns out once, so the inner loop walks plain tuples
# instead of re-running iterrows() over every county for every jurisdiction.
county_shapes = list(zip(
    county_boundaries['geometry'],
    county_boundaries['NAMELSAD'],
    county_boundaries['COUNTYFP'],
))

rows = []
for _, juristiction in juristictions.iterrows():
    # The point depends only on the jurisdiction, so build it once here
    # rather than once per county tested.
    point = Point(juristiction.longitude, juristiction.latitude)
    for county_geometry, county_name, county_fp in county_shapes:
        # The geometry column of the county GeoDataFrame already holds
        # shapely geometries, so they can be tested directly.
        if point.within(county_geometry):
            # Record the jurisdiction code, the matching county, and the
            # lat/long that suggested that county.
            rows.append([
                juristiction.jurisdiction,
                county_name,
                county_fp,
                juristiction.latitude,
                juristiction.longitude,
            ])
            break  # first containing county wins; move to the next jurisdiction

# Jurisdictions whose sample point falls inside no county polygon are left out.
juristiction_ids = pd.DataFrame(rows, columns=["juristiction", "NAMELSAD","COUNTYFP","lat","long"])

In [None]:
juristiction_ids.head()

In [None]:
juristiction_ids.groupby('NAMELSAD').juristiction.count()

In [None]:
query = (
    "SELECT latitude, longitude, jurisdiction "
    "FROM collisions "
    "WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND jurisdiction IS NOT NULL "
)

    
# Construct a Dataframe from the results
incident_locations_all = pd.read_sql_query(query, con)

In [None]:
incident_locations_all.to_csv('incident_locations_all.csv',index=False)

In [None]:
incident_locations_all = pd.read_csv('incident_locations_all.csv')
incident_locations_all.head()

In [None]:
# Add County to incident_locations_all
def add_county(juristiction_ids, df_to_modify):
    """Attach county name and county code to each row by jurisdiction.

    Each row of ``df_to_modify`` is matched on its ``jurisdiction`` value
    against the ``juristiction`` column of ``juristiction_ids``; matching
    rows receive that entry's ``NAMELSAD`` and ``COUNTYFP``.

    The lookup is done with one dict-backed ``map`` per output column
    instead of building a full-length boolean mask per jurisdiction, so the
    cost no longer grows with (number of jurisdictions x number of rows).

    Parameters
    ----------
    juristiction_ids : pandas.DataFrame
        Lookup table with columns ``juristiction``, ``NAMELSAD`` and
        ``COUNTYFP``. If a jurisdiction appears more than once, the last
        occurrence wins.
    df_to_modify : pandas.DataFrame
        Frame with a ``jurisdiction`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_to_modify`` object, with ``NAMELSAD`` and
        ``COUNTYFP`` columns set. Rows whose jurisdiction is not in the
        lookup keep their existing value if the column was already present,
        and are NaN otherwise.
    """
    jurisdictions = df_to_modify['jurisdiction']

    for column in ('NAMELSAD', 'COUNTYFP'):
        # dict(zip(...)) keeps the last entry for a repeated jurisdiction.
        lookup = dict(zip(juristiction_ids['juristiction'], juristiction_ids[column]))
        mapped = jurisdictions.map(lookup)

        if column in df_to_modify.columns:
            # Only overwrite rows that actually matched, so values already
            # present on unmatched rows are left untouched.
            matched = jurisdictions.isin(list(lookup))
            mapped = mapped.where(matched, df_to_modify[column])

        df_to_modify[column] = mapped

    return df_to_modify

In [None]:
incident_locations_all = add_county(juristiction_ids, incident_locations_all)

In [None]:
#incident_locations_all = incident_locations_all.set_index('COUNTYFP')
incident_locations_all.head()

In [None]:
# From @alexgude good work https://www.kaggle.com/alexgude/starter-california-traffic-collisions-from-switrs
fig = plt.figure(figsize=(20,20))

basemap = Basemap(
    projection='gall',
    llcrnrlon = -126,   # lower-left corner longitude
    llcrnrlat = 32,     # lower-left corner latitude
    urcrnrlon = -113,   # upper-right corner longitude
    urcrnrlat = 43,     # upper-right corner latitude
)

x, y = basemap(incident_locations_all['longitude'].values, incident_locations_all['latitude'].values)

basemap.plot(x, y, 'k.', markersize=1.5)

fig.show()

In [None]:
plot_dict = incident_locations_all.groupby('COUNTYFP').NAMELSAD.count()

In [None]:
plot_dict.head()

In [None]:
#plot_dict = incident_locations_all.groupby(['COUNTYFP','NAMELSAD']).count().reset_index()
#plot_dict = plot_dict.drop(columns=['latitude', 'longitude'])
#plot_dict = plot_dict.rename(columns={"jurisdiction": "Count"})
#plot_dict = plot_dict.set_index('COUNTYFP')
#plot_dict.head()

In [None]:
plot_all_crash = incident_locations_all.groupby(['COUNTYFP','NAMELSAD']).count().reset_index()
plot_all_crash['COUNTYFP_KEY'] = plot_all_crash['COUNTYFP']
plot_all_crash = plot_all_crash.drop(columns=['latitude', 'longitude'])
plot_all_crash = plot_all_crash.rename(columns={"jurisdiction": "Count"})
plot_all_crash = plot_all_crash.set_index('COUNTYFP')
plot_all_crash = plot_all_crash.sort_values(by='Count',ascending=False)
plot_all_crash.head()

In [None]:
plot_all_crash = plot_all_crash.join(county_boundaries.set_index('COUNTYFP'), on='COUNTYFP', lsuffix='_plot')

In [None]:
plot_all_crash.head()

In [None]:
#plot_all_crash = plot_all_crash.set_index('NAMELSAD_plot')

In [None]:
#incident per 100 people
plot_all_crash['incident_per_pop'] = (plot_all_crash['Count'] / plot_all_crash['population'])*100

The crash data sorted by the incidents per population

In [None]:
plot_all_crash = plot_all_crash.sort_values(by='incident_per_pop',ascending=False)
plot_all_crash.head()

Mono County has the highest collision rate by population - 35 collisions for every 100 residents over this time period.

What type of accidents happen there?  Where are they happening?

## Mono County

In [None]:
incidents_mono_county = incident_locations_all[incident_locations_all.NAMELSAD == "Mono County"]
incidents_mono_county.head()

In [None]:
jurist_mono_county = incidents_mono_county.groupby('jurisdiction').count()
jurist_mono_county

In [None]:
#    "SELECT collisions.case_id,  "
#    "primary_collision_factor, "
#    "type_of_collision, "  
#    "alcohol_involved,"
#    "latitude,"
#    "longitude, collision_date, collision_time,  "
#    "party_type, party_sex,"
#    "party_age, "
#    "vehicle_year, vehicle_make,  "
#    "victim_sex, victim_age"
#    "FROM collisions "
#    "LEFT JOIN parties ON collisions.case_id "
#    "LEFT JOIN victims ON collisions.case_id AND parties.party_number = victims.party_number "
#    "WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND jurisdiction IN ('2601','9246','9820') "

In [None]:
# Too big..  reduce..
# Select from collisions join party and victims where juristicion from three above
query = (
    "SELECT collisions.case_id,  "
    "latitude, "
    "longitude "
    "FROM collisions "
    "WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND jurisdiction IN ('2601','9246','9820') "
    
)
# Construct a Dataframe from the results
incident_full_mono_county = pd.read_sql_query(query, con)

In [None]:
incident_full_mono_county.to_csv('incident_full_mono_county.csv',index=False)

In [None]:
incident_full_mono_county = pd.read_csv('incident_full_mono_county.csv')
incident_full_mono_county.head()

In [None]:
# Create heat map of incidents in Mono County

mono_1 = folium.Map(location=[38,-119], zoom_start=8)
HeatMap(data=incident_full_mono_county[['latitude', 'longitude']], radius=12).add_to(mono_1)
tiles = ['openstreetmap',  'cartodbpositron', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(mono_1)

folium.LayerControl().add_to(mono_1)

# Show the map
mono_1

Most of the incidents happen on the main road 395 that runs south west across the county.  There are a number of bends that appear as hot spots.
There are hot spots around the airports and towns.
There are a few hotspots that are a little more unusual.
Such as the mountainous region around Lake Alpine.  Many of the roads around this area are not sealed and there are some steep inclines.
Near the town of Bishop the incident points appear off the road.  A possible reason for this, which would need to be investigated, is that the North Sierra Highway has replaced old roads.





***I am struggling to get the county name as a label..  I think the trick is somewhere around key_on and knowing the right GeoJSON and joining it with the data from plot_dict2***

In [None]:
map1 = folium.Map([37,-119], tiles='openstreetmap', zoom_start=6)
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(map1)
    
choropleth = folium.Choropleth(
    geo_data = counties.__geo_interface__,
    name = 'choropleth',
    data = plot_dict,
    #data = plot_all_crash,
    key_on="feature.id",
    #key_on="feature.properties.COUNTYFP_KEY",
    #columns = ['NAMELSAD', 'Count'],
    fill_color = 'YlGn',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'Collisions by county',
    highlight = True
).add_to(map1)
folium.LayerControl().add_to(map1)
# Display Label
# Display Region Label

map1

In [None]:
plot_all_crash.head()

In [None]:
plot_dict = plot_all_crash['incident_per_pop']

In [None]:
plot_dict

In [None]:
map1a = folium.Map([37,-119], tiles='openstreetmap', zoom_start=6)
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(map1a)
    
choropleth = folium.Choropleth(
    geo_data = counties.__geo_interface__,
    #geo_data = plot_all_crash.geometry,
    name = 'choropleth',
    data = plot_dict,
    #data = plot_all_crash,
    key_on="feature.id",
    #key_on="feature.properties.COUNTYFP_KEY",
    #columns = ['NAMELSAD', 'incident_per_pop'],
    fill_color = 'YlGn',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'Collisions by county per 100 people',
    highlight = True
).add_to(map1a)
folium.LayerControl().add_to(map1a)
# Display Label
# Display Region Label

map1a

In [None]:
counties = counties.geometry.to_crs(epsg=4326)

##  Attempting to get the attribute data into the map with a single pandas DataFrame...

In [None]:
incidents_by_county = incident_locations_all.groupby(['COUNTYFP','NAMELSAD']).count().reset_index()
incidents_by_county['COUNTYFP_KEY'] = incidents_by_county['COUNTYFP']
incidents_by_county = incidents_by_county.drop(columns=['latitude', 'longitude'])
incidents_by_county = incidents_by_county.rename(columns={"jurisdiction": "Count"})
incidents_by_county = incidents_by_county.set_index('COUNTYFP')

In [None]:
counties = county_boundaries[["COUNTYFP", "geometry","COUNTYFP_KEY","fips"]].set_index("COUNTYFP")
counties.head()

In [None]:
incidents_by_county.head()

In [None]:
counties_data = counties.join(incidents_by_county, lsuffix='_plot')
counties_data.head()

In [None]:
# Counties with no matching collision rows come out of the join with NaN.
# Fill Count with the number 0 rather than the string "0": a string would
# leave the column with mixed types, which breaks numeric use of it such as
# sorting or colour scaling.
counties_data['Count'] = counties_data.Count.fillna(0)
# Placeholder label for counties that have no name from the collision data.
counties_data['NAMELSAD'] = counties_data.NAMELSAD.fillna("--")

In [None]:
counties_data.head()

In [None]:
# Choropleth of collision counts per county, driven by the single joined
# frame (geometry + counts), with the county name shown on hover.
map1b = folium.Map([37,-119], tiles='openstreetmap', zoom_start=6)
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(map1b)

choropleth = folium.Choropleth(
    geo_data = counties_data.__geo_interface__,
    name = 'choropleth',
    # Count may hold the string "0" from the fill step earlier in the
    # notebook; coerce to numbers so a colour scale can be computed.
    data = counties_data.assign(Count=pd.to_numeric(counties_data['Count'])),
    # columns[0] is the key and columns[1] the value to shade by. The key
    # must be paired with key_on: without key_on folium does not link the
    # data to the shapes and every county gets the same fill.
    # COUNTYFP_KEY_plot is the county-side key, so it is present for every
    # county, including those with no collisions.
    columns = ['COUNTYFP_KEY_plot', 'Count'],
    key_on = "feature.properties.COUNTYFP_KEY_plot",
    fill_color = 'YlGn',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'Testing label on single dataframe',
    highlight = True
).add_to(map1b)

# Display the region label: show county name and count when hovering a shape.
folium.GeoJsonTooltip(
    fields = ['NAMELSAD', 'Count'],
    aliases = ['County', 'Collisions'],
).add_to(choropleth.geojson)

folium.LayerControl().add_to(map1b)

map1b

In [None]:
# Create heat map
m_2 = folium.Map(location=[37,-119], zoom_start=6)
HeatMap(data=incident_locations_all[['latitude', 'longitude']], radius=10).add_to(m_2)


# Show the map
m_2

## Collision table
What was the primary reason for the collision

**TODO - Bring in population data**
Look at incidents by population
demographics of collision types - possibly compare with demographics of county of interest

For other types of violation look at lat long (asleep doesn't have this)

And of course try to add the labels to the maps!

In [None]:
collisions = pd.read_sql_query("SELECT  COUNT(*) AS collisions, primary_collision_factor from collisions  GROUP BY primary_collision_factor ORDER BY collisions", con)



In [None]:
# Bar chart showing frquency of collision factors
plt.figure(figsize=(10,6))

# Add title
plt.title("Primary factor for collision")

plt.ticklabel_format(style='plain', axis='y')
sns.barplot(x=collisions['primary_collision_factor'], y=collisions['collisions'] )


# Add label for vertical axis
plt.ylabel("Frequency")
plt.xlabel("Primary factor")


### Drivers that fell asleep
Where and when did this happen?

In [None]:
collisions_asleep = pd.read_sql_query("SELECT * from collisions WHERE primary_collision_factor = 'fell asleep'  ORDER BY collision_date", con)


In [None]:
collisions_asleep = add_county(juristiction_ids, collisions_asleep)
collisions_asleep

In [None]:
plot_dict = collisions_asleep.groupby('COUNTYFP').NAMELSAD.count()

In [None]:
map1 = folium.Map([37,-119], tiles='openstreetmap', zoom_start=6)
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(map1)
    
choropleth = folium.Choropleth(
    geo_data = counties.__geo_interface__,
    name = 'choropleth',
    data = plot_dict,
    key_on="feature.id",
    #key_on="feature.properties.COUNTYFP_KEY",
    #columns = ['NAMELSAD', 'Count'],
    fill_color = 'YlGn',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'Collisions by county',
    highlight = True
).add_to(map1)
folium.LayerControl().add_to(map1)
# Display Label
# Display Region Label

map1

In [None]:
violation_category = pd.read_sql_query("SELECT COUNT(*) AS collisions, pcf_violation_category from collisions WHERE primary_collision_factor = 'vehicle code violation' GROUP BY pcf_violation_category ORDER BY collisions", con)



In [None]:
# Bar chart 
plt.figure(figsize=(10,6))

# Add title
plt.title("Primary factor for collision  - Violations")

plt.ticklabel_format(style='plain', axis='y')
sns.barplot(x=violation_category['collisions'], y=violation_category['pcf_violation_category'] )


# Add label for vertical axis
plt.xlabel("Frequency")
plt.ylabel("Violation category")

## Party types
What were the party types in the collisions

In [None]:
parties = pd.read_sql_query("SELECT COUNT(*) AS parties, party_type from parties GROUP BY party_type ORDER BY parties", con)


In [None]:
# Bar chart showing frquency of party types
plt.figure(figsize=(10,6))

# Add title
plt.title("Party type involved with collisions")

plt.ticklabel_format(style='plain', axis='y')
sns.barplot(x=parties['party_type'], y=parties['parties'] )


# Add label for vertical axis
plt.ylabel("Frequency")
plt.xlabel("Party Type")



In [None]:
# Close SQLite connection
con.close()