In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from functools import partial
from shapely.ops import transform
import pyproj
import math
from shapely.ops import cascaded_union
from sklearn.cluster import DBSCAN
import mplleaflet
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn import metrics
from matplotlib.colors import LinearSegmentedColormap

In [2]:
def load_data(data_dir='/home/shreyas/Desktop/major2/data'):
    """Load the 2013-2016 road-accident CSV files into a single DataFrame.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the four yearly CSV files. The default is the
        location this notebook was originally run from; pass another
        directory to run it elsewhere.

    Returns
    -------
    pandas.DataFrame
        Rows of all four files stacked newest year first (2016, 2015, 2014,
        2013). Each file keeps its own row index, so index values repeat
        across years.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Read the identifier columns as text so values are not reinterpreted
    # as numbers (which would, for example, drop leading zeros).
    column_types = {'Accident_Index': str, 'LSOA_of_Accident_Location': str}

    # Newest first. The 2016 file name is spelled 'Dft...' rather than
    # 'DfT...'; it is kept exactly as the notebook referenced it.
    file_names = [
        'DftRoadSafety_Accidents_2016.csv',
        'DfTRoadSafety_Accidents_2015.csv',
        'DfTRoadSafety_Accidents_2014.csv',
        'DfTRoadSafety_Accidents_2013.csv',
    ]

    base_dir = Path(data_dir)
    yearly_frames = [
        pd.read_csv(base_dir / file_name, dtype=column_types)
        for file_name in file_names
    ]

    # pd.concat replaces DataFrame.append, which is no longer available in
    # pandas 2.x, and stacks all frames in a single step.
    return pd.concat(yearly_frames)

In [3]:
# Read the yearly accident CSV files into one combined DataFrame.
data = load_data()

In [4]:
# (rows, columns) of the combined raw data, before any cleaning.
data.shape

(561689, 32)

In [5]:
# Coerce the coordinate columns to numbers. Values that cannot be parsed
# become NaN, and rows with a NaN coordinate are dropped because the
# clustering below needs a valid position for every accident.
data = data.assign(
    Latitude=pd.to_numeric(data['Latitude'], errors='coerce'),
    Longitude=pd.to_numeric(data['Longitude'], errors='coerce'),
).dropna(subset=['Latitude', 'Longitude'])

# astype returns a new object, so the result has to be assigned for the
# float64 conversion to take effect.
data = data.astype({'Latitude': 'float64', 'Longitude': 'float64'})

# Summary statistics of the cleaned numeric columns.
data.describe()


Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,Local_Authority_(District),...,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident
count,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,...,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0,561654.0
mean,446806.88088,290595.4,-1.332075,52.502879,30.060094,2.830483,1.837024,1.32767,4.105846,345.17277,...,358.135359,0.008493,0.836978,1.95316,1.526707,1.316654,0.095529,0.064883,1.347249,1.203889
std,95247.243189,158406.5,1.398938,1.426531,25.278252,0.405227,0.711631,0.811918,1.914869,258.773383,...,1256.923309,0.127801,1.930658,1.654816,1.575345,0.584271,0.68651,0.594162,0.476242,0.412539
min,66435.0,10290.0,-7.48941,49.912941,1.0,1.0,1.0,1.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,384599.25,176545.2,-2.231652,51.474671,6.0,3.0,1.0,1.0,2.0,106.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
50%,451293.5,241790.0,-1.23975,52.057233,30.0,3.0,2.0,1.0,4.0,324.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,526790.0,391268.0,-0.171623,53.4154,45.0,3.0,2.0,1.0,6.0,517.0,...,0.0,0.0,0.0,4.0,1.0,2.0,0.0,0.0,2.0,1.0
max,655309.0,1197917.0,1.758797,60.661117,98.0,3.0,67.0,93.0,7.0,941.0,...,9999.0,2.0,8.0,7.0,9.0,5.0,7.0,7.0,2.0,3.0


In [6]:
# Add radian versions of the coordinates (the haversine metric used for
# clustering works on radians). Mapping is: source column -> new column.
degree_to_radian_columns = (
    ('Longitude', 'rad_lng'),
    ('Latitude', 'rad_lat'),
)
for degree_col, radian_col in degree_to_radian_columns:
    data[radian_col] = data[degree_col] * math.pi / 180.0

In [7]:
# DBSCAN settings.
eps_in_meters = 50.0           # neighbourhood radius, in meters
num_samples = 10               # value passed to DBSCAN as min_samples
earth_perimeter = 40070000.0   # Earth circumference used for the conversion, in meters

# Express the radius as an angle: the fraction of the full circumference
# covered by eps_in_meters, times a full turn (tau = 2 * pi radians).
eps_in_radians = eps_in_meters / earth_perimeter * math.tau


In [8]:
# Cluster accidents by great-circle distance. The haversine metric expects
# (latitude, longitude) pairs in radians, hence the column order.
coords_in_radians = data[['rad_lat', 'rad_lng']]
clusterer = DBSCAN(
    eps=eps_in_radians,
    min_samples=num_samples,
    metric='haversine',
)
data['cluster'] = clusterer.fit_predict(coords_in_radians)

In [9]:
# Cluster label of every accident as a NumPy array. DBSCAN marks points
# that belong to no cluster with the label -1.
labels = data['cluster'].to_numpy()
print(labels)
print(len(labels))
type(labels)

[ 0 -1 -1 ... -1 -1 -1]
561654


numpy.ndarray

In [12]:
# Number of distinct labels. If any point was labelled -1 (noise), that
# label is counted here too, so the number of real clusters is one less.
len(np.unique(labels))

3833

In [13]:
# Accident severity of every accident as a NumPy array; used below as the
# reference labelling the clusters are scored against.
labels_true = data['Accident_Severity'].to_numpy()
print(len(labels_true))
print(labels_true)

561654
[3 3 3 ... 3 2 3]


In [14]:
# Score the cluster labels against accident severity. The combined helper
# returns all three values from one call instead of repeating the same
# computation once per metric.
homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
    labels_true, labels
)
print("Homogeneity: %0.3f" % homogeneity)
print("Completeness: %0.3f" % completeness)
print("V-measure: %0.3f" % v_measure)

Homogeneity: 0.017
Completeness: 0.006
V-measure: 0.009
