In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster

In [2]:
 # Load the raw incident table from the CSV in the working directory.
 df = pd.read_csv('GSAF5.csv')

In [3]:
# (rows, columns) of the raw table, before any filtering.
df.shape

(25788, 24)

In [4]:
# Keep only incidents recorded in the USA from 1980 onwards.
recent_usa = (df['Year'] >= 1980) & (df['Country'] == 'USA')
# Take an explicit copy: later cells add and overwrite columns on this
# frame, and assigning into a filtered slice of the raw table triggers
# pandas' SettingWithCopyWarning.
df = df.loc[recent_usa].copy()

In [5]:
# (rows, columns) after the Year / Country filter.
df.shape

(1608, 24)

In [6]:
# Build the free-text geocoding query as "<Location>,<Country>".
# An earlier variant also included the Area column:
#   df['Location']+','+df['Area']+','+df['Country']
df['address']=df['Location']+','+df['Country']

In [7]:
# Add the two coordinate columns, pre-filled with the string sentinel 'NaN'
# that the geocoding and clean-up cells work with.
df = df.assign(Latitude='NaN', Longitude='NaN')

In [8]:
# Geocode every incident address with Nominatim (OpenStreetMap).
#
# One client is shared by all requests, and calls go through geopy's
# RateLimiter so at most one request per second is sent (Nominatim's public
# usage policy). With its default settings the RateLimiter retries a failed
# call and then returns None instead of raising, so a transient network
# error becomes an ungeocoded row rather than aborting the whole run.
geolocator = Nominatim(user_agent="Shark_attack", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Query each distinct address once; many incidents share a location.
# Missing (NaN) addresses are skipped rather than sent to the service.
coords_by_address = {}
for address in df['address'].dropna().unique():
    location = geocode(address)
    if location is not None:
        coords_by_address[address] = (location.latitude, location.longitude)

# Write the coordinates back row by row. Rows without a hit keep the string
# sentinel 'NaN', which the next cell converts to a real NaN.
for index, address in df['address'].items():
    lat, lon = coords_by_address.get(address, ('NaN', 'NaN'))
    df.loc[index, 'Latitude'] = lat
    df.loc[index, 'Longitude'] = lon

In [9]:
# Turn the 'NaN' string sentinel into a real missing value in both
# coordinate columns.
for coord_col in ('Latitude', 'Longitude'):
    df[coord_col] = df[coord_col].replace('NaN', np.nan)

In [10]:
# Discard rows that have no latitude or no address (a row is dropped when
# either field is missing).
df = df.dropna(axis=0, subset=['Latitude', 'address'])

In [11]:
# (rows, columns) remaining after dropping rows without coordinates/address.
df.shape

(1154, 27)

In [12]:
# Make sure both coordinate columns hold numeric values.
df = df.assign(
    Longitude=pd.to_numeric(df["Longitude"]),
    Latitude=pd.to_numeric(df["Latitude"]),
)

In [13]:
# Persist the filtered, geocoded table (the index is written as the first column).
df.to_csv('shark_1980_USA.csv')

In [14]:
# Centre point shared by all folium maps below: (0, 0) rather than a
# geocoded location.
latitude, longitude = 0, 0

In [15]:
# World map with one circle marker per geocoded shark incident.
map_Sharks = folium.Map(location=[latitude, longitude], zoom_start=2)

# Styling shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

incident_rows = df[['Latitude', 'Longitude', 'Date', 'Species ', 'address']].itertuples(
    index=False, name=None
)
for lat, lng, date, Species, address in incident_rows:
    # Popup text: date, species and the address that was geocoded.
    popup_text = f'{date}, {Species}, {address}'
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style,
    ).add_to(map_Sharks)

map_Sharks

In [16]:
# List the column names (note that 'Sex ' and 'Species ' carry a trailing space).
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23', 'address', 'Latitude', 'Longitude'],
      dtype='object')

In [17]:
# Feature matrix for clustering: the two coordinate columns, in the same
# order (Latitude, Longitude) that dropping every other column produced.
# Selecting them directly avoids a KeyError whenever the CSV schema changes
# (the drop list had to name all 25 other columns exactly, trailing spaces
# included).
X = df[['Latitude', 'Longitude']].values

In [71]:
# Clustering hyper-parameters passed to OPTICS below as `eps` and
# `min_samples`.
epsilon, minimumSamples = 0.8, 25

In [72]:
from sklearn.cluster import OPTICS
import sklearn.utils
from sklearn.preprocessing import StandardScaler

# Standardise each coordinate column before clustering.
X = StandardScaler().fit_transform(X)

# Compute OPTICS.
# NOTE(review): per the scikit-learn documentation `eps` is only used when
# cluster_method='dbscan'; with the default 'xi' method it has no effect.
# It is still passed here so the call is unchanged -- confirm the intent.
db = OPTICS(eps=epsilon, min_samples=minimumSamples).fit(X)

# scikit-learn labels noise points -1 and clusters 0, 1, 2, ...
# The labels are shifted by LABEL_OFFSET before being stored, so in
# `Clus_OP` noise is NOISE_LABEL (1) and real clusters start at 2.
LABEL_OFFSET = 2
NOISE_LABEL = -1 + LABEL_OFFSET
labels = db.labels_ + LABEL_OFFSET
df["Clus_OP"] = labels

# clusterNum counts every distinct label, noise included; realClusterNum
# excludes noise. The check must use the shifted noise label: testing for
# -1 after the shift never matches, so noise was counted as a cluster.
clusterNum = len(set(labels))
realClusterNum = clusterNum - (1 if NOISE_LABEL in labels else 0)

# A sample of clusters
df[['Date', 'Species ', 'address', "Clus_OP"]].head(10)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Unnamed: 0,Date,Species,address,Clus_OP
0,31-Jul-20,,"New Smyrna Beach, Volusia County,USA",2
1,31-Jul-20,Blacktip or Spinner shark,"Orange Beach, Baldwin County,USA",1
3,29-Jul-20,"Lemon shark, 8'","Florida Keys,USA",10
6,27-Jul-20,"White shark, 11'?","Bailey Island, Cumberland County,USA",1
7,23-Jul-20,,"New Smyrna Beach, Volusia County,USA",2
9,19-Jul-20,,"Cocoa Beach, Brevard County,USA",6
14,22-Jun-20,Juvenile bull shark?,"Homestead, Miami-Dade County,USA",1
15,20-Jun-20,,"Salvo, Dare County,USA",1
20,4-Jun-20,Shark involvement unconfirmed but considered p...,"Herring Point, Sussex County,USA",1
29,1-May-20,5' to 6' shark,"Summerland, Santa Barbara County,USA",13


In [73]:
# Distinct (shifted) cluster labels that were assigned.
set(labels)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}

In [74]:
# Largest (shifted) cluster label; used below to size the colour palette.
num_cluster = labels.max()

In [75]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [76]:
# World map of every geocoded incident, coloured by its OPTICS cluster label.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=1)

# One rainbow colour per label value (num_cluster colours in total).
colors_array = cm.rainbow(np.linspace(0, 1, num_cluster))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per incident.
for lat, lng, date, Species, address, cluster in zip(df['Latitude'], df['Longitude'], df['Date'], df['Species '], df['address'], df['Clus_OP']):
    label = '{}, {}, {}, {}'.format(date, Species, address, cluster)
    # Escape the text, as the other maps in this notebook do: species and
    # address strings contain quotes and other special characters.
    label = folium.Popup(label, parse_html=True)

    # Labels were stored as OPTICS label + 2, so real clusters (>= 2) map to
    # palette indices 0, 1, 2, ... and label 1 maps to index -1, i.e. the
    # last palette colour.
    marker_color = rainbow[cluster - 2]

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.8).add_to(map_clusters)

map_clusters

In [77]:
# Preview the table now that it carries the Clus_OP label column.
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23,address,Latitude,Longitude,Clus_OP
0,2020.07.31.c,31-Jul-20,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Megan Tossi,F,...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.07.31.c,2020.07.31.c,6551.0,,,"New Smyrna Beach, Volusia County,USA",29.025813,-80.927127,2
1,2020.07.31.b,31-Jul-20,2020.0,Unprovoked,USA,Alabama,"Orange Beach, Baldwin County",Swimming,Max Chilton,M,...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.07.31.b,2020.07.31.b,6550.0,,,"Orange Beach, Baldwin County,USA",30.282869,-87.624195,1
3,2020.07.29.b,29-Jul-20,2020.0,Unprovoked,USA,Florida,Florida Keys,Diving,Justin Stuller,M,...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.07.29.b,2020.07.29.b,6548.0,,,"Florida Keys,USA",24.667088,-81.583142,10
6,2020.07.27.a,27-Jul-20,2020.0,Unprovoked,USA,Maine,"Bailey Island, Cumberland County",Swimming,Julie Dimperio Holowachÿ,F,...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.07.27.a,2020.07.27.a,6545.0,,,"Bailey Island, Cumberland County,USA",43.733562,-69.99435,1
7,2020.07.23,23-Jul-20,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Standing,Carson Dicks,M,...,http://sharkattackfile.net/spreadsheets/pdf_di...,2020.07.23,2020.07.23,6544.0,,,"New Smyrna Beach, Volusia County,USA",29.025813,-80.927127,2


In [78]:
# Number of incidents per cluster label, largest first.
df['Clus_OP'].value_counts()

1     428
2     179
13     67
9      58
11     47
14     45
6      40
15     39
5      38
4      36
3      36
8      34
7      30
12     26
10     26
16     25
Name: Clus_OP, dtype: int64

In [79]:
# Keep only rows whose cluster label is not 1.
# NOTE(review): labels were stored as OPTICS label + 2, so 1 is presumably
# the noise label (-1) -- confirm.
df2 = df.loc[df['Clus_OP'].ne(1)]

In [80]:
# (rows, columns) after removing the rows labelled 1.
df2.shape

(726, 28)

In [81]:
# Incidents per cluster as a two-column frame (Clus_OP, Count), ordered by
# descending count.
df3 = (
    df2['Clus_OP']
    .value_counts()
    .rename_axis('Clus_OP')
    .reset_index(name='Count')
)
df3.head()

Unnamed: 0,Clus_OP,Count
0,2,179
1,13,67
2,9,58
3,11,47
4,14,45


In [82]:
#### Take the average latitude and longitude per Clus_OP in df2, then add those values to df3.

In [83]:
# Mean latitude of each cluster, with Clus_OP kept as a regular column.
df4 = df2.groupby('Clus_OP')['Latitude'].mean().reset_index()
df4.head()

Unnamed: 0,Clus_OP,Latitude
0,2,29.025722
1,3,29.088538
2,4,29.203099
3,5,30.341784
4,6,28.314132


In [84]:
# Mean longitude of each cluster, with Clus_OP kept as a regular column.
df5 = df2.groupby('Clus_OP')['Longitude'].mean().reset_index()

In [85]:
# Combine per-cluster count, mean latitude and mean longitude into one frame
# keyed by Clus_OP. Each input is indexed without being mutated and only the
# needed column is taken, so the cell can be re-run safely: in-place
# set_index calls removed the Clus_OP column from df4/df5 and made a second
# run fail.
df3 = pd.concat(
    [
        df3.set_index('Clus_OP')[['Count']],
        df4.set_index('Clus_OP')[['Latitude']],
        df5.set_index('Clus_OP')[['Longitude']],
    ],
    axis=1,
    sort=False,
).reset_index()

In [97]:
# Show up to 17 rows of the per-cluster summary.
df3.head(17)

Unnamed: 0,Clus_OP,Count,Latitude,Longitude
0,2,179,29.025722,-80.926855
1,3,36,29.088538,-80.946618
2,4,36,29.203099,-81.013944
3,5,38,30.341784,-81.435778
4,6,40,28.314132,-80.610046
5,7,30,28.082866,-80.564845
6,8,34,27.275351,-80.238358
7,9,58,26.798031,-80.077333
8,10,26,24.74618,-81.243905
9,11,47,32.630138,-80.07035


In [96]:
# Smallest and largest cluster size, then the highest cluster label, one
# value per line.
print(df3.Count.min(), df3.Count.max(), df3.Clus_OP.max(), sep='\n')

25
179
16


In [95]:
# World map with one circle per cluster, placed at the cluster's mean
# coordinates and sized by its incident count.
map_Sharks = folium.Map(location=[latitude, longitude], zoom_start=2)

# Styling shared by every marker.
centroid_style = dict(
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

centroid_rows = df3[['Latitude', 'Longitude', 'Count']].itertuples(index=False, name=None)
for lat, lng, Count in centroid_rows:
    # The popup shows the number of incidents in the cluster.
    count_popup = folium.Popup(str(Count), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=Count/10,
        popup=count_popup,
        **centroid_style,
    ).add_to(map_Sharks)

map_Sharks

In [98]:
# Persist the geocoded table together with its Clus_OP cluster labels.
df.to_csv('sharksCords.csv')