In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import random
import os

# Basic knowledge: 
To begin this project, it would be good to hold a minimum understanding of `Shark Attacks`.

As I did not know much about this topic at the day the project started, I have recurred to the shark-attack wiki: https://en.wikipedia.org/wiki/Shark_attack



# Defining the dataset path, and importing it to begin basic dataset exploration

Questions: 
- how to add the dataset to gitignore?

In [2]:
# To follow along and access the DataSet, download it from KAGGLE using this link
# https://www.kaggle.com/teajay/global-shark-attacks

# Once you have downloaded the DataSet, change the dataset variable to match the 
# path where you have saved the 'attacks.csv' file.
dataset = 'attacks.csv' 
df = pd.read_csv(dataset, encoding='latin-1')

Now, we will check some basic information about the dataset, in order to formulate a more educated hypothesis which we could actually put to test with the data available.

In [3]:
display(df.shape)# To know the shape of the DF
print(df.drop_duplicates().shape) # Shape when eliminating duplicates

(25723, 24)

(6312, 24)


Here, I notice that the shape of the df with no duplicates is very small when compared to the whole df. This seems weird and since the `drop_duplicates` function ignores time indexes, I'll try to compare both dataframes' time data

In [4]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:

df.Date

0        25-Jun-2018
1        18-Jun-2018
2        09-Jun-2018
3        08-Jun-2018
4        04-Jun-2018
            ...     
25718            NaN
25719            NaN
25720            NaN
25721            NaN
25722            NaN
Name: Date, Length: 25723, dtype: object

In [6]:
df_nodupes = df.drop_duplicates()
df_nodupes.Date

0        25-Jun-2018
1        18-Jun-2018
2        09-Jun-2018
3        08-Jun-2018
4        04-Jun-2018
            ...     
6307             NaN
6308             NaN
6309             NaN
8702             NaN
25722            NaN
Name: Date, Length: 6312, dtype: object

In [7]:
# Now, with a df smaller in size, I want to see what info is there on the last couple
# of columns which have unexplicit
df_nodupes[['Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23']]

Unnamed: 0,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...
6307,,,6309.0,,
6308,,,6310.0,,
6309,,,,,
8702,,,,,


In [8]:
# Too many null values... let's count them and let
print(df_nodupes.shape)
df_nodupes[['Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23']].isnull().sum()

(6312, 24)


Case Number.1       10
Case Number.2       10
original order       3
Unnamed: 22       6311
Unnamed: 23       6310
dtype: int64

In [9]:
# If only 1 value in the 'Unnamed: 22' column, and 2 values in the
# 'Unnamed: 22' column, I'll not consider this data for my analysis.
df_nodupes = df_nodupes.drop(columns=['Unnamed: 22', 'Unnamed: 23'])

In [10]:
# Now we'll look at the columns again
df_nodupes.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [11]:
# The following columns seemed a little bit rare, so i do a value count to find out what they are about
df_nodupes['Case Number.1'].value_counts()

1980.07.00      2
2013.10.05      2
2012.09.02.b    2
1913.08.27.R    2
1990.05.10      2
               ..
1959.05.00      1
1989.04.12      1
ND.0011         1
2010.12.17      1
2009.06.02.b    1
Name: Case Number.1, Length: 6285, dtype: int64

In [12]:
df_nodupes['Case Number.2'].value_counts()

1980.07.00      2
2013.10.05      2
1923.00.00.a    2
2012.09.02.b    2
1913.08.27.R    2
               ..
2005.08.14      1
1981.12.13.a    1
1959.05.00      1
1989.04.12      1
2009.06.02.b    1
Name: Case Number.2, Length: 6286, dtype: int64

In [13]:
df_nodupes['original order'].value_counts()

569.0     2
4603.0    1
4899.0    1
810.0     1
796.0     1
         ..
3508.0    1
3256.0    1
3106.0    1
3080.0    1
6272.0    1
Name: original order, Length: 6308, dtype: int64

In [14]:
#Since 'original order seems like arbitrary indexes, i'll drop it
df_nodupes = df_nodupes.drop(columns='original order')
df_nodupes.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2'],
      dtype='object')

In [15]:
# I want to check out some of the pdf and href
print(df_nodupes[['pdf', 'href']].isnull().sum())
df_nodupes[['pdf', 'href']]

pdf     10
href    10
dtype: int64


Unnamed: 0,pdf,href
0,2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...
6307,,
6308,,
6309,,
8702,,


In [16]:
# I want to check what is on those pdfs
# A random sample of the column 

# With a FOR loop
for i in range(10):
    e = random.choice(range(1000))
    print(f"index: {e}, link: {df_nodupes.iloc[e]['href']}")

# With Random sample
display(random.sample(list(df_nodupes['href']), 10))

# Below are a couple of the links, when I open them, I have found that they are seem quite structured
# It could be possible to parse them later down the road and use a REGEX to find more data

# I also have ran this column a few times to notice that all pdfs have actually been uploaded to
# the same website and have the same naming structure

index: 707, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2013.01.25-Cassaigne.pdf
index: 480, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.31-Clark.pdf
index: 275, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.b-Magee.pdf
index: 936, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2011.03.23-Pearson.pdf
index: 443, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2015.03.11-Graham.pdf
index: 131, link: http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.05-FrenchPolynesia.pdf
index: 983, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2010.10.01-Melkbaai-surfer.pdf
index: 295, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2013.03.28.a-Smyth.pdf
index: 385, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2015.07.25-Johnson.pdf
index: 971, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2010

['http://sharkattackfile.net/spreadsheets/pdf_directory/1989.02.15-Abel.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1952.00.00.a-PanAmPilot.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2004.01.21.b-Moeller.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1965.05.00.c-Falconer-Barker.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1976.10.27-AlremahII.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2007.07.00-Takyi.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1959.10.07-PortugueseSoldier.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1883.12.19-Lysaght.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2008.05.10-JasonCull.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2013.04.17-Cicarelli.pdf']

# After some random sampling, I've noticed that the following indexes have some mistakes.. 
# a REGEX can be used to fix problems like these 


In [17]:
print(df_nodupes.iloc[332]['href'])
print(df_nodupes.iloc[324]['href'])
print(df_nodupes.iloc[588]['href'])
print(df_nodupes.iloc[569]['href'])

http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.00.00.b-OceanicWhitetip.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.04.03-Armstrong.pdf


## It looks still like some of these pdfs are duplicates, even after dropping duplicates :

In [18]:
# how many times each pdf on the dataframe
df_pdf = df_nodupes["pdf"]
df_pdf.value_counts()

1898.00.00.R-Syria.pdf                2
1935.06.05.R-SolomonIslands.pdf       2
1921.11.27.a-b-Jack.pdf               2
1907.10.16.R-HongKong.pdf             2
1931.09.21.a-b-Holaday-Barrows.pdf    2
                                     ..
1880.00.00.c-Guadalcanal.pdf          1
1868.00.00.b-PindoIslanders.pdf       1
1925.03.12-Canning.pdf                1
1958.10.00.a-EastNakanai.pdf          1
2006.07.31.a-Martin.pdf               1
Name: pdf, Length: 6291, dtype: int64

In [19]:
# drop dupes and compare lengths
df_pdf_nodupes = df_pdf.drop_duplicates()

len(df_pdf) - len(df_pdf_nodupes), 'duped values'

(20, 'duped values')

## Since the lengths are not the same, I will check if those duplicated entries are only in this column

In [20]:
# there are 20 duplicated values on the pdf columns
df_nodupes.duplicated('pdf').value_counts()

False    6292
True       20
dtype: int64

In [21]:
# But only 18 dupes if we take Location into count
df_nodupes.duplicated(['pdf','Location']).value_counts()

False    6294
True       18
dtype: int64

### I'll look at the rest of the data now.

In [22]:
print(df_nodupes.shape)
df_nodupes.duplicated().sum()

(6312, 21)


7

In [23]:
df_nodupes = df_nodupes.drop_duplicates()
df_nodupes.duplicated().sum()

0

In [24]:
print(df_nodupes.shape)
df_nodupes[["Date", "Location", "pdf"]]

(6305, 21)


Unnamed: 0,Date,Location,pdf
0,25-Jun-2018,"Oceanside, San Diego County",2018.06.25-Wolfe.pdf
1,18-Jun-2018,"St. Simon Island, Glynn County",2018.06.18-McNeely.pdf
2,09-Jun-2018,"Habush, Oahu",2018.06.09-Denges.pdf
3,08-Jun-2018,Arrawarra Headland,2018.06.08-Arrawarra.pdf
4,04-Jun-2018,La Ticla,2018.06.04-Ramos.pdf
...,...,...,...
6300,1883-1889,"Panama Bay 8ºN, 79ºW",ND-0002-JulesPatterson.pdf
6301,1845-1853,"Below the English fort, Trincomalee",ND-0001-Ceylon.pdf
6302,,,
8702,,,


In [25]:
df_nodupes.Country.value_counts()

USA                 2229
AUSTRALIA           1338
SOUTH AFRICA         579
PAPUA NEW GUINEA     134
NEW ZEALAND          128
                    ... 
GUATEMALA              1
COOK ISLANDS           1
COMOROS                1
KOREA                  1
THE BALKANS            1
Name: Country, Length: 212, dtype: int64

In [26]:
# While checking the columns 'Species ' and 'Sex ' have unnecesary spaces at the end of the string
# to remove these, and also take out the '(Y/N)' from the column 'Fatal'

In [27]:
df_label = df_nodupes
df_label.columns = ['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time',
       'Species', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2']
df_label.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species',
       'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2'],
      dtype='object')

In [28]:
df_label['Fatal'].value_counts()

N          4293
Y          1388
UNKNOWN      71
 N            7
N             1
y             1
M             1
2017          1
Name: Fatal, dtype: int64

In [29]:
remove_spaces = lambda x:  x.remove(' ') if ' ' in x else x
"""

df_label['Fatal'] = list(
                            map(remove_spaces(
                            df_label['Fatal']),
                            ))
"""
    
df_label['Fatal'].value_counts()


# I want to see the indexes which have a duplicated pdf row
"""

dupes = []
for a,b in list(df_label['pdf'].duplicated().items()):
    if b:
        dupes.append(a)
dupes """

"\n\ndupes = []\nfor a,b in list(df_label['pdf'].duplicated().items()):\n    if b:\n        dupes.append(a)\ndupes "

In [30]:
"""
df_label.loc[dupes]
"""


'\ndf_label.loc[dupes]\n'

In [31]:
# dfx = df_nodupes["pdf"].value_counts() if 

# Transform this to sort the shark species


In [32]:
list(df_label['Species'].value_counts().items())

[('White shark', 163),
 ('Shark involvement prior to death was not confirmed', 105),
 ('Invalid', 102),
 ('Shark involvement not confirmed', 88),
 ('Tiger shark', 73),
 ('Shark involvement prior to death unconfirmed', 68),
 ('Bull shark', 52),
 ("6' shark", 40),
 ("4' shark", 40),
 ('Questionable incident', 35),
 ("1.8 m [6'] shark", 35),
 ('Questionable', 34),
 ("1.5 m [5'] shark", 32),
 ("1.2 m [4'] shark", 27),
 ("3' shark", 26),
 ("5' shark", 26),
 ('2 m shark', 25),
 ("4' to 5' shark", 24),
 ("3 m [10'] shark", 22),
 ('No shark involvement', 21),
 ('Wobbegong shark', 21),
 ("3' to 4' shark", 18),
 ('3 m shark', 17),
 ("2.4 m [8'] shark", 16),
 ("3.7 m [12'] shark", 15),
 ("12' shark", 15),
 ('Blacktip shark', 15),
 ("1.2 m to 1.5 m [4' to 5'] shark", 14),
 ('Blue shark', 14),
 ("7' shark", 13),
 ('"a small shark"', 13),
 ('Mako shark', 13),
 ('1.5 m shark', 13),
 ('Shark involvement prior to death not confirmed', 13),
 ('Raggedtooth shark', 12),
 ("10' shark", 12),
 ("5 m [16.5'] 

In [33]:
# @@ Use this to fill null values: 
# df_clean["drive"] = df_clean.drive.fillna("NoTransmision")

# Injuries and types of attack
The GSAF categorizes scavenging bites on humans as "questionable incidents."

## PROVOKED
Provoked attacks occur when a human touches, hooks, nets, or otherwise aggravates the animal. Incidents that occur outside of a shark's natural habitat, such as aquariums and research holding-pens, are considered provoked, as are all incidents involving captured sharks. Sometimes humans inadvertently provoke an attack, such as when a surfer accidentally hits a shark with a surf board.

## UNPROVOKED
- Hit-and-run attack
- Sneak Attack
- Bump-and-bite attack 

For more information on how to differentiate PROVOKED vs UNPROVOKED attacks :
https://en.wikipedia.org/wiki/Shark_attack#Types_of_attacks

In [34]:
# Since there is no column that states if the attack was provoked or not,
# I want to analyze the injury column to distinguish between the cases that were provoked
# and those that were unprovoked.

random.sample(list(df_label.Injury.value_counts().items()),20)

[('Five men were said to have been killed by sharks ', 1),
 ('Left leg bitten PROVOKED INCIDENT', 1),
 ('Lacerations to 4 toes of right foot', 1),
 ('Unknown, but survived', 1),
 ('FATAL, "caught by legs" ', 1),
 ('No injury, shark tore diving suit', 1),
 ('Punctures to lower left leg & foot', 1),
 ('Lacerations to right hand by hooked shark PROVOKED INCIDENT', 1),
 (' FATAL. Shark bite was minor injury, but he suffered a heart attack afterwards and died 6 hours later',
  1),
 ('No injury, shark bumped leg & board. ', 1),
 ("Speared shark bit diver's right knee, and lacerated right thigh & buttocks PROVOKED INCIDENT",
  1),
 ('Left heel lacerated', 1),
 ('Foot & hand severed', 1),
 ('Laceration to foot from dead shark PROVOKED INCIDENT', 1),
 ('Bottom of left foot gashed', 1),
 ('His boat was holed by a shark', 1),
 ('Thought to have been taken by a shark. Body was not recovered', 2),
 ('FATAL, partial remains recovered ', 1),
 ('Minor cuts & bruises on face & neck', 1),
 ('Right thigh

In [38]:
# Categorizing  Provoked and  Unprovoked attacks
#df_clean.loc[df_clean["trany"].str.startswith("M"),"trany"] = "Manual"

provoked = ['PROVOKED', 'hook', 'shot']
#map(lambda words, x : words in x, provoked, df_nodupes.loc[df_nodupes['Injury'].str])
df_nodupes.loc[df_nodupes['Injury'].str]

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [None]:
df_nodupes.loc[df_nodupes['Injury'].str]

# df_provoked = np.where(df_nodupes.Injury.isin(provoked), True, False) 

# Passing that categorization to a new PROVOKED COLUMN
df_nodupes['Provoked'] = df_provoked
df_nodupes['Provoked'] 

In [None]:
display(df.columns) # To know which are the columns in the DF
display(df.count()) # To know how much data are we missin on each column
display(df.dtypes)

In [56]:
df_label[['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species',
       'Investigator or Source', 'pdf', 'href',
       'Case Number.1', 'Case Number.2']].head(50)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Investigator or Source,pdf,href,Case Number.1,Case Number.2
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
5,2018.06.03.b,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,,"No injury, board bitten",N,,,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.b,2018.06.03.b
6,2018.06.03.a,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.a,2018.06.03.a
7,2018.05.27,27-May-2018,2018.0,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.27,2018.05.27
8,2018.05.26.b,26-May-2018,2018.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,15,Lower left leg bitten,N,17h00,"Bull shark, 6'","K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.b,2018.05.26.b
9,2018.05.26.a,26-May-2018,2018.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,M,12,Minor injury to foot,N,14h00,,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.a,2018.05.26.a
