In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import random
import os

# Basic knowledge: 
To begin this project, it would be good to hold a minimum understanding of the subject we will be analyzing. As I did not know much about this topic at the day the project started, I have recurred to the shark-attack wiki: https://en.wikipedia.org/wiki/Shark_attack



# Defining the dataset path, and importing it to begin basic dataset exploration

Questions: 
- how to add the dataset to gitignore?

In [2]:
# To follow along and access the DataSet, download it from KAGGLE using this link
# https://www.kaggle.com/teajay/global-shark-attacks

# Once you have downloaded the DataSet, change the dataset variable to match the 
# path where you have saved the 'attacks.csv' file.
dataset = 'attacks.csv' 
df = pd.read_csv(dataset, encoding='latin-1')

Now, we will check some basic information about the dataset, in order to formulate a more educated hypothesis which we could actually put to test with the data available.

In [3]:
display(df.shape)# To know the shape of the DF
print(df.drop_duplicates().shape) # Shape when eliminating duplicates

(25723, 24)

(6312, 24)


Here, I notice that the shape of the df with no duplicates is very small when compared to the whole df. This seems weird and since the `drop_duplicates` function ignores time indexes, I'll try to compare both dataframes' time data

In [4]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:
df.Date

0        25-Jun-2018
1        18-Jun-2018
2        09-Jun-2018
3        08-Jun-2018
4        04-Jun-2018
            ...     
25718            NaN
25719            NaN
25720            NaN
25721            NaN
25722            NaN
Name: Date, Length: 25723, dtype: object

In [6]:
df_nodupes = df.drop_duplicates()
df_nodupes.Date

0        25-Jun-2018
1        18-Jun-2018
2        09-Jun-2018
3        08-Jun-2018
4        04-Jun-2018
            ...     
6307             NaN
6308             NaN
6309             NaN
8702             NaN
25722            NaN
Name: Date, Length: 6312, dtype: object

In [7]:
# Now, with a df smaller in size, I want to see what info is there on the last couple
# of columns which have unexplicit
df_nodupes[['Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23']]

Unnamed: 0,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...
6307,,,6309.0,,
6308,,,6310.0,,
6309,,,,,
8702,,,,,


In [8]:
# Too many null values... let's count them and let
print(df_nodupes.shape)
df_nodupes[['Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23']].isnull().sum()

(6312, 24)


Case Number.1       10
Case Number.2       10
original order       3
Unnamed: 22       6311
Unnamed: 23       6310
dtype: int64

In [9]:
# If only 1 value in the 'Unnamed: 22' column, and 2 values in the
# 'Unnamed: 22' column, I'll not consider this data for my analysis.
df_nodupes = df_nodupes.drop(columns=['Unnamed: 22', 'Unnamed: 23'])

In [10]:
# Now we'll look at the columns again
df_nodupes.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [11]:
# The following columns seemed a little bit rare, so i do a value count to find out what they are about
df_nodupes['Case Number.1'].value_counts()

1920.00.00.b    2
2005.04.06      2
1913.08.27.R    2
2014.08.02      2
1923.00.00.a    2
               ..
2017.08.10      1
1955.09.23.b    1
1962.07.28      1
1966.07.17.b    1
1967.04.00      1
Name: Case Number.1, Length: 6285, dtype: int64

In [12]:
df_nodupes['Case Number.2'].value_counts()

2013.10.05        2
1915.07.06.a.R    2
1966.12.26        2
1907.10.16.R      2
2009.12.18        2
                 ..
2005.05.15        1
1987.04.15        1
2017.02.25        1
1900.09.05        1
1967.04.00        1
Name: Case Number.2, Length: 6286, dtype: int64

In [13]:
df_nodupes['original order'].value_counts()

569.0     2
4603.0    1
4899.0    1
810.0     1
796.0     1
         ..
3508.0    1
3256.0    1
3106.0    1
3080.0    1
6272.0    1
Name: original order, Length: 6308, dtype: int64

In [14]:
#Since 'original order seems like arbitrary indexes, i'll drop it
df_nodupes = df_nodupes.drop(columns='original order')
df_nodupes.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2'],
      dtype='object')

In [15]:
# I want to check out some of the pdf and href
print(df_nodupes[['pdf', 'href']].isnull().sum())
df_nodupes[['pdf', 'href']]

pdf     10
href    10
dtype: int64


Unnamed: 0,pdf,href
0,2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...
6307,,
6308,,
6309,,
8702,,


In [16]:
# I want to check what is on those pdfs
# A random sample of the column 

# With a FOR loop
for i in range(10):
    e = random.choice(range(1000))
    print(f"index: {e}, link: {df_nodupes.iloc[e]['href']}")

# With Random sample
display(random.sample(list(df_nodupes['href']), 10))

# Below are a couple of the links, when I open them, I have found that they are seem quite structured
# It could be possible to parse them later down the road and use a REGEX to find more data

# I also have ran this column a few times to notice that all pdfs have actually been uploaded to
# the same website and have the same naming structure

index: 530, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2014.07.16-Maui.pdf
index: 144, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2017.05.06-Rozada.pdf
index: 940, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2011.03.10.R-Sharm-scavenging.pdf
index: 351, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05.b-FtPierce.pdf
index: 891, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2011.07.30-Malabago.pdf
index: 229, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2016.08.27-Chardard.pdf
index: 162, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2017.04.10.a-Orr.pdf
index: 884, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2011.08.16.c-D'Esposito.pdf
index: 316, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2016.01.23-Mason.pdf
index: 943, link: http://sharkattackfile.net/spreadsheets/pdf_directory/2011.02.23-Jean-Luc.pdf


['http://sharkattackfile.net/spreadsheets/pdf_directory/2011.06.06.R-James.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1972.12.25-Sockoff.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2006.04.09.a-Pereira.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1989.07.27-SovietDiver.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1989.04.00-Allen.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1942.11.13.a-Guadalcanal.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1981.07.07-NV-Cannette.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2005.04.25-deckhand.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/1981.07.01-Wilson.pdf',
 'http://sharkattackfile.net/spreadsheets/pdf_directory/2001.06.12-Black.pdf']

# After some random sampling, I've noticed that the following indexes have some mistakes.. 
# a REGEX can be used to fix problems like these 


In [17]:
print(df_nodupes.iloc[332]['href'])
print(df_nodupes.iloc[324]['href'])
print(df_nodupes.iloc[588]['href'])
print(df_nodupes.iloc[569]['href'])

http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.00.00.b-OceanicWhitetip.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.04.03-Armstrong.pdf


## It looks still like some of these pdfs are duplicates, even after dropping duplicates :

In [18]:
# how many times each pdf on the dataframe
df_pdf = df_nodupes["pdf"]
df_pdf.value_counts()

1906.09.27.R.a&b-Munich-Swede.pdf      2
1916.12.08.a-b-German.pdf              2
1916.07.12.a-b-Stillwell-Fisher.pdf    2
1934.12.23.a-b-Inman.pdf               2
1935.06.05.R-SolomonIslands.pdf        2
                                      ..
1976.03.12.b-GlenWright.pdf            1
2010.08.17-Edwards.pdf                 1
1960.05.01-Taludig.pdf                 1
2012.08.15-GulfShoresSwimmer.pdf       1
2008.07.25.a-Fiack.pdf                 1
Name: pdf, Length: 6291, dtype: int64

In [24]:
# drop dupes and compare lengths
df_pdf_nodupes = df_pdf.drop_duplicates()

len(df_pdf), len(df_pdf_nodupes)

(6312, 6292)

## Since the lengths are not the same, I will check if those duplicated entries are only in this column

In [30]:
df_nodupes[["Date", "Location", "pdf"]]

Unnamed: 0,Date,Location,pdf
0,25-Jun-2018,"Oceanside, San Diego County",2018.06.25-Wolfe.pdf
1,18-Jun-2018,"St. Simon Island, Glynn County",2018.06.18-McNeely.pdf
2,09-Jun-2018,"Habush, Oahu",2018.06.09-Denges.pdf
3,08-Jun-2018,Arrawarra Headland,2018.06.08-Arrawarra.pdf
4,04-Jun-2018,La Ticla,2018.06.04-Ramos.pdf
...,...,...,...
6307,,,
6308,,,
6309,,,
8702,,,


In [None]:
# df_nodupes_indexes 
for e in list(df_nodupes['pdf']):
    if 

In [34]:
dfx = df_nodupes["pdf"].value_counts() if 

KeyError: "None of [Int64Index([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n            ...\n            1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n           dtype='int64', length=6291)] are in the [columns]"

In [None]:
display(df.columns) # To know which are the columns in the DF
display(df.count()) # To know how much data are we missin on each column
display(df.dtypes)