In [1]:
# Import Dependencies
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Set File path for original Data Set
shark_attack_csv = 'data_files/clean_data.csv'

In [3]:
# Read CSV into a Data Frame
shark_df = pd.read_csv(shark_attack_csv, encoding='iso-8859-1')

In [4]:
#View Sample
shark_df.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Day,Month,Lat,Lng
0,25-Jun-18,2018,Boating,USA,California,"Oceanside, San Diego County",BOARDING,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18.0,White shark,25.0,Jun,33.19587,-117.379483
1,18-Jun-18,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",SWIMMING,F,11.0,Minor injury to left thigh,N,14.0,Non-Disclosed,18.0,Jun,31.159591,-81.388552
2,9-Jun-18,2018,Invalid,USA,Hawaii,"Habush, Oahu",SURFING,M,48.0,Injury to left lower leg from surfboard skeg,N,7.0,Non-Disclosed,9.0,Jun,21.305612,-158.03019
3,8-Jun-18,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,SURFING,M,,Minor injury to lower leg,N,,Non-Disclosed,8.0,Jun,-30.068611,153.195278
4,4-Jun-18,2018,Provoked,MEXICO,Colima,La Ticla,DIVING,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,Tiger shark,4.0,Jun,18.454353,-103.554131


In [20]:
total_attacks = shark_df['Date'].count()
first_date = shark_df['Year'].min()
last_date = shark_df['Year'].max()
total_years = last_date - first_date
species_count = len(shark_df['Species'].unique())
countries = len(shark_df['Country'].unique())
usa_attacks = len(shark_df['Country'].loc[shark_df['Country'] == 'USA'])
non_disclosed = len(shark_df['Country'].loc[shark_df['Species'] == 'Non-Disclosed'])
white_count = len(shark_df['Species'].loc[shark_df['Species'] == 'White shark'])
tiger_count = len(shark_df['Species'].loc[shark_df['Species'] == 'Tiger shark'])
bull_count = len(shark_df['Species'].loc[shark_df['Species'] == 'Bull shark'])
zambezi_count = len(shark_df['Species'].loc[shark_df['Species'] == 'Zambezi shark'])

In [6]:
shark_df['Species'].value_counts()

Non-Disclosed            2103
White shark               534
Invalid                   333
Tiger shark               195
Bull shark                152
                         ... 
Whaler shark"               1
Leopard shark               1
Dog shark                   1
Bonnethead shark            1
Spotted Dogfish shark       1
Name: Species, Length: 66, dtype: int64

In [7]:
print('Summary of Shark Attacks')
print(f'Total of {total_attacks} over a {total_years} year span.')
print(f'The first attack was in {first_date} and the most recent in {last_date}.')
print(f'The attacks were reported in {countries} different countries with {usa_attacks} in the USA.')
print(f'{species_count} different species were reported in total.')
print(f'In {round(non_disclosed/total_attacks*100, 2)}% of reported attacks the species is unknown.')
print(f'The most common species of shark in attacks is the "White Shark" at {round(white_count/total_attacks*100, 2)}%.')

Summary of Shark Attacks
Total of 4033 over a 58 year span.
The first attack was in 1960 and the most recent in 2018.
The attacks were reported in 129 different countries with 1756 in the USA.
66 different species were reported in total.
In 52.14% of reported attacks the species is unknown.
The most common species of shark in attacks is the "White Shark" at 13.24%.


In [10]:
fatality_df = shark_df.loc[shark_df['Fatal (Y/N)'] == 'Y']

In [21]:
white_fat = len(fatality_df['Species'].loc[fatality_df['Species'] == 'White shark'])
tiger_fat = len(fatality_df['Species'].loc[fatality_df['Species'] == 'Tiger shark'])
bull_fat = len(fatality_df['Species'].loc[fatality_df['Species'] == 'Bull shark'])
zambezi_fat = len(fatality_df['Species'].loc[fatality_df['Species'] == 'Zambezi shark'])

In [22]:
percent_white = round(white_fat / white_count * 100, 2)
percent_tiger = round(tiger_fat / tiger_count * 100, 2)
percent_bull = round(bull_fat / bull_count * 100, 2)
percent_zambezi = round(zambezi_fat / zambezi_count * 100, 2)
print(f'White: {percent_white}\nTiger: {percent_tiger}\nBull: {percent_bull}\nZambezi: {percent_zambezi}')

White: 18.35
Tiger: 24.1
Bull: 17.11
Zambezi: 30.0


In [19]:
print(f'White: {white_count}\nTiger: {tiger_count}\nBull: {bull_count}\nZambezi: {zambezi_count}')

White: 534
Tiger: 195
Bull: 152
Zambezi: 30


In [25]:
surfing_count = len(shark_df['Activity'].loc[shark_df['Activity'] == 'SURFING'])
swimming_count = len(shark_df['Activity'].loc[shark_df['Activity'] == 'SWIMMING'])
fishing_count = len(shark_df['Activity'].loc[shark_df['Activity'] == 'FISHING'])
diving_count = len(shark_df['Activity'].loc[shark_df['Activity'] == 'DIVING'])
print(f'Surfing: {surfing_count}\nSwimming: {swimming_count}\nFishing: {fishing_count}\nDiving: {diving_count}')

Surfing: 1173
Swimming: 1042
Fishing: 736
Diving: 348


In [27]:
jan = len(shark_df['Month'].loc[shark_df['Month'] == 'Jan'])
feb = len(shark_df['Month'].loc[shark_df['Month'] == 'Feb'])
mar = len(shark_df['Month'].loc[shark_df['Month'] == 'Mar'])
apr = len(shark_df['Month'].loc[shark_df['Month'] == 'Apr'])
may = len(shark_df['Month'].loc[shark_df['Month'] == 'May'])
jun = len(shark_df['Month'].loc[shark_df['Month'] == 'Jun'])
jul = len(shark_df['Month'].loc[shark_df['Month'] == 'Jul'])
aug = len(shark_df['Month'].loc[shark_df['Month'] == 'Aug'])
sep = len(shark_df['Month'].loc[shark_df['Month'] == 'Sep'])
octo = len(shark_df['Month'].loc[shark_df['Month'] == 'Oct'])
nov = len(shark_df['Month'].loc[shark_df['Month'] == 'Nov'])
dec = len(shark_df['Month'].loc[shark_df['Month'] == 'Dec'])

print(f'Q1: {jan + feb + mar}\nQ2: {apr + may + jun}\nQ3: {jul + aug + sep}\nQ4: {octo + nov + dec}')

Q1: 857
Q2: 962
Q3: 1232
Q4: 857
