# Shark Attack Visualization

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any figure is shown.
# NOTE(review): connected=True is understood to load plotly.js from the CDN
# instead of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

# Cleaning the Data

In [None]:
# Load the shark-attack records from the Kaggle dataset and preview the first rows
attacks_csv = '/kaggle/input/shark-attack-dataset/attacks.csv'
attack_df = pd.read_csv(attacks_csv)
attack_df.head()

In [None]:
# The last rows of the dataset contain many missing values
attack_df.tail(n=5)

In [None]:
# Number of missing values in every column
attack_df.isnull().sum()

In [None]:
# Look at a random sample of ten rows that have no Date
date_is_missing = attack_df['Date'].isna()
attack_df.loc[date_is_missing].sample(n=10)

In [None]:
# Use Date as the reference column: keep only the rows where it is present
has_date = attack_df['Date'].notna()
attack_df = attack_df[has_date]
attack_df.isna().sum()

In [None]:
# List the column names. Some carry a trailing space (e.g. 'Sex ' and
# 'Species '), which later cells rename before using them.
attack_df.columns

In [None]:
# For this visualization only the columns from "Case Number" through
# "Species " (inclusive, in file order) are needed
wanted_columns = slice("Case Number", "Species ")
attack_df = attack_df.loc[:, wanted_columns]
attack_df.head()

# Year of the Attack

In [None]:
# Inspect the distinct values present in the Year column
year_column = attack_df['Year']
year_column.unique()

# There are weird inputs like 0, 500 and so on. Let's look into that.

In [None]:
# Show the case number and date of every record dated before 1543
before_1543 = attack_df['Year'] < 1543
attack_df.loc[before_1543, ['Case Number', 'Date']]

# Although the year could be extracted from either the Case Number or the Date column,
# the values for these rows are mostly given as ranges. I will therefore not include these data points in the visualization.

In [None]:
# Keep only the records whose year is 1543 or later, then re-check the years
year_is_valid = attack_df['Year'].ge(1543)
attack_df = attack_df[year_is_valid]
attack_df['Year'].unique()

# The weird year inputs are gone

In [None]:
# Count the recorded attacks for each year and draw them as a line chart
attacks_per_year = attack_df.groupby('Year')['Date'].count()
byYear_attack = attacks_per_year.reset_index()
fig = px.line(byYear_attack, x='Year', y='Date', title='Shark Attack by Year')
fig.show()

In [None]:
# Close-up of the yearly attack counts from 1853 to 2018
byYear_attack = attack_df.groupby('Year')['Date'].count().reset_index()

# Select by year value rather than by a hard-coded row position, so the plot
# keeps matching its title even if the set of years in the data changes
from_1853 = byYear_attack[byYear_attack['Year'] >= 1853]

fig = px.line(from_1853,
              x='Year', y='Date', title='Shark Attack from 1853 to 2018')
fig.show()

# Sex

In [None]:
# Strip the trailing space from the name of the Sex column
attack_df = attack_df.rename({'Sex ': 'Sex'}, axis='columns')
attack_df.columns

In [None]:
# Inspect the distinct values present in the Sex column
sex_column = attack_df['Sex']
sex_column.unique()

# F and M represent Female and Male respectively
# There are a few uncategorized letters - fixing that

In [None]:
# Frequency of every raw value in the Sex column before cleaning
raw_sex_values = attack_df['Sex']
raw_sex_values.value_counts()

In [None]:
# Normalise the Sex column into three categories: Male, Female and Unknown
# Fill NaN values first so the string checks below never see missing data
attack_df['Sex'] = attack_df['Sex'].fillna("Unknown")

# Male: any entry containing a capital "M" (this also covers "M " with a trailing space)
attack_df.loc[attack_df['Sex'].str.contains("M", regex=False), 'Sex'] = "Male"

# Female: any remaining entry containing a capital "F"
attack_df.loc[attack_df['Sex'].str.contains("F", regex=False), 'Sex'] = "Female"

# Every value other than Male and Female becomes Unknown
attack_df.loc[~attack_df['Sex'].isin(["Male", "Female"]), 'Sex'] = "Unknown"

# rename_axis + reset_index(name=...) yields the columns Gender and Count no matter
# how the installed pandas version labels the output of value_counts()
bySex_count = attack_df['Sex'].value_counts().rename_axis('Gender').reset_index(name='Count')
bySex_count

In [None]:
# Pie chart: share of attacks for Male, Female and Unknown
gender_font = dict(family="Courier New, monospace", size=18, color='black')

fig = px.pie(
    bySex_count,
    names='Gender',
    values='Count',
    title='Shark Attack by Gender',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_traces(textinfo='label+percent', textposition='outside')
fig.update_layout(legend_title='Gender', paper_bgcolor='cornsilk', font=gender_font)
fig.show()

# Activity and Type

In [None]:
# Most frequent activities at the time of the attack
activity_counts = attack_df['Activity'].value_counts()
activity_counts.head(5)

In [None]:
# Frequency of every attack type
attack_types = attack_df['Type']
attack_types.value_counts()

In [None]:
# "Boating", "Boat" and the misspelling "Boatomg" all describe the same type of
# attack, so they are merged into the single label "Boat"
attack_df.loc[attack_df['Type'].isin(['Boating', 'Boatomg']), "Type"] = "Boat"

# rename_axis + reset_index(name=...) yields the columns Type and Count no matter
# how the installed pandas version labels the output of value_counts()
byType_count = attack_df['Type'].value_counts().rename_axis('Type').reset_index(name='Count')
byType_count

In [None]:
# Pie chart: share of each attack type
type_font = dict(family="Courier New, monospace", size=18, color='black')

fig = px.pie(
    byType_count,
    names='Type',
    values='Count',
    title='Shark Attack Type',
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.update_traces(textinfo='label+percent', textposition='inside')
fig.update_layout(
    legend_title='Type',
    paper_bgcolor='cornsilk',
    uniformtext_mode='hide',
    uniformtext_minsize=18,
    font=type_font,
)
fig.show()

In [None]:
# Number of dated records for every (Type, Activity) combination
type_activity_groups = attack_df.groupby(['Type', 'Activity'])
type_activity_groups['Date'].count()

# Shark Species

In [None]:
# Strip the trailing space from the Species column name (in place), then
# look at how often each raw species description occurs
attack_df.rename({'Species ': 'Species'}, axis='columns', inplace=True)
species_column = attack_df['Species']
species_column.value_counts()

In [None]:
import re

# Fill NaN values with the string "Unknown shark"
attack_df['Species'] = attack_df['Species'].fillna('Unknown shark')

# Pattern used to extract the shark species: everything up to and including the
# last " shark" / " Shark" in the description. The outer group captures the
# whole match; the inner alternation is non-capturing so that str.extract
# returns a single column.
shark_pattern = r'(.* (?:shark|Shark))'

# Vectorised extraction replaces the row-by-row loop and its bare except.
# Rows where the pattern does not match (nothing before the word "shark", or no
# "shark" at all) come back as NaN and get the placeholder text instead.
attack_df['New Species'] = (
    attack_df['Species']
    .str.extract(shark_pattern, expand=False)
    .fillna("Shark involvement not confirmed")
)

# rename_axis + reset_index(name=...) yields the columns Species and Count no matter
# how the installed pandas version labels the output of value_counts()
bySpecies_count = attack_df['New Species'].value_counts().rename_axis('Species').reset_index(name='Count')
bySpecies_count.head()

In [None]:
# Pie chart of the ten most frequent species. The first two rows of the counts
# are skipped because the shark species is unknown for them.
top_species = bySpecies_count.iloc[2:12, :]
species_font = dict(family="Courier New, monospace", size=18, color='black')

fig = px.pie(
    top_species,
    names='Species',
    values='Count',
    title='Top 10 Shark Species',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_traces(pull=[0.1] * 10, textinfo='percent', textposition='outside')
fig.update_layout(legend_title='Shark Type', paper_bgcolor='cornsilk', font=species_font)
fig.show()

# Country

In [None]:
# Number of attacks per country. rename_axis + reset_index(name=...) yields the
# columns Country and Count no matter how the installed pandas version labels
# the output of value_counts()
byCountry_count = attack_df['Country'].value_counts().rename_axis('Country').reset_index(name='Count')
byCountry_count.head()

In [None]:
# World map coloured by the number of attacks recorded for each country
fig = px.choropleth(
    byCountry_count,
    locations='Country',
    locationmode='country names',
    color='Count',
    scope='world',
    title='Shark Attack around the World',
)
fig.show()

In [None]:
# Horizontal bar chart of the five countries with the most recorded attacks
top_countries = byCountry_count.head(5)

plt.figure(dpi=120)
sns.barplot(x='Count', y='Country', data=top_countries, palette='Set2')
plt.title('Top 5 Countries with Highest number of Shark Attack cases')
plt.show()

### Shark Attack in the USA

In [None]:
# Number of attacks per Area for the records whose Country is "USA".
# rename_axis + reset_index(name=...) yields the columns Area and Count no matter
# how the installed pandas version labels the output of value_counts()
us_attacks = attack_df[attack_df['Country'] == "USA"]
byAreaUS_count = us_attacks['Area'].value_counts().rename_axis('Area').reset_index(name='Count')
byAreaUS_count.head()

In [None]:
# Translate each Area name into its two-letter state code.
# The name -> code mapping comes from this Medium post:
# https://medium.com/geekculture/create-a-choropleth-map-of-state-unemployment-rates-with-plotly-1354050e0cfd
states_code = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ',
    'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Areas that are not keys of the mapping end up as NaN in the new column
area_names = byAreaUS_count['Area']
byAreaUS_count['State Code'] = area_names.map(states_code)
byAreaUS_count.head()

In [None]:
# Map of the USA coloured by the number of attacks recorded for each state
fig = px.choropleth(
    byAreaUS_count,
    locations='State Code',
    locationmode='USA-states',
    color='Count',
    hover_name='Area',
    scope='usa',
    title='Shark Attack in the USA',
)
fig.show()

# Condition Fatal (Y/N)

In [None]:
# Frequency of every raw value in the Fatal (Y/N) column
fatal_column = attack_df['Fatal (Y/N)']
fatal_column.value_counts()

In [None]:
# Distinct raw values in the Fatal (Y/N) column
fatal_values = attack_df['Fatal (Y/N)']
fatal_values.unique()

In [None]:
# Value frequencies of Fatal (Y/N) once more, right before cleaning
fatal_before_cleaning = attack_df['Fatal (Y/N)']
fatal_before_cleaning.value_counts()

In [None]:
# Normalise the Fatal (Y/N) column into three categories: YES, NO and UNKNOWN
# Fill NaN values with UNKNOWN
attack_df['Fatal (Y/N)'] = attack_df['Fatal (Y/N)'].fillna('UNKNOWN')

# Strip surrounding whitespace (e.g. the space before N)
attack_df['Fatal (Y/N)'] = attack_df['Fatal (Y/N)'].str.strip()

# Change N to NO
attack_df.loc[attack_df['Fatal (Y/N)'] == "N", 'Fatal (Y/N)'] = 'NO'

# Change every entry containing Y or y to YES
attack_df.loc[attack_df['Fatal (Y/N)'].str.contains("Y|y"), 'Fatal (Y/N)'] = "YES"

# Change every other input that is not exactly YES or NO to UNKNOWN.
# The column selector is essential: without it the assignment overwrites every
# column of the matching rows. An exact isin() test is used because a substring
# test for "NO" would also match "UNKNOWN".
is_yes_or_no = attack_df['Fatal (Y/N)'].isin(["YES", "NO"])
attack_df.loc[~is_yes_or_no, 'Fatal (Y/N)'] = "UNKNOWN"

# rename_axis + reset_index(name=...) yields the columns Fatality Status and Count
# no matter how the installed pandas version labels the output of value_counts()
byFatal_count = attack_df['Fatal (Y/N)'].value_counts().rename_axis('Fatality Status').reset_index(name='Count')
byFatal_count

In [None]:
# Pie chart: share of fatal, non-fatal and unknown outcomes
fatality_font = dict(family="Courier New, monospace", size=18, color='black')

fig = px.pie(
    byFatal_count,
    names='Fatality Status',
    values='Count',
    title='Fatality Status after Shark Attacks',
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.update_traces(textinfo='label+percent', textposition='inside')
fig.update_layout(
    legend_title='Fatality Status',
    paper_bgcolor='cornsilk',
    uniformtext_mode='hide',
    uniformtext_minsize=18,
    font=fatality_font,
)
fig.show()

# Reported Time of the Incident

In [None]:
# Most frequent raw values of the Time column
time_counts = attack_df['Time'].value_counts()
time_counts.head(5)

In [None]:
# Fill NaN values with the string "Unknown"
attack_df['Time'] = attack_df['Time'].fillna('Unknown')

# Pattern for a clock time written as two digits, "h", two digits (e.g. "14h30").
# The capture group lets str.extract return the first such match as one column.
time_pattern = r'([0-9]{2}h[0-9]{2})'

# Vectorised extraction replaces the row-by-row loop and its bare except.
# Rows without a clock time come back as NaN and keep their original Time value.
clock_time = attack_df['Time'].str.extract(time_pattern, expand=False)
attack_df['New Time'] = clock_time.fillna(attack_df['Time'])

# rename_axis + reset_index(name=...) yields the columns Time and Count no matter
# how the installed pandas version labels the output of value_counts()
byTime_count = attack_df['New Time'].value_counts().rename_axis('Time').reset_index(name='Count')
byTime_count.head()

In [None]:
# Pie chart of rows 1-10 of the time counts (the first row is left out)
selected_times = byTime_count.iloc[1:11, :]
time_font = dict(family="Courier New, monospace", size=18, color='black')

fig = px.pie(
    selected_times,
    names='Time',
    values='Count',
    title='Time of Shark Attack',
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.update_traces(pull=[0.1] * 10, textinfo='label+percent', textposition='outside')
fig.update_layout(
    legend_title='Time of Incident',
    paper_bgcolor='cornsilk',
    uniformtext_mode='hide',
    uniformtext_minsize=18,
    font=time_font,
)
fig.show()

# Thank you for checking out my notebook