----------------------------------------------------------------
 DATA CLEANING
----------------------------------------------------------------
----------------------------------------------------------------

In [11]:
# Install to run code in notebook

!pip install plotly.express
!pip install meteostat



In [1]:
# Libraries used for data cleaning

import pandas as pd

In [3]:
df = pd.read_excel("../data/raw/GSAF5.xls")
df.head()

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,14th October,2025.0,Unprovoked,Columbia,"Bolivar, del Isolate",Catagena Province,Swimming with sharks,Male child,M,14,...,Nurse shark,Kevin McMurray Trackingsharks.com Andy Currie,,,,,,,,
1,11th October,2025.0,Unprovoked,Australia,Queensland,Cook Esplanade Thursday Island,Fishing/swimming,Samuel Nai,M,14,...,Tiger or Bull shark,Kevin McMurray Trackingsharks.com,,,,,,,,
2,7th October,2025.0,Unprovoked,Australia,South Australia,Kangaroo Island,Surfing,Lee Berryman,M,50+,...,Bronze whaler?,Kevin McMurray Trackingsharks.com,,,,,,,,
3,29th September,2025.0,Unprovoked,USA,Off California,Catalina Island,Swimming,Christopher Murray,M,54,...,unknown 1.2m shark,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,
4,27th September,2025.0,Provoked,Costa Rica,,Cocos Islands,Diving-Tagging sharks,Dr. Mauricio Hoyos,M,48,...,Tiger shark 4m,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,


## Injury Cleaning

In [14]:
df['Injury'].value_counts() 
# - Must clean Injury in to 2 answears only, 'yes' or 'no'.

Injury
FATAL                                                                                                     863
Foot bitten                                                                                               100
Survived                                                                                                   97
No injury                                                                                                  85
Leg bitten                                                                                                 81
                                                                                                         ... 
"Lost leg"                                                                                                  1
FATAL, body not recovered but shark was caught with the man's loincloth in its gut shortly afterwards.      1
FATAL, leg stripped of flesh                                                                                1
FAT

In [15]:
# - Injury column cleaning and classification
# - Replacing all types of injuries with these new classes we create from the coresponding values 
# we had before in Injury column:

df['Injury'] = df['Injury'].fillna('Unknown').astype(str).str.lower().str.strip()


def classify_injury(text):
    text = str(text).lower().strip()

    # Clear 'no injury' cases
    if any(word in text for word in ['no injury', 'uninjured','unharmed', 'none']):
        return 'No Injury'

    # If survived or recovered, do NOT count as fatal
    if any(word in text for word in ['survived', 'recovered', 'escaped', 'rescued', 'lived']):
        return 'Severe Wounds'

    # Fatal cases
    if any(word in text for word in [
        'body not recovered', 'fatal', 'died', 'death', 'deceased',
        'human remains', 'presumed dead'
    ]):
        return 'Fatal Wounds'

    # Minor cases
    if any(word in text for word in [
        'minor', 'scratch', 'abrasion', 'bruise', 'small', 'superficial', 'cut'
    ]):
        return 'Minor Wounds'

    # Severe cases
    if any(word in text for word in [
        'amputation', 'severe', 'major', 'deep', 'critical', 'massive',
        'laceration', 'bite', 'injured', 'tissue loss'
    ]):
        return 'Severe Wounds'

    return 'Unknown'
df['Injury_Class'] = df['Injury'].apply(classify_injury)

In [16]:
df['Injury_Class'].value_counts() 
# - Achieved less then 10% of unknowns in Injury_Class column
# - Some injury tipes could not be classified, this is because the reports are to vague in some cases so 
# we set it to unknown.

Injury_Class
Unknown          2463
Severe Wounds    1736
Fatal Wounds     1435
No Injury         884
Minor Wounds      532
Name: count, dtype: int64

## Fatal Cleaning

In [5]:
# Fatal Y/N column cleaning

df['Fatal Y/N'] = df['Fatal Y/N'].astype(str).str.strip().str.upper()

df['Fatal Y/N'] = df['Fatal Y/N'].replace({
    'UNKNOWN': 'N',  
    'F': 'N',
    'M': 'N',
    'NAN': 'N',
    'NQ': 'N',
    '2017': 'N',
    'Y X 2': 'Y',
})

# - Since the shark attack report claims to only have fatal and non fatal attacks, 
# we will consider all unknowns as non-fatal for the purpose of this analysis.

In [6]:
df['Fatal Y/N'].value_counts(dropna=False) # We achieved less then 10% of unknowns in Fatal Y/N column

Fatal Y/N
N    5566
Y    1484
Name: count, dtype: int64