In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw shark-attack records from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/shark-attacks/attacks.csv')

In [None]:
# Preview the first rows.
df.head()
# Right off, the 'Case Number' and 'Year' columns look redundant, so we drop them.

In [None]:
# Remove the two columns in place.
df.drop(['Case Number', 'Year'], axis =1, inplace= True)

In [None]:
# Column dtypes and non-null counts after the drop.
df.info()

In [None]:
# Similarly, the columns 'Name', 'Investigator or Source', 'pdf', 'href formula', 'href',
# 'Case Number.1', 'Case Number.2' and 'original order' are treated here as redundant
# or irrelevant for prediction purposes. The source/link columns
# ('Investigator or Source', 'pdf', 'href formula', 'href') would be helpful for
# digging deeper into individual records, but for exploratory purposes we drop
# all of them in the next cell.
df.columns

In [None]:
# Drop the name, source, link and duplicate-identifier columns listed above.
df.drop(['Name','Investigator or Source','pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'], axis = 1, inplace = True)

In [None]:
# Fraction of missing values per column. This dataset is full of null values,
# and we cannot impute them as the data is text based.
df.isnull().sum()/len(df)
# Let's see where the null values lie in the data

In [None]:
sns.heatmap(df.isnull(), yticklabels = False,cbar = False, cmap = 'viridis')
# Yellow is NaN here. A huge swath of this data is missing. Let's first just remove all rows
# that contain nothing but NaN entries

In [None]:
# Drop the rows in which every remaining column is NaN.
df.dropna(axis = 0, how = 'all', inplace = True)

In [None]:
sns.heatmap(df.isnull(), yticklabels = False,cbar = False, cmap = 'viridis')
# Now it is clear that the 'Age', 'Time' and 'Species' columns are missing a lot of information

In [None]:
# There are now at least two approaches. The first approach is to delete the 'Age', 'Time' and
# 'Species' columns, then delete the remaining rows that contain NaN and proceed.
# The second would be to keep these columns and work with their missing values.

# The species and time columns are interesting features... is there a shark that attacks
# humans more frequently? What time do these attacks occur?

# For now, we'll take the first approach for sanity's sake!

In [None]:
# Build the working frame `df1` for the first approach discussed above:
# remove the three sparsest columns, then keep only the rows that are complete
# in every remaining column. drop() returns a new frame, so `df` itself keeps
# the state reached so far.
sparse_columns = ['Age', 'Time', 'Species ']  # the 'Species ' header carries a trailing space
df1 = df.drop(columns=sparse_columns)
df1.dropna(axis=0, how='any', inplace=True)

In [None]:
# df1 now has no missing data (it is not clean though!)
sns.heatmap(df1.isnull(), yticklabels= False, cbar = False, cmap = 'viridis')

In [None]:
df1.info()
# 4601 entries remain after first removing the 'Age', 'Time' and 'Species ' columns
# and then dropping every row that still contained a NaN

# Cleaning up the feature columns...

In [None]:
# The fatality column is not neatly classified
sns.countplot(x = df1['Fatal (Y/N)'], data = df1)

In [None]:
# It appears that six non-fatal encounters are labelled ' N' (with a leading space).
# We need to group these with the 'N' category and get rid of the 'UNKNOWN' and
# '2017' entries.
df1['Fatal (Y/N)'].value_counts()

In [None]:
# Peek at the first five rows carrying the ' N' label.
df1[df1['Fatal (Y/N)'] == ' N']['Fatal (Y/N)'].iloc[0:5]

In [None]:
# Fold the stray ' N' label (leading space) into 'N' in the fatality column.
# The assignment is scoped to 'Fatal (Y/N)', so an identical string in any other
# column is left untouched, and it is a harmless no-op when no ' N' label is
# present (looking up the first match with .iloc[0] raises IndexError then).
df1.loc[df1['Fatal (Y/N)'] == ' N', 'Fatal (Y/N)'] = 'N'

df1['Fatal (Y/N)'].value_counts()

In [None]:
# Index labels of the rows whose fatality label is 'UNKNOWN'.
df1[df1['Fatal (Y/N)'] == 'UNKNOWN']['Fatal (Y/N)'].index.values

In [None]:
# We do not know what the 'UNKNOWN' and '2017' labels correspond to, so every
# row carrying one of them is removed, one label at a time.
for bad_label in ('UNKNOWN', '2017'):
    bad_rows = df1[df1['Fatal (Y/N)'] == bad_label].index
    df1.drop(index=bad_rows, inplace=True)

df1['Fatal (Y/N)'].value_counts()

In [None]:
# Counts per fatality label after the clean-up above.
sns.countplot(x = 'Fatal (Y/N)', data = df1)

In [None]:
# Dummy variables for the 'Fatal (Y/N)' column. drop_first=True discards the
# first category; the indicator columns then replace the original text column
# (later cells refer to the resulting 'Y' column).
fatal = pd.get_dummies(df1['Fatal (Y/N)'], drop_first=True)
df1 = pd.concat([df1.drop(columns='Fatal (Y/N)'), fatal], axis=1)

In [None]:
# We can use the same approach for the 'Type', 'Activity', 'Sex ' and 'Area' columns.

# We'll work on the 'Sex ' column first (its header carries a trailing space).
sns.countplot(x = 'Sex ', data = df1)

In [None]:
df1['Sex '].value_counts()

In [None]:
# Fold the 'M ' label (trailing space) into 'M' in the 'Sex ' column.
# The assignment is scoped to that column, so an identical string in any other
# column is left untouched, and it is a harmless no-op when no 'M ' label is
# present (looking up the first match with .iloc[0] raises IndexError then).
df1.loc[df1['Sex '] == 'M ', 'Sex '] = 'M'

df1['Sex '].value_counts()

In [None]:
# Dropping the remaining ambiguous 'Sex ' labels, one label at a time.
for odd_label in ('lli', '.', 'N'):
    odd_rows = df1[df1['Sex '] == odd_label].index
    df1.drop(index=odd_rows, inplace=True)
df1['Sex '].value_counts()

In [None]:
# Counts per 'Sex ' label, split by the fatality indicator column 'Y'.
sns.countplot(x = 'Sex ', data = df1, hue = 'Y')

In [None]:
# Dummy variables for the 'Sex ' column: the indicator columns (first category
# dropped) replace the original text column.
sex = pd.get_dummies(df1['Sex '], drop_first=True)
df1 = pd.concat([df1.drop(columns='Sex '), sex], axis=1)

In [None]:
# Moving on to the 'Type' column
df1['Type'].value_counts()
# The 'Boat' and 'Boating' categories can be merged

In [None]:
# Merging the 'Boat' category into 'Boating'.
# The assignment is scoped to the 'Type' column, so a 'Boat' entry in any other
# column is left untouched, and it is a harmless no-op when no 'Boat' label is
# present (looking up the first match with .iloc[0] raises IndexError then).
df1.loc[df1['Type'] == 'Boat', 'Type'] = 'Boating'
df1['Type'].value_counts()
# We'll keep the invalid category as we lack additional information to change it

In [None]:
sns.countplot(x = 'Type', data = df1, hue = 'Y')
# Sea disaster entries are the most fatal, possibly due to the time spent in the
# water, the presence of blood or the number of people involved...

In [None]:
# Dummy variables for the 'Type' column. The indicator frame is named
# `type_dum` (matching `act_dum` / `area_dum` further down) instead of `type`,
# so the built-in type() is not shadowed for the rest of the notebook.
type_dum = pd.get_dummies(df1['Type'], drop_first=True)
df1 = pd.concat([df1.drop(columns='Type'), type_dum], axis=1)

In [None]:
# Now the 'Activity' column:
# some simple typos to correct and then some longer descriptions to classify...

# First we make everything lowercase. One column-level assignment with the
# vectorised string method replaces per-element writes through
# df1['Activity'].iloc[i] = ..., which is chained assignment: pandas may apply
# it to a temporary object (SettingWithCopyWarning), and with copy-on-write
# enabled it never updates df1.
df1['Activity'] = df1['Activity'].str.lower()

In [None]:
# Capturing and classifying as many 'activities' as possible.
# Some activities are lumped together for processing.

# Ordered (keyword, label) rules. A description receives the label of the FIRST
# rule whose keyword is one of its whitespace-separated words, so the order of
# this table matters and must be kept.
ACTIVITY_RULES = (
    ('surfing', 'surfing'),
    ('surfing,', 'surfing'),
    ('surfboard', 'surfing'),
    ('swimming', 'swimming'),
    ('swimming,', 'swimming'),
    ('bathing', 'swimming'),
    ('floating', 'swimming'),
    ('water', 'swimming'),
    ('fishing', 'fishing'),
    ('fishing,', 'fishing'),
    ('wading', 'wading'),
    ('standing', 'wading'),
    ('boogie', 'body boarding'),
    ('body-boarding', 'body boarding'),
    ('bodyboarding', 'body boarding'),
    ('spearfishing', 'spearfishing'),
    ('spearfishing,', 'spearfishing'),
    ('diving', 'diving'),
    ('freediving', 'diving'),
    ('skindiving', 'diving'),
    ('snorkeling', 'snorkeling'),
    ('surf-skiing', 'surf skiing'),
    ('skiing', 'surf skiing'),
    ('canoeing', 'kayaking'),
)


def classify_activity(description):
    """Return the activity label for one free-text description.

    The description is split on whitespace and the rules in ACTIVITY_RULES are
    tried in order; the label of the first rule whose keyword appears as a
    whole word is returned. A description matching no rule is returned
    unchanged.
    """
    words = set(description.split())
    for keyword, label in ACTIVITY_RULES:
        if keyword in words:
            return label
    return description


# Map the column once. This replaces a loop that called df1.replace() on the
# whole frame for every row, which was quadratic in the number of rows and
# also rewrote any matching string found in the other columns.
df1['Activity'] = df1['Activity'].map(classify_activity)


In [None]:
# Show every row of long outputs. The fully qualified option name is used
# because the bare pattern 'max_rows' is ambiguous on newer pandas versions,
# which also register 'styler.render.max_rows'; an ambiguous pattern makes
# set_option raise OptionError.
pd.set_option('display.max_rows', None)
activity_counts = df1['Activity'].value_counts()
# Number of rows covered by the 13 most frequent activities...
print(activity_counts.iloc[:13].sum())
print('\n')
# ...and the share of all rows they represent.
activity_counts.iloc[:13].sum()/activity_counts.sum()
# We'll drop all activities that have fewer than ten entries.
# We're keeping 88% with this cut-off

In [None]:
# Keep-list of activities: the labels of the first 13 rows of value_counts(),
# i.e. the 13 most frequent activities.
top_activities = df1['Activity'].value_counts().iloc[:13]
act_list = top_activities.index.values

In [None]:
# Index labels of every row whose activity is not in the keep-list `act_list`.
# A vectorised isin() mask replaces growing an array with np.append inside a
# Python loop, which copied the whole array on every hit (quadratic) and
# round-tripped the integer labels through floats.
unlisted_activity = ~df1['Activity'].isin(act_list)
drop_act = df1.index[unlisted_activity.to_numpy()].tolist()

In [None]:
# Remove the rows collected in `drop_act` (activities outside the keep-list).
df1.drop(index = drop_act, inplace = True)

In [None]:
# Counts per remaining activity.
plt.figure(figsize = (15,5))
sns.countplot(x = 'Activity', data = df1)

In [None]:
# Same counts, split by the fatality indicator column 'Y'.
plt.figure(figsize = (15,5))
sns.countplot(x = 'Activity', data = df1, hue = 'Y')
# Swimming (and the activities that are lumped there) has the greatest fatality rate

In [None]:
# Dummy variables for the 'Activity' column: the indicator columns (first
# category dropped) replace the original text column.
act_dum = pd.get_dummies(df1['Activity'], drop_first=True)
df1 = pd.concat([df1.drop(columns='Activity'), act_dum], axis=1)

In [None]:
# Let's remove the 'Injury' column as it does not help with predictive properties.
# Also, we'll drop the 'Country' and 'Location' columns and focus on 'Area'.
df1.drop(['Injury', 'Country', 'Location'], axis = 1, inplace = True)

In [None]:
# Again, collecting the 'Area' entries with value counts greater than 9:
# the labels of the first 32 rows of value_counts() are kept.
area_list = df1['Area'].value_counts()[:32].index.values

In [None]:
# Index labels of every row whose area is not in the keep-list `area_list`.
# A vectorised isin() mask replaces growing an array with np.append inside a
# Python loop, which copied the whole array on every hit (quadratic) and
# round-tripped the integer labels through floats.
unlisted_area = ~df1['Area'].isin(area_list)
drop_area = df1.index[unlisted_area.to_numpy()].tolist()

In [None]:
# Remove the rows collected in `drop_area` (areas outside the keep-list).
df1.drop(index = drop_area, inplace = True)

In [None]:
# Counts per remaining area.
plt.figure(figsize = (50,5))
sns.countplot(x = 'Area', data = df1)

In [None]:
# Same counts, split by the fatality indicator column 'Y'.
plt.figure(figsize = (50,5))
sns.countplot(x = 'Area', data = df1, hue = 'Y')
# New South Wales has the greatest fatality rate per shark attack.

In [None]:
# Dummy variables for the 'Area' column: the indicator columns (first category
# dropped) replace the original text column.
area_dum = pd.get_dummies(df1['Area'], drop_first=True)
df1 = pd.concat([df1.drop(columns='Area'), area_dum], axis=1)

# Now for the Date Column...

In [None]:
# We'll keep only the year.

# First pass at a 'Year' column: the text before the first '-' of each date.
# Entries where that text is not four characters long are corrected in the
# cells that follow.
df1['Year'] = [date_text.split('-')[0] for date_text in df1['Date']]

In [None]:
# Correcting the non-year values.

# Flag every row whose first-pass 'Year' is not exactly four characters long.
# wrong_year_val holds the original 'Date' strings of those rows and
# wrong_year_ind the matching index labels, in the same order.
# A boolean mask replaces growing both arrays with np.append inside a Python
# loop, which copied them on every hit and round-tripped the labels through
# floats.
not_four_chars = (df1['Year'].str.len() != 4).to_numpy()
wrong_year_val = df1.loc[not_four_chars, 'Date'].to_numpy()
wrong_year_ind = df1.index[not_four_chars].tolist()

In [None]:
# Replacing the non-year values: each flagged row gets the text after the last
# '-' of its original date string.
# The value is written with df1.at[row, 'Year'] on the frame itself.
# df1['Year'].at[...] = ... is chained assignment, which may write to a
# temporary object and never updates df1 when copy-on-write is enabled.
for row_label, date_text in zip(wrong_year_ind, wrong_year_val):
    df1.at[row_label, 'Year'] = date_text.split('-')[-1]

In [None]:
# Final cleaning and dropping of the remaining erroneous values.
MAX_YEAR = 2021  # latest year accepted as valid

# Keep the last whitespace-separated token of each entry; a blank entry
# yields NaN here instead of raising IndexError.
year_token = df1['Year'].str.split().str[-1]
# Only tokens made of exactly four digits are years. Checking the digits
# (not just the length) keeps the integer conversion below from failing on
# four-character text.
is_year = year_token.str.fullmatch(r'\d{4}', na=False).to_numpy()
df1['Year'] = year_token
df1.drop(index=df1.index[~is_year], inplace=True)
df1['Year'] = df1['Year'].astype('int64')
# Compare as numbers rather than as strings.
df1.drop(index=df1.index[(df1['Year'] > MAX_YEAR).to_numpy()], inplace=True)
len(df1)

In [None]:
# Reported shark attacks have certainly increased... what happened in 1905?
# Counts per year, with the years ordered from most recent to oldest.
plt.figure(figsize=(100,5))
year_order = sorted(df1['Year'].unique(), reverse = True)
sns.countplot(x = 'Year',order = year_order, data = df1)

In [None]:
# Removing the 'Date' column now that 'Year' has been derived from it
df1.drop('Date', axis =1, inplace = True)

In [None]:
# Columns that remain after all the cleaning steps.
df1.columns

In [None]:
# We are now ready to train a classification model!
# List any columns that still have object dtype.
df1.select_dtypes(['object']).columns

In [None]:
# We'll attempt to predict whether or not an attack will be fatal
from sklearn.model_selection import train_test_split
# Features: every column except the fatality indicator 'Y'. Target: 'Y'.
X = df1.drop('Y', axis = 1).values
y = df1['Y'].values

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Training Models

# Logistic Regression

In [None]:
# We'll train three classification models: logistic regression, a decision
# tree classifier and a random forest classifier.

from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Classification report, confusion matrix and accuracy on the held-out set.
# The separator reproduces the two blank lines printed between the sections.
print(
    classification_report(y_test, lr_pred),
    confusion_matrix(y_test, lr_pred),
    accuracy_score(y_test, lr_pred),
    sep='\n\n\n',
)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed so the tree, and therefore the reported metrics, are
# reproducible across runs; scikit-learn permutes the features randomly at
# each split. 101 matches the seed used for the train/test split.
dtc = DecisionTreeClassifier(random_state=101)
dtc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_test)

In [None]:
# Classification report, confusion matrix and accuracy of the decision tree on
# the held-out set. The separator reproduces the two blank lines printed
# between the sections.
print(
    classification_report(y_test, dtc_pred),
    confusion_matrix(y_test, dtc_pred),
    accuracy_score(y_test, dtc_pred),
    sep='\n\n\n',
)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are reproducible across runs. 101 matches
# the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=101)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

In [None]:
# Classification report, confusion matrix and accuracy of the random forest on
# the held-out set. The separator reproduces the two blank lines printed
# between the sections.
print(
    classification_report(y_test, rfc_pred),
    confusion_matrix(y_test, rfc_pred),
    accuracy_score(y_test, rfc_pred),
    sep='\n\n\n',
)