In [1]:
# import the essentials

import pandas as pd
import numpy as np
pd.options.display.max_rows = 6000

In [2]:
# let's take a look at how big of a train wreck we got.

sharks = pd.read_csv('GSAF5.csv')
sharks.shape

(5992, 24)

In [3]:
# almost 6000 records - lit.

sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
# look at those useless spaces after some columns names - they trying me. let's lose those and then how about we format 
# the others  to something more standard that still maintains the integrity of the original names. lowercase letters, 
# underscores for spaces, and removing other special characters seems like a chill start.

sharks.columns = sharks.columns.str.rstrip(' ').str.replace(' ', '_').str.lower().str.replace('\W', '', regex = True)
sharks.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_yn', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number1', 'case_number2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

In [5]:
# sweet, let's get a preview of the actual data in these columns.

sharks.iloc[:, :12].head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark


In [6]:
sharks.iloc[:, 12:].head()

Unnamed: 0,fatal_yn,time,species,investigator_or_source,pdf,href_formula,href,case_number1,case_number2,original_order,unnamed_22,unnamed_23
0,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [7]:
# so i like grouping columns into meaningful chunks so i'm just gonna do some reordering.

sharks = sharks[['case_number', 'case_number1', 'case_number2', 'original_order', 'date', 'year', 'time', 'country', 
                 'area', 'location', 'name', 'sex', 'age', 'activity', 'type', 'injury', 'fatal_yn', 'species',
                 'investigator_or_source', 'pdf', 'href', 'href_formula', 'unnamed_22', 'unnamed_23']]

sharks.iloc[:, :12].head()

Unnamed: 0,case_number,case_number1,case_number2,original_order,date,year,time,country,area,location,name,sex
0,2016.09.18.c,2016.09.18.c,2016.09.18.c,5993,18-Sep-16,2016,13h00,USA,Florida,"New Smyrna Beach, Volusia County",male,M
1,2016.09.18.b,2016.09.18.b,2016.09.18.b,5992,18-Sep-16,2016,11h00,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M
2,2016.09.18.a,2016.09.18.a,2016.09.18.a,5991,18-Sep-16,2016,10h43,USA,Florida,"New Smyrna Beach, Volusia County",male,M
3,2016.09.17,2016.09.17,2016.09.17,5990,17-Sep-16,2016,,AUSTRALIA,Victoria,Thirteenth Beach,Rory Angiolella,M
4,2016.09.15,2016.09.16,2016.09.15,5989,16-Sep-16,2016,,AUSTRALIA,Victoria,Bells Beach,male,M


In [8]:
sharks.iloc[:, 12:].head()

Unnamed: 0,age,activity,type,injury,fatal_yn,species,investigator_or_source,pdf,href,href_formula,unnamed_22,unnamed_23
0,16.0,Surfing,Unprovoked,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,,
1,36.0,Surfing,Unprovoked,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,,
2,43.0,Surfing,Unprovoked,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,,
3,,Surfing,Unprovoked,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,,
4,,Surfing,Unprovoked,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,,


In [9]:
# there might be some repeating columns, which is chill. just gotta verify before dropping them.

comparisons = []
for i in range(len(sharks.columns)):
    for j in range(i + 1, len(sharks.columns)):
        comparisons_i = [sharks.columns[i], sharks.columns[j]]
        comparisons.append(comparisons_i)

match_ratios = []
for i in comparisons:
    match_ratios_i = [(sharks[i[0]] == sharks[i[1]]).sum() / len(sharks) * 100, i[0] + ' - ' + i[1]]
    match_ratios.append(match_ratios_i)
match_ratios.sort(reverse = True)
match_ratios[:10]

[[99.96662216288385, 'case_number - case_number2'],
 [99.81642189586115, 'case_number1 - case_number2'],
 [99.78304405874499, 'case_number - case_number1'],
 [99.09879839786382, 'href - href_formula'],
 [0.23364485981308408, 'activity - type'],
 [0.11682242990654204, 'area - location'],
 [0.05006675567423231, 'case_number2 - date'],
 [0.05006675567423231, 'case_number1 - date'],
 [0.05006675567423231, 'case_number - date'],
 [0.016688918558077435, 'time - injury']]

In [10]:
# let's get rid of the columns that are basically wastes of space for analysis.

sharks.drop(columns = ['case_number1', 'case_number2', 'href_formula'], inplace = True)
sharks.head()

Unnamed: 0,case_number,original_order,date,year,time,country,area,location,name,sex,...,activity,type,injury,fatal_yn,species,investigator_or_source,pdf,href,unnamed_22,unnamed_23
0,2016.09.18.c,5993,18-Sep-16,2016,13h00,USA,Florida,"New Smyrna Beach, Volusia County",male,M,...,Surfing,Unprovoked,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,,
1,2016.09.18.b,5992,18-Sep-16,2016,11h00,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,...,Surfing,Unprovoked,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,,
2,2016.09.18.a,5991,18-Sep-16,2016,10h43,USA,Florida,"New Smyrna Beach, Volusia County",male,M,...,Surfing,Unprovoked,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,,
3,2016.09.17,5990,17-Sep-16,2016,,AUSTRALIA,Victoria,Thirteenth Beach,Rory Angiolella,M,...,Surfing,Unprovoked,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,,
4,2016.09.15,5989,16-Sep-16,2016,,AUSTRALIA,Victoria,Bells Beach,male,M,...,Surfing,Unprovoked,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,,


In [11]:
# so about all those NaNs from the get-go.

sharks.isna().sum().sort_values(ascending = False)[:10] / len(sharks) * 100

unnamed_22    99.983311
unnamed_23    99.966622
time          53.621495
species       48.965287
age           44.742991
sex            9.462617
activity       8.795060
location       8.277704
area           6.708945
name           3.337784
dtype: float64

In [12]:
# look at those ratios, breh. time to drop all that negativity.

sharks.drop(columns = ['unnamed_22', 'unnamed_23'], inplace = True)
sharks.head()

Unnamed: 0,case_number,original_order,date,year,time,country,area,location,name,sex,age,activity,type,injury,fatal_yn,species,investigator_or_source,pdf,href
0,2016.09.18.c,5993,18-Sep-16,2016,13h00,USA,Florida,"New Smyrna Beach, Volusia County",male,M,16.0,Surfing,Unprovoked,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2016.09.18.b,5992,18-Sep-16,2016,11h00,USA,Florida,"New Smyrna Beach, Volusia County",Chucky Luciano,M,36.0,Surfing,Unprovoked,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2016.09.18.a,5991,18-Sep-16,2016,10h43,USA,Florida,"New Smyrna Beach, Volusia County",male,M,43.0,Surfing,Unprovoked,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2016.09.17,5990,17-Sep-16,2016,,AUSTRALIA,Victoria,Thirteenth Beach,Rory Angiolella,M,,Surfing,Unprovoked,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2016.09.15,5989,16-Sep-16,2016,,AUSTRALIA,Victoria,Bells Beach,male,M,,Surfing,Unprovoked,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...


In [13]:
# let's peep on them dtypes.

sharks.dtypes

case_number               object
original_order             int64
date                      object
year                       int64
time                      object
country                   object
area                      object
location                  object
name                      object
sex                       object
age                       object
activity                  object
type                      object
injury                    object
fatal_yn                  object
species                   object
investigator_or_source    object
pdf                       object
href                      object
dtype: object

In [14]:
# so i'm cool with everything except fatal_yn 'cause that can totally be a boolean

sharks['fatal_yn'].value_counts()

N          4315
Y          1552
UNKNOWN      94
 N            8
#VALUE!       1
n             1
N             1
F             1
Name: fatal_yn, dtype: int64

In [15]:
# jesus christ..

sharks.loc[sharks['fatal_yn'] == 'UNKNOWN', 'fatal_yn'] = None
sharks.loc[sharks['fatal_yn'] == '#VALUE!', 'fatal_yn'] = None
sharks.loc[sharks['fatal_yn'] == 'F', 'fatal_yn'] = False
sharks.loc[(sharks['fatal_yn'].notna()) & (sharks['fatal_yn'].str.contains('\s*[yY]\s*', regex = True)), 'fatal_yn'] = True
sharks.loc[(sharks['fatal_yn'].notna()) & (sharks['fatal_yn'].str.contains('\s*[nN]\s*', regex = True)), 'fatal_yn'] = False

sharks['fatal_yn'].value_counts()

False    4326
True     1552
Name: fatal_yn, dtype: int64

In [16]:
sharks['fatal_yn'] = sharks['fatal_yn'].astype('bool')
sharks = sharks.rename(columns = {'fatal_yn': 'fatal'})

sharks.dtypes

case_number               object
original_order             int64
date                      object
year                       int64
time                      object
country                   object
area                      object
location                  object
name                      object
sex                       object
age                       object
activity                  object
type                      object
injury                    object
fatal                       bool
species                   object
investigator_or_source    object
pdf                       object
href                      object
dtype: object

In [17]:
# sweet, so now that we know values can be anything, i'm gonna check the values across the
# columns to determine if any more cleaning is needed.

sharks['case_number'].value_counts()

1915.07.06.a.R        2
1990.05.10            2
2013.10.05            2
2012.09.02.b          2
1907.10.16.R          2
2005.04.06            2
1966.12.26            2
1962.06.11.b          2
1983.06.15            2
2009.12.18            2
2014.08.02            2
1980.07.00            2
1920.00.00.b          2
1923.00.00.a          2
1913.08.27.R          2
2006.09.02            2
1944.03.26.a          1
1952.05.27            1
1927.11.03            1
1951.08.00            1
1964.02.17.R          1
2008.02.15            1
1994.10.18            1
1943.03.21            1
1942.09.30            1
1999.09.16            1
1990.03.24            1
1930.08.16            1
1954.06.29            1
1919.00.00            1
1989.10.14            1
1966.01.25            1
1980.01.13            1
1959.07.00.b          1
1950.08.06            1
1985.01.16            1
1988.12.15            1
1986.11.19            1
1956.07.28            1
1981.10.19.a          1
1999.08.16            1
2001.09.16      

In [18]:
# looks good.

sharks['original_order'].value_counts()

5661    2
569     2
3847    2
5739    2
2047    1
2716    1
4755    1
2708    1
661     1
4759    1
2712    1
665     1
4763    1
4767    1
669     1
2704    1
2720    1
673     1
4771    1
2724    1
677     1
4775    1
2728    1
657     1
4751    1
4779    1
4735    1
4723    1
2676    1
629     1
4727    1
2680    1
633     1
4731    1
2684    1
637     1
2688    1
653     1
641     1
4739    1
2692    1
645     1
4743    1
2696    1
649     1
4747    1
2700    1
681     1
2732    1
2672    1
729     1
717     1
4815    1
2768    1
721     1
4819    1
2772    1
725     1
4823    1
2776    1
4827    1
685     1
2780    1
733     1
4831    1
2784    1
737     1
4835    1
2788    1
741     1
4839    1
2764    1
4811    1
713     1
2760    1
4783    1
2736    1
689     1
4787    1
2740    1
693     1
4791    1
2744    1
697     1
4795    1
2748    1
701     1
4799    1
2752    1
705     1
4803    1
2756    1
709     1
4807    1
625     1
4719    1
745     1
2592    1
2580    1
533     1


In [19]:
# looks good.

sharks['date'].value_counts()

1957                                                                11
1942                                                                 9
1956                                                                 8
1941                                                                 7
1950                                                                 7
1958                                                                 7
No date                                                              6
1949                                                                 6
28-Jul-95                                                            5
1940                                                                 5
1970s                                                                5
No date, Before 1963                                                 5
05-Oct-03                                                            5
Aug-56                                                               5
1959  

In [20]:
sharks['date'] = sharks['date'].str.strip(' \t.')

# months = {1: [1, '01', 'Jan', 'January'],
#           2: [2, '02', 'Feb', 'February'],
#           3: [3, '03', 'Mar', 'March'],
#           4: [4, '04', 'Apr', 'April'],
#           5: [5, '05', 'May', 'May'],
#           6: [6, '06', 'Jun', 'June'],
#           7: [7, '07', 'Jul', 'July'],
#           8: [8, '08', 'Aug', 'August'],
#           9: [9, '09', 'Sep', 'September'],
#           10: [10, '10', 'Oct', 'October'],
#           11: [11, '11', 'Nov', 'November'],
#           12: [12, '12', 'Dec', 'December']}

In [22]:
# rough...

lregex = '^(?:Report(e)*d *(in)*(to have taken place in)*|No date.*|Before.*|Ca *\. *(mid-)*|Circa|"(?:During|Before) the war"|Said to be 1941-1945, more likely|Some time between|(?:[Ee]arly|[Mm]id|[Ll]ate)* *(1600s Reported *)*(?:[Ss]ummer|[Ff]all|[Ww]inter|[Ss]pring)* *(?:of|-)*|A few years before.*|Letter dated |"Anniversary Day" |Last incident of ) *'
rregex = ' *(?:"Bitten last weekend|Reported( to have happened  "on the weekend")*|\.00|\.R|\.(?:a|b)|\(*\?\)*|(?:-|`)|\(soon after the close of the Spanish-American War\)|During the Seige of Ladysmith|\(same day as  [0-9]{4}\.00\.00\.f\)|\(probably happened Ca\. 1843/1844\)|, but took place around 1868|in Hong Kong)$'
pattern = f'{lregex}|{rregex}'

sharks['date_cleaned'] = sharks.loc[sharks['date'].str.contains(pattern, regex = True), 'date'].str.replace(pattern, '', regex = True)

# sharks.loc[sharks['date_cleaned'].str.contains('^[0-9]{2}-[A-z]{3}-[0-1]{1}[0-9]{1}$', regex = True), 'year2'] = '20' + sharks['date_cleaned'].str[-2:]
# sharks.loc[sharks['date_cleaned'].str.contains('^[0-9]{2}-[A-z]{3}-[2-9]{1}[0-9]{1}$', regex = True), 'year2'] = '19' + sharks['date_cleaned'].str[-2:]

sharks[['date', 'date_cleaned']].sort_values(by = 'date')

# CHECKING REGEX CHANGES

# new_values = sharks.loc[sharks['date'].str.contains(pattern, regex = True), 'date'].str.replace(pattern, '', regex = True).rename('new_date')
# old_values = sharks.loc[sharks['date'].str.contains(pattern, regex = True), 'date']
# comparisons = pd.concat([old_values, new_values], axis = 1)
# print(comparisons.shape[0])
# comparisons.sort_values(by = 'date')

  import sys


Unnamed: 0,date,date_cleaned
5750,"""Anniversary Day"" 22-Jan-1850 or 1852",22-Jan-1850 or 1852
5951,"""Before the war""",
5950,"""During the war"" 1943-1945",1943-1945
1648,01-Apr-02,01-Apr-02
1167,01-Apr-07,01-Apr-07
4714,01-Apr-34,01-Apr-34
4713,01-Apr-34,01-Apr-34
4712,01-Apr-34,01-Apr-34
3777,01-Apr-60,01-Apr-60
3111,01-Apr-72,01-Apr-72


In [None]:
#.str.replace('^(?:Reported|No date,*|Ca.*|Circa.*|n) *', '', regex = True)
#.str.replace(' *(?:Reported|\(?\?\)?)', '')
#.b-`.a').str.rstrip('.00').str.rstrip(' Reported to have happened  "on the weekend"').str.rstrip(', but took place around 1868').str.rstrip(' (probably happened Ca. 1843/1844')
# sharks.loc[sharks['date'].str.contains('^[0-9]{4}$', regex = True), 'year2'] = sharks['date']
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}-[0-9]{2}$', regex = True), 'year2'] = '19' + sharks['date'].str[-2:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3}-[0-9]{2}$', regex = True), 'year2'] = '19' + sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3} [0-9]{4}$', regex = True), 'year2'] = '19' + sharks['date'].str[-2:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{4} [0-9]{4}$', regex = True), 'year2'] = '19' + sharks['date'].str[-2:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{7} [0-9]{4}$', regex = True), 'year2'] = '19' + sharks['date'].str[-2:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{2}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{1}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}- [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}- [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3} [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}--[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{4}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{5}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{5} [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{4}.[0-9]{2}.[0-9]{2}$', regex = True), 'year2'] = sharks['date'].str[:4]
# sharks.loc[sharks['date'].str.contains('^[0-9]{4} \(same day as  [0-9]{4}.00.00.f\)$', regex = True), 'year2'] = sharks['date'].str[:4]
# sharks.loc[sharks['date'].str.contains('^v-[0-9]{2}$', regex = True), 'year2'] = '19' + sharks['date'].str[-2:]
# sharks.loc[sharks['date'].str.contains('^v- or [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = '19' + sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Summer|Fall|Winter|Spring|Mid|Early|Late)-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Summer|Fall|Winter|Spring|Mid|Early|Late) [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Summer|Fall|Winter|Spring|Mid|Early|Late) of [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Summer|Fall|Winter|Spring|Mid|Early|Late) [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Summer|Fall|Winter|Spring|Mid|Early|Late) [A-z]{3} [0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^(?:Early|Late) [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}-[0-9]{4} or [0-9]{2}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2} or [0-9]{2}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}-[A-z]{3}-[0-9]{4} to [0-9]{2}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{4} or [0-9]{2}-[A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3}-[0-9]{4} or [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[A-z]{3} or [A-z]{3}-[0-9]{4}$', regex = True), 'year2'] = sharks['date'].str[-4:]
# sharks.loc[sharks['date'].str.contains('^[0-9]{1} A.D$', regex = True), 'year2'] = '-' + sharks['date'].str[:1]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2} A.D$', regex = True), 'year2'] = '-' + sharks['date'].str[:1]
# sharks.loc[sharks['date'].str.contains('^[0-9]{2}  A.D$', regex = True), 'year2'] = '-' + sharks['date'].str[:1]
# sharks.loc[sharks['date'].str.contains('^[0-9]{3} A.D$', regex = True), 'year2'] = '-' + sharks['date'].str[:3]
# sharks.loc[sharks['date'].str.contains('^[0-9]{3} B.C$', regex = True), 'year2'] = '-' + sharks['date'].str[:3]
# sharks.loc[sharks['date'].str.contains('^[0-9]{3}.B.C$', regex = True), 'year2'] = '-' + sharks['date'].str[:3]

# print(sharks.loc[(sharks['year2'].isna()) & (sharks['date'].str.contains('^(?:Before|After|Between) ', regex = True) == False), 'date'].value_counts().sum())
# print(sharks.loc[(sharks['year2'].isna()) & (sharks['date'].str.contains('^(?:Before|After|Between) ', regex = True) == False), 'date'].value_counts()[: 60] / len(sharks) * 100)
# print(sharks.loc[(sharks['year2'].isna()) & (sharks['date'].str.contains('^(?:Before|After|Between) ', regex = True) == False), 'date'].value_counts()[60: 120] / len(sharks) * 100)