This EDA nb is using occurrence tables e-mailed from Tiago on 9/27 (duplicated, with rotated lat and long columns)

## Imports + Loading In Data

In [302]:
import pandas as pd


In [303]:
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [304]:
# Loading in data w/ rotated coordinates
rep = pd.read_excel('reptilia_occ_genus.xlsm', engine='openpyxl')
syn = pd.read_excel('synapsida_occ_genus.xlsm', engine='openpyxl')
tem = pd.read_excel('temnospondyli_occ_genus.xlsm', engine='openpyxl')

In [305]:
rep['county'].isna().sum()

1495

In [306]:
rep['county'].value_counts()

Apache                    234
Comanche                  152
Rio Arriba                 88
Franklin                   78
Gloucestershire            67
                         ... 
Malmyzh                     1
Anson                       1
Sa√É¬¥ne-et-Loire           1
Huesca                      1
Aliwal North Commonage      1
Name: county, Length: 355, dtype: int64

## Saving Nulls as CSV's

In [307]:
# Seeing how many nulls are in each column will help us decide which columns to drop
nulls_rep = rep.isna().sum()
nulls_syn = syn.isna().sum()
nulls_tem = tem.isna().sum()

# Concatenating the nulls into a single dataframe based on a shared index
# This will allow us to compare the nulls across the three datasets
nulls = pd.concat([nulls_rep, nulls_syn, nulls_tem], axis=1)
nulls.columns = ['Reptilia', 'Synapsida', 'Temnospondyli']

nulls.to_csv('nulls.csv')

In [308]:
# len of nulls df is diff from len of columns in rep
# which means there are columns that are not shared
len(nulls.index), len(rep.columns)

(119, 94)

In [309]:
# Finding which columns rep, syn, and tem do not share
# Crude code, just to see if there are any differences, not finding all differences
# Because any not-shared columns wouldn't be included in the analysis anyway

if rep.columns.all() == syn.columns.all() == tem.columns.all():
    print('All columns are the same')
else:
    diff = rep.columns.difference(syn.columns).difference(tem.columns)
    print(diff)

Index(['abund_in_sediment', 'artifacts', 'component_comments', 'concentration',
       'fossilsfrom1', 'fossilsfrom2', 'lagerstatten', 'lithadj2', 'localbed',
       'localsection', 'minor_lithology2', 'orientation', 'reference_no.x',
       'temporal_resolution'],
      dtype='object')


## Confirming that these are Datasets w/ duplicated rows (eda_duplicates)

In [310]:
# Loading in datasets that I had previously duplicated (i.e., the output of eda_duplicates.ipynb)
rep_old = pd.read_csv('occurrence_tables_1/reptilia_dup.csv')
syn_old = pd.read_csv('occurrence_tables_1/synapsida_dup.csv')
tem_old = pd.read_csv('occurrence_tables_1/temnospondyli_dup.csv')

In [311]:
# Checking that the lengths of the original and new datasets are the same
len(rep_old), len(rep), len(syn_old), len(syn), len(tem_old), len(tem)

(4411, 4411, 3422, 3422, 2247, 2247)

In [312]:
# One last double-check to make sure that the datasets are the same
# All 'abund_values' for "individuals" should be 1
individuals = rep[rep['abund_unit'] == 'individuals']
individuals['abund_value'].value_counts()

1.0    2032
Name: abund_value, dtype: int64

## Filtering to BDNN columns

In [313]:
rep.columns

Index(['id', 'Rotated Lat', 'Rotated Lon', 'occurrence_no', 'pres_mode',
       'preservation_quality', 'common_body_parts', 'abund_value',
       'abund_unit', 'class', 'family', 'genus', 'taxon_environment',
       'life_habit', 'diet', 'early_interval', 'late_interval', 'time_bins',
       'museum', 'collection_no', 'age_max', 'age_min', 'age_median',
       'age_uncer_range', 'lng', 'lat', 'environment', 'occurrence_comments',
       'cc', 'state', 'county', 'formation', 'stratgroup', 'member', 'zone',
       'accepted_rank', 'ref_author', 'ref_pubyr', 'reference_no.x',
       'collection_name', 'collection_subset', 'collection_aka',
       'latlng_basis', 'latlng_precision', 'geogscale', 'geogcomments',
       'paleomodel', 'geoplate', 'paleoage', 'paleolng', 'paleolat',
       'paleomodel2', 'geoplate2', 'paleoage2', 'paleolng2', 'paleolat2',
       'paleomodel3', 'geoplate3', 'paleoage3', 'paleolng3', 'paleolat3',
       'protected', 'stratscale', 'localsection', 'localbed', 'st

In [314]:
# Still waiting on 'niche' info from Arielli
syn_filtered = syn[['Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
syn_filtered.isna().sum()

Rotated Lat            0
Rotated Lon            0
genus                  0
taxon_environment    310
age_max                0
age_min                0
dtype: int64

In [315]:
# Still waiting on 'niche' info from Arielli
tem_filtered = tem[['Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
tem_filtered.isna().sum()

Rotated Lat           0
Rotated Lon           0
genus                 0
taxon_environment    14
age_max               0
age_min               0
dtype: int64

In [316]:
# Still waiting on 'niche' info from Arielli
rep_filtered = rep[['Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
rep_filtered.isna().sum()

Rotated Lat          0
Rotated Lon          0
genus                0
taxon_environment    0
age_max              0
age_min              0
dtype: int64

In [317]:
rep_filtered['taxon_environment'].head(20)

0     terrestrial
1     terrestrial
2     terrestrial
3     terrestrial
4     terrestrial
5     terrestrial
6     terrestrial
7     terrestrial
8     terrestrial
9     terrestrial
10    terrestrial
11    terrestrial
12    terrestrial
13    terrestrial
14    terrestrial
15    terrestrial
16    terrestrial
17    terrestrial
18    terrestrial
19    terrestrial
Name: taxon_environment, dtype: object

In [318]:
rep['environment'].head(20)

0     lacustrine - large
1     lacustrine - large
2           fissure fill
3           fissure fill
4         fluvial indet.
5     terrestrial indet.
6     terrestrial indet.
7           fissure fill
8           fissure fill
9           fissure fill
10          fissure fill
11          fissure fill
12          fissure fill
13          fissure fill
14          fissure fill
15          fissure fill
16          fissure fill
17          fissure fill
18          fissure fill
19          fissure fill
Name: environment, dtype: object

In [319]:
rep_filtered.to_csv('reptilia_processed_data/reptilia_bdnn.csv')
syn_filtered.to_csv('synapsida_processed_data/synapsida_bdnn.csv')
tem_filtered.to_csv('temnospondyli_processed_data/temnospondyli_bdnn.csv')


## Prepping DeepDive columns

In [320]:
rep_deepdive = rep[['genus', 'county', 'state', 'occurrence_no', 'age_max', 'age_min']]
rep_deepdive.head()

Unnamed: 0,genus,county,state,occurrence_no,age_max,age_min
0,Icarosaurus,Hudson,New Jersey,146084,227.0,208.5
1,Rutiodon,Hudson,New Jersey,146085,227.0,208.5
2,Kuehneosuchus,Somerset,England,146086,208.5,201.4
3,Kuehneosaurus,Somerset,England,146087,208.5,201.4
4,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2


In [321]:
syn_deepdive = syn[['genus', 'county', 'state', 'occurrence_no','age_max', 'age_min']]
tem_deepdive = tem[['genus', 'county', 'state', 'occurrence_no', 'age_max', 'age_min']]

### 'County' and 'State' Cols EDA

#### Are 'County' Null Counts Reasonable?

In [322]:
rep_deepdive.isna().sum()

# 'county' has a lot of nulls, so I'm going to look back at the older datasets and see if that seems correct or
# if an error occurred when I was duplicating the rows

genus               0
county           1495
state             212
occurrence_no       0
age_max             0
age_min             0
dtype: int64

In [323]:
# Making sure occurrence no has >1 values, since there are duplicate rows (duplicated in eda_duplicates.ipynb)
rep['occurrence_no'].value_counts()

1285964    300
629477      91
1586159     37
902070      33
486411      25
          ... 
830763       1
830777       1
830855       1
830859       1
N84          1
Name: occurrence_no, Length: 3145, dtype: int64

In [324]:
# Loading in my original duplication output
rep_original = pd.read_csv('occurrence_tables_1/reptilia_dup.csv')
rep_original_filtered = rep_original[['genus', 'county', 'state']]
rep_original_filtered.isna().sum()

genus        0
county    1495
state      212
dtype: int64

In [325]:
# Loading in the very first dataset (raw, no rotated lat and long)
rep_old = pd.read_csv('occurrence_tables_1/reptilia_genus_occ_all.csv', encoding = 'latin1')
rep_old_filtered = rep_old[['genus', 'county', 'state']]
rep_old_filtered.isna().sum()

genus       0
county    920
state     197
dtype: int64

#### Subsetting and Saving Just Duplicated Rows

In [326]:
# Saving off just the parts in the deepdive dataset that were duplicated occurrences (i.e., the rows that were duplicated in eda_duplicates.ipynb)
# Trying to see if there's a pattern in the 'county' nulls that I can fix
rep_deepdive_dups = rep_deepdive[rep_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
rep_deepdive_dups.to_csv('reptilia_processed_data/reptilia_deepdive_dups.csv')

syn_deepdive_dups = syn_deepdive[syn_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
syn_deepdive_dups.to_csv('synapsida_processed_data/synapsida_deepdive_dups.csv')

tem_deepdive_dups = tem_deepdive[tem_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
tem_deepdive_dups.to_csv('temnospondyli_processed_data/temnospondyli_deepdive_dups.csv')

#### Fixing Nonsensical Strings in 'County' and 'State'

In [327]:
# Return rows in 'county' and 'state' columns that contain any non-letter characters
import re

# Function for outputting a CSV of unique values in a column that contain special characters
def find_special_chars(df, col, clade):
    df2 = df.copy()
    df2[col] = df2[col].fillna('') # Have to fill empty values with something to use regex
    non_letter = df2[df2[col].str.contains(r'[^a-zA-Z\s\-/\'\,\(\)]', na=False)] # Regex for non-letter characters, excluding hyphens and forward slashes
    unique = non_letter[col].unique()
    # Saving unique values to a CSV named after the clade and column
    unique_df = pd.DataFrame(unique)
    # unique_df.to_csv(f'{clade}_processed_data/{clade}_{col}_special_chars.csv') # Used this already during first run, so commenting out
    # ^ Commenting out so I don't overwrite the files I've already saved. This is just for the first run.
    return unique

In [328]:
# Reptilia
find_special_chars(rep_deepdive, 'county', 'reptilia'), find_special_chars(rep_deepdive, 'state', 'reptilia')

(array(['Valle Fert√É¬≠l', 'Valle F√É¬©rtil', 'Vend√É¬©e',
        'Wei√É¬üenburg-Gunzenhausen', 'S√É¬£o Gabriel', 'L√É¬∂rrach',
        'Portel√É¬¢ndia', 'Perol√É¬¢ndia', 'S√É¬£o Mateus do Sul',
        'Tatu√É¬≠', 'Rebou√É¬ßas', 'Candel√É¬°ria', 'N√É¬ºrnberger Land',
        'Schw√É¬§bisch Hall', 'S√É¬£o Pedro do Sul', 'G√É¬∂ttingen',
        'Sa√É¬¥ne-et-Loire', 'Baden-W√É¬ºrttemberg', 'Th√É¬ºringen',
        'T√É¬ºbingen', 'S√É¬¢one-et-Loire', 'H√É¬©rault',
        'S√É¬£o Jo√É¬£o do Pol√É¬™sine', 'Yuan√¢¬Ä¬ôan',
        'Tarnowskie G√É¬≥ry', 'Arroio do S¬õ', 'Linha S?o Luiz',
        'Toroqu√ø', 'Ribeir?o', 'Rinc?o dos Weiss', 'Acegu√ø',
        'Sesmaria do Pinhal 1',
        'cff Rafael prov√øvel material do MNRJ se sao os mesmo do livro tombo, Alemoa',
        'Cerrito?', 'Sesmaria do Pinhal 2', 'V√ørzea do Agudo',
        '?gua Negra', 'BR 158 federal road', 'Porto Mariante 2',
        'Sao Jose?', 'Sao Jose? (embaixo do cemit?rio coletado por Price)'],
       dtype=object),
 

In [329]:
# Temnospondyli
find_special_chars(tem_deepdive, 'county', 'temnospondyli'), find_special_chars(tem_deepdive, 'state', 'temnospondyli')

(array(['Wei√É¬üeritzkreis', 'Schw√É¬§bisch Hall', 'T√É¬ºbingen',
        'Sa√É¬¥ne-et-Loire', 'S√É¬£o Jo√É¬£o do Pol√É¬™sine',
        'S√É¬£o Gabriel', 'Valle F√É¬©rtil', 'S?o Jer¬ìnimo da Serra',
        'Toroqu¬†'], dtype=object),
 array(['Baden-W√É¬ºrttemberg', 'Sk√É¬•ne', 'Th√É¬ºringen',
        'Azad Jammu & Kashmir', 'Rakovn√É¬≠k', 'Th√É¬ºringer Wald',
        'Th√É¬ºringia', 'Paran√É¬°', 'Baden-W√É¬ºrtteberg'], dtype=object))

In [330]:
# Synapsida
find_special_chars(syn_deepdive, 'county', 'synapsida'), find_special_chars(syn_deepdive, 'state', 'synapsida')

(array(['Valle Fert√É¬≠l', 'Sa√É¬¥ne-et-Loire', 'Wei√É¬üeritzkreis',
        'Schw√É¬§bisch Hall', 'S√É¬£o Gabriel', 'Candel√É¬°ria',
        'Kotel√¢¬Ä¬ônichskii', 'T√É¬ºbingen', '√É¬úr√É¬ºmqi', 'H√É¬©rault',
        'Valle F√É¬©rtil', 'Vila Est¬Éncia Nova', 'Linha S?o Luiz',
        'Linha S?o Luiz ', 'Sesmaria do Pinhal 1', 'V¬†rzea do Agudo',
        'Botucara¬°', 'Rinc?o do Pinhal', 'Linha V¬†rzea 2',
        'Linha V¬†rzea 1', 'Rinc?o do Semi?o ', 'Porto Mariante 2',
        'Rinc?o da Porta', 'Catu¬áaba', 'BR 158 federal road',
        'Linha Fac?o', 'Chiniqu¬†'], dtype=object),
 array(['Franch-Comt√É¬©', 'R√É¬≠o Negro', 'Baden-W√É¬ºrttemberg',
        'Rakovn√É¬≠k', 'Paran√É¬°', 'H√É¬©rault'], dtype=object))

In [331]:
# Special Characters Dictionary for Reptilia
# To map special characters to their correct values

rep_state_dict = {
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Nieder√É¬∂sterreich": "Niederosterreich",
    "T√É¬°chira": "Tachira",
    "Sa√É¬Øda": "Saida",
    "Sk√É¬•ne": "Skane",
    "Goi√É¬°s": "Goias",
    "Paran√É¬°": "Parana",
    "S√É¬£o Paulo": "Sao Paulo",
    "Cear√É¬°": "Ceara",
    "Guair√É¬°": "Guaira",
    "Bourgogne-Franche-Comt√É¬©": "Bourgogne-Franche-Comte",
    "Baden-W√É¬ºrttenburg": "Baden-Wurttemberg",
    "Nieder√É¬∂stereich": "Niederosterreich",
    "Bayern (Bavaria)": "Bayern",
    "Baden-W√É¬ºrtteberg": "Baden-Wurttemberg",
    "Baden-W√É¬ºrttemburg": "Baden-Wurttemberg",
    "Bansk√É¬° Bystrica": "Banska Bystrica",
    "Graub√É¬ºnden": "Graubunden"
}

rep_county_dict = {
    "Valle Fert√É¬≠l": "Valle Fertil",
    "Valle F√É¬©rtil": "Valle Fertil",
    "Vend√É¬©e": "Vendee",
    "Wei√É¬üenburg-Gunzenhausen": "Weissenburg-Gunzenhausen",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "L√É¬∂rrach": "Lorrach",
    "Portel√É¬¢ndia": "Portelandia",
    "Perol√É¬¢ndia": "Perolandia",
    "S√É¬£o Mateus do Sul": "Sao Mateus do Sul",
    "Tatu√É¬≠": "Tatui",
    "Rebou√É¬ßas": "Reboucas",
    "Candel√É¬°ria": "Candelaria",
    "N√É¬ºrnberger Land": "Nurnberger Land",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "S√É¬£o Pedro do Sul": "Sao Pedro do Sul",
    "G√É¬∂ttingen": "Gottingen",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Th√É¬ºringen": "Thuringen",
    "T√É¬ºbingen": "Tubingen",
    "S√É¬¢one-et-Loire": "Saone-et-Loire",
    "H√É¬©rault": "Herault",
    "S√É¬£o Jo√É¬£o do Pol√É¬™sine": "Sao Joao do Polesine",
    "Yuan√¢¬Ä¬ôan": "Yuan'an",
    "Tarnowskie G√É¬≥ry": "Tarnowskie Gory",
    "Arroio do S¬õ": "Arroio do So",
    "Linha S?o Luiz": "Linha Sao Luiz",
    "Toroqu√ø": "Toroqua",
    "Ribeir?o": "Ribeirao",
    "Rinc?o dos Weiss": "Rincao dos Weiss",
    "Acegu√ø": "Acegua",
    "Sesmaria do Pinhal 1": "Sesmaria do Pinhal 1",
    "cff Rafael prov√øvel material do MNRJ se sao os mesmo do livro tombo, Alemoa": "cff Rafael provavel material do MNRJ se sao os mesmo do livro tombo, Alemoa",
    "Cerrito?": "Cerrito",
    "Sesmaria do Pinhal 2": "Sesmaria do Pinhal 2",
    "V√ørzea do Agudo": "Varzea do Agudo",
    "?gua Negra": "Agua Negra",
    "BR 158 federal road": "BR 158 federal road",
    "Porto Mariante 2": "Porto Mariante 2",
    "Sao Jose?": "Sao Jose",
    "Sao Jose? (embaixo do cemit?rio coletado por Price)": "Sao Jose"
}




In [332]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
rep_deepdive_cleaned = rep_deepdive.copy()
rep_deepdive_cleaned['state'] = rep_deepdive_cleaned['state'].replace(rep_state_dict_2)
rep_deepdive_cleaned['county'] = rep_deepdive_cleaned['county'].replace(rep_county_dict_2)

# Checking to see if the cleaning worked
print(find_special_chars(rep_deepdive_cleaned, 'county', 'reptilia'))
print(find_special_chars(rep_deepdive_cleaned, 'state', 'reptilia'))

['BR 158 federal road' 'Porto Mariante 2']
[]


In [333]:
# Special Characters Dictionary for Synapsida

syn_state_dict = {
    "Franch-Comt√É¬©": "Franche-Comte",
    "R√É¬≠o Negro": "Rio Negro",
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Rakovn√É¬≠k": "Rakovnik",
    "Paran√É¬°": "Parana",
    "H√É¬©rault": "Herault"
}
syn_county_dict = {
    "Valle Fert√É¬≠l": "Valle Fertil",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "Wei√É¬üeritzkreis": "Weisseritzkreis",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "Candel√É¬°ria": "Candelaria",
    "Kotel√¢¬Ä¬ônichskii": "Kotelnichskii",
    "T√É¬ºbingen": "Tubingen",
    "√É¬úr√É¬ºmqi": "Urumqi",
    "H√É¬©rault": "Herault",
    "Valle F√É¬©rtil": "Valle Fertil",
    "Vila Est¬Éncia Nova": "Vila Estancia Nova",
    "Linha S?o Luiz": "Linha Sao Luiz",
    "Linha S?o Luiz ": "Linha Sao Luiz",
    "V¬†rzea do Agudo": "Varzea do Agudo",
    "Botucara¬°": "Botucara",
    "Rinc?o do Pinhal": "Rincao do Pinhal",
    "Linha V¬†rzea 2": "Linha Varzea 2",
    "Linha V¬†rzea 1": "Linha Varzea 1",
    "Rinc?o do Semi?o ": "Rincao do Semiao",
    "Rinc?o da Porta": "Rincao da Porta",
    "Catu¬áaba": "Catuaba",
    "Linha Fac?o": "Linha Facao",
    "Chiniqu¬†": "Chinique"
}

In [334]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
syn_deepdive_cleaned = syn_deepdive.copy()
syn_deepdive_cleaned['state'] = syn_deepdive_cleaned['state'].replace(syn_state_dict)
syn_deepdive_cleaned['county'] = syn_deepdive_cleaned['county'].replace(syn_county_dict)

# Checking to see if the cleaning worked
print(find_special_chars(syn_deepdive_cleaned, 'county', 'synapsida'))
print(find_special_chars(syn_deepdive_cleaned, 'state', 'synapsida'))

['Sesmaria do Pinhal 1' 'Linha Varzea 2' 'Linha Varzea 1'
 'Porto Mariante 2' 'BR 158 federal road']
[]


In [335]:
# Special Characters Dictionary for Temnospondyli

tem_state_dict = {
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Sk√É¬•ne": "Skane",
    "Th√É¬ºringen": "Thuringen",
    "Rakovn√É¬≠k": "Rakovnik",
    "Th√É¬ºringer Wald": "Thuringer Wald",
    "Th√É¬ºringia": "Thuringia",
    "Paran√É¬°": "Parana",
    "Baden-W√É¬ºrtteberg": "Baden-Wurttemberg"
}

tem_county_dict = euro_southam_location_dict = {
    "Wei√É¬üeritzkreis": "Weisseritzkreis",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "T√É¬ºbingen": "Tubingen",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "S√É¬£o Jo√É¬£o do Pol√É¬™sine": "Sao Joao do Polesine",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "Valle F√É¬©rtil": "Valle Fertil",
    "S?o Jer¬ìnimo da Serra": "Sao Jeronimo da Serra",
    "Toroqu¬†": "Toroqua"
}

In [336]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
tem_deepdive_cleaned = tem_deepdive.copy()
tem_deepdive_cleaned['state'] = tem_deepdive_cleaned['state'].replace(tem_state_dict)
tem_deepdive_cleaned['county'] = tem_deepdive_cleaned['county'].replace(tem_county_dict)

# Checking to see if the cleaning worked
print(find_special_chars(tem_deepdive_cleaned, 'county', 'temnospondyli'))
print(find_special_chars(tem_deepdive_cleaned, 'state', 'temnospondyli'))

[]
['Azad Jammu & Kashmir']


#### Imputing 'County' Nulls

#### Reptilia 'County' Nulls

In [337]:
def impute_county_nulls(df):
    df_imputed = df.copy()
    df_imputed.loc[df_imputed['county'] == '-', 'county'] = df_imputed['state']
    df_imputed.loc[df_imputed['county'].isna(), 'county'] = df_imputed['state']
    print('County hyphens count:', len(df_imputed.loc[df_imputed['county'] == '-', 'county']))
    print('County nulls count:', len(df_imputed.loc[df_imputed['county'].isna(), 'county']))
    print (df_imputed.loc[df_imputed['county'].isna()])
    return df_imputed

In [338]:
rep_deepdive_cleaned_imputed = impute_county_nulls(rep_deepdive_cleaned)
rep_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 198
                genus county state occurrence_no  age_max  age_min
56     Neusticosaurus    NaN   NaN        150315    247.2   237.00
57         Mixosaurus    NaN   NaN        150316    247.2   237.00
58      Rhipaeosaurus    NaN   NaN        219963    266.9   264.28
94      Proganochelys    NaN   NaN        283825    216.7   213.20
95         Aetosaurus    NaN   NaN        283826    227.0   208.50
...               ...    ...   ...           ...      ...      ...
4178      Macrocnemus    NaN   NaN       1600125    243.8   239.70
4179      Lariosaurus    NaN   NaN       1600126    243.8   239.70
4180    Askeptosaurus    NaN   NaN       1600127    243.8   239.70
4181  Helveticosaurus    NaN   NaN       1600128    243.8   239.70
4235   Prosantosaurus    NaN   NaN       1648022    242.0   239.70

[198 rows x 6 columns]


Unnamed: 0,genus,county,state,occurrence_no,age_max,age_min
0,Icarosaurus,Hudson,New Jersey,146084,227.000,208.5
1,Rutiodon,Hudson,New Jersey,146085,227.000,208.5
2,Kuehneosuchus,Somerset,England,146086,208.500,201.4
3,Kuehneosaurus,Somerset,England,146087,208.500,201.4
4,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2
...,...,...,...,...,...,...
4406,Eohyosaurus,FSTATE,FSTATE,N80,247.200,242.0
4407,Mesosuchus,ECAPE,ECAPE,N81,247.200,242.0
4408,Mesosuchus,ECAPE,ECAPE,N82,247.200,242.0
4409,Mesosuchus,ECAPE,ECAPE,N83,247.200,242.0


In [339]:
syn_deepdive_cleaned_imputed = impute_county_nulls(syn_deepdive_cleaned)
syn_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 180
                genus county state occurrence_no  age_max  age_min
22          Diictodon    NaN   NaN        229268  264.280  254.140
31     Kuehneotherium    NaN   NaN        283943  208.500  201.400
32    Brachyzostrodon    NaN   NaN        283944  208.500  201.400
33       Lystrosaurus    NaN   NaN        286245  251.902  247.200
34       Lystrosaurus    NaN   NaN        286246  251.902  247.200
...               ...    ...   ...           ...      ...      ...
3062     Lystrosaurus    NaN   NaN       1587697  251.902  247.200
3072      Notictoides    NaN   NaN       1607015  251.902  247.200
3073        Dicynodon    NaN   NaN       1607103  298.900  251.902
3074        Dicynodon    NaN   NaN       1607105  298.900  251.902
3101    Aelurognathus    NaN   NaN       1651601  259.510  254.140

[180 rows x 6 columns]


Unnamed: 0,genus,county,state,occurrence_no,age_max,age_min
0,Edaphosaurus,Ohio,West Virginia,147591,303.7,298.9
1,Ophiacodon,Archer,Texas,148250,290.1,283.5
2,Therioherpeton,Santa Maria,Rio Grande do Sul,149619,237.0,208.5
3,Ophiacodon,San Juan,Utah,149745,298.9,290.1
4,Adelobasileus,Crosby,Texas,149837,227.0,208.5
...,...,...,...,...,...,...
3417,Santacruzodon,Schoenstatt,RS,N506,237.0,227.0
3418,Santacruzodon,Schoenstatt,RS,N507,237.0,227.0
3419,Dinodontosaurus,Pinheiro,RS,N508,242.0,237.0
3420,Dinodontosaurus,Porto Mariante 2,RS,N509,242.0,237.0


In [340]:
tem_deepdive_cleaned_imputed = impute_county_nulls(tem_deepdive_cleaned)
tem_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 53
                genus county state occurrence_no  age_max  age_min
13     Platyoposaurus    NaN   NaN        219959  266.900   264.28
14     Platyoposaurus    NaN   NaN        219960  266.900   264.28
15         Melosaurus    NaN   NaN        219961  266.900   264.28
16        Dvinosaurus    NaN   NaN        219962  266.900   264.28
768    Mastodonsaurus    NaN   NaN        929804  242.000   237.00
844      Parotosuchus    NaN   NaN       1081097  247.200   242.00
845         Kryostega    NaN   NaN       1081098  247.200   242.00
967       Aphaneramma    NaN   NaN       1131219  251.200   248.90
968       Aphaneramma    NaN   NaN       1131223  251.200   248.90
970   Lyrocephaliscus    NaN   NaN       1131869  251.200   248.90
971   Lyrocephaliscus    NaN   NaN       1131895  251.200   248.90
972   Lyrocephaliscus    NaN   NaN       1131896  251.200   248.90
973   Lyrocephaliscus    NaN   NaN       1131898  251.200   248.90
974       Aphan

Unnamed: 0,genus,county,state,occurrence_no,age_max,age_min
0,Anconastes,Rio Arriba,New Mexico,146665,305.9,298.90
1,Broiliellus,Baylor,Texas,147563,283.5,273.01
2,Broiliellus,Clay,Texas,147564,298.9,286.10
3,Eryops,Archer,Texas,148243,290.1,283.50
4,Aspidosaurus,Archer,Texas,148244,290.1,283.50
...,...,...,...,...,...,...
2242,Rastosuchus,Morro do Mulato,PR,N5,266.9,264.30
2243,Rastosuchus,Morro do Mulato,PR,N6,266.9,264.30
2244,Rastosuchus,Morro do Mulato,PR,N7,266.9,264.30
2245,Compsocerops,Buriol,RS,N8,227.0,208.50


#### Dropping Remaining 'County' Null Rows

In [341]:
rep_deepdive_cleaned_imputed.isna().sum()

genus              0
county           198
state            212
occurrence_no      0
age_max            0
age_min            0
dtype: int64

In [342]:
syn_deepdive_cleaned_imputed.isna().sum()

genus              0
county           180
state            180
occurrence_no      0
age_max            0
age_min            0
dtype: int64

In [343]:
tem_deepdive_cleaned_imputed.isna().sum()

genus             0
county           53
state            55
occurrence_no     0
age_max           0
age_min           0
dtype: int64

# Final DeepDive Processing

In [351]:
# Dropping county nulls
rep_deepdive_final = rep_deepdive_cleaned_imputed.dropna(subset=['county'])
syn_deepdive_final = syn_deepdive_cleaned_imputed.dropna(subset=['county'])
tem_deepdive_final = tem_deepdive_cleaned_imputed.dropna(subset=['county'])

# Dropping 'state' column
rep_deepdive_final = rep_deepdive_final.drop(columns=['state', 'occurrence_no'])
syn_deepdive_final = syn_deepdive_final.drop(columns=['state', 'occurrence_no'])
tem_deepdive_final = tem_deepdive_final.drop(columns=['state', 'occurrence_no'])

# Creating a new 'Area' column filled with 1's
rep_deepdive_final['Area'] = 1
syn_deepdive_final['Area'] = 1
tem_deepdive_final['Area'] = 1

# Renaming columns
rep_deepdive_final.rename(columns={'genus': 'Taxon', 'county': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)
syn_deepdive_final.rename(columns={'genus': 'Taxon', 'county': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)
tem_deepdive_final.rename(columns={'genus': 'Taxon', 'county': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)


rep_deepdive_final.isna().sum(), syn_deepdive_final.isna().sum(), tem_deepdive_final.isna().sum()

(Taxon       0
 Locality    0
 MaxAge      0
 MinAge      0
 Area        0
 dtype: int64,
 Taxon       0
 Locality    0
 MaxAge      0
 MinAge      0
 Area        0
 dtype: int64,
 Taxon       0
 Locality    0
 MaxAge      0
 MinAge      0
 Area        0
 dtype: int64)

In [373]:
print(rep_deepdive_final.head()), print(syn_deepdive_final.head()), print(tem_deepdive_final.head())

           Taxon    Locality   MaxAge  MinAge  Area
0    Icarosaurus      Hudson  227.000   208.5     1
1       Rutiodon      Hudson  227.000   208.5     1
2  Kuehneosuchus    Somerset  208.500   201.4     1
3  Kuehneosaurus    Somerset  208.500   201.4     1
4    Procolophon  Chris Hani  251.902   247.2     1
            Taxon     Locality  MaxAge  MinAge  Area
0    Edaphosaurus         Ohio   303.7   298.9     1
1      Ophiacodon       Archer   290.1   283.5     1
2  Therioherpeton  Santa Maria   237.0   208.5     1
3      Ophiacodon     San Juan   298.9   290.1     1
4   Adelobasileus       Crosby   227.0   208.5     1
          Taxon    Locality  MaxAge  MinAge  Area
0    Anconastes  Rio Arriba   305.9  298.90     1
1   Broiliellus      Baylor   283.5  273.01     1
2   Broiliellus        Clay   298.9  286.10     1
3        Eryops      Archer   290.1  283.50     1
4  Aspidosaurus      Archer   290.1  283.50     1


(None, None, None)

In [353]:
rep_deepdive_final.info(), syn_deepdive_final.info(), tem_deepdive_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4213 entries, 0 to 4410
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Taxon     4213 non-null   object 
 1   Locality  4213 non-null   object 
 2   MaxAge    4213 non-null   float64
 3   MinAge    4213 non-null   float64
 4   Area      4213 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 197.5+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3242 entries, 0 to 3421
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Taxon     3242 non-null   object 
 1   Locality  3242 non-null   object 
 2   MaxAge    3242 non-null   float64
 3   MinAge    3242 non-null   float64
 4   Area      3242 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 152.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2194 entries, 0 to 2246
Data columns (total 5 columns):
 #   Col

(None, None, None)

In [354]:
rep_deepdive_final.describe(), syn_deepdive_final.describe(), tem_deepdive_final.describe()

(            MaxAge       MinAge    Area
 count  4213.000000  4213.000000  4213.0
 mean    241.141247   231.816973     1.0
 std      23.491122    25.601778     0.0
 min     201.400000   199.500000     1.0
 25%     227.000000   208.500000     1.0
 50%     239.700000   237.000000     1.0
 75%     259.510000   251.902000     1.0
 max     298.900000   293.520000     1.0,
             MaxAge       MinAge    Area
 count  3242.000000  3242.000000  3242.0
 mean    258.653055   251.159779     1.0
 std      18.348969    19.251846     0.0
 min     208.500000   201.400000     1.0
 25%     251.900000   247.200000     1.0
 50%     259.510000   254.140000     1.0
 75%     264.280000   259.510000     1.0
 max     309.800000   307.400000     1.0,
             MaxAge       MinAge    Area
 count  2194.000000  2194.000000  2194.0
 mean    270.526160   263.144923     1.0
 std      26.869761    28.234802     0.0
 min     208.500000   201.400000     1.0
 25%     251.200000   247.200000     1.0
 50%     266.9

In [367]:
def descriptions(df):
    print("Length of df " + str(len(df)))
    print("Unique taxa count " + str(len(df['Taxon'].value_counts())))
    print("Unique localities count " + str(len(df['Locality'].value_counts())))

In [368]:
descriptions(rep_deepdive_final)

Length of df 4213
Unique taxa count 676
Unique localities count 479


In [369]:
descriptions(syn_deepdive_final)

Length of df 3242
Unique taxa count 506
Unique localities count 321


In [370]:
descriptions(tem_deepdive_final)

Length of df 2194
Unique taxa count 257
Unique localities count 309


In [374]:
# Saving final deepdive input datasets
rep_deepdive_final.to_csv('reptilia_processed_data/reptilia_deepdive_final.csv', index=False)
syn_deepdive_final.to_csv('synapsida_processed_data/synapsida_deepdive_final.csv', index=False)
tem_deepdive_final.to_csv('temnospondyli_processed_data/temnospondyli_deepdive_final.csv', index=False)