This EDA nb is using occurrence tables e-mailed from Tiago on 9/27 (duplicated, with rotated lat and long columns)

## Imports + Loading In Data

In [283]:
import pandas as pd


In [284]:
# %pip install openpyxl

In [285]:
# Loading in data w/ rotated coordinates
rep = pd.read_excel('reptilia_occ_genus.xlsm', engine='openpyxl')
syn = pd.read_excel('synapsida_occ_genus.xlsm', engine='openpyxl')
tem = pd.read_excel('temnospondyli_occ_genus.xlsm', engine='openpyxl')

In [286]:
rep['county'].isna().sum()

1495

In [287]:
rep['county'].value_counts()

Apache                    234
Comanche                  152
Rio Arriba                 88
Franklin                   78
Gloucestershire            67
                         ... 
Malmyzh                     1
Anson                       1
Sa√É¬¥ne-et-Loire           1
Huesca                      1
Aliwal North Commonage      1
Name: county, Length: 355, dtype: int64

## Saving Nulls as CSV's

In [288]:
# Seeing how many nulls are in each column will help us decide which columns to drop
nulls_rep = rep.isna().sum()
nulls_syn = syn.isna().sum()
nulls_tem = tem.isna().sum()

# Concatenating the nulls into a single dataframe based on a shared index
# This will allow us to compare the nulls across the three datasets
nulls = pd.concat([nulls_rep, nulls_syn, nulls_tem], axis=1)
nulls.columns = ['Reptilia', 'Synapsida', 'Temnospondyli']

nulls.to_csv('nulls.csv')

In [289]:
# len of nulls df is diff from len of columns in rep
# which means there are columns that are not shared
len(nulls.index), len(rep.columns)

(119, 94)

In [290]:
# Finding which columns rep, syn, and tem do not share
# Crude code, just to see if there are any differences, not finding all differences
# Because any not-shared columns wouldn't be included in the analysis anyway

if rep.columns.all() == syn.columns.all() == tem.columns.all():
    print('All columns are the same')
else:
    diff = rep.columns.difference(syn.columns).difference(tem.columns)
    print(diff)

Index(['abund_in_sediment', 'artifacts', 'component_comments', 'concentration',
       'fossilsfrom1', 'fossilsfrom2', 'lagerstatten', 'lithadj2', 'localbed',
       'localsection', 'minor_lithology2', 'orientation', 'reference_no.x',
       'temporal_resolution'],
      dtype='object')


## Confirming that these are Datasets w/ duplicated rows (eda_duplicates)

In [291]:
# Loading in datasets that I had previously duplicated (i.e., the output of eda_duplicates.ipynb)
rep_old = pd.read_csv('occurrence_tables_1/reptilia_dup.csv')
syn_old = pd.read_csv('occurrence_tables_1/synapsida_dup.csv')
tem_old = pd.read_csv('occurrence_tables_1/temnospondyli_dup.csv')

In [292]:
# Checking that the lengths of the original and new datasets are the same
len(rep_old), len(rep), len(syn_old), len(syn), len(tem_old), len(tem)

(4411, 4411, 3422, 3422, 2247, 2247)

In [293]:
# One last double-check to make sure that the datasets are the same
# All 'abund_values' for "individuals" should be 1
individuals = rep[rep['abund_unit'] == 'individuals']
individuals['abund_value'].value_counts()

1.0    2032
Name: abund_value, dtype: int64

## Filtering to BDNN columns

In [294]:
rep.columns

Index(['id', 'Rotated Lat', 'Rotated Lon', 'occurrence_no', 'pres_mode',
       'preservation_quality', 'common_body_parts', 'abund_value',
       'abund_unit', 'class', 'family', 'genus', 'taxon_environment',
       'life_habit', 'diet', 'early_interval', 'late_interval', 'time_bins',
       'museum', 'collection_no', 'age_max', 'age_min', 'age_median',
       'age_uncer_range', 'lng', 'lat', 'environment', 'occurrence_comments',
       'cc', 'state', 'county', 'formation', 'stratgroup', 'member', 'zone',
       'accepted_rank', 'ref_author', 'ref_pubyr', 'reference_no.x',
       'collection_name', 'collection_subset', 'collection_aka',
       'latlng_basis', 'latlng_precision', 'geogscale', 'geogcomments',
       'paleomodel', 'geoplate', 'paleoage', 'paleolng', 'paleolat',
       'paleomodel2', 'geoplate2', 'paleoage2', 'paleolng2', 'paleolat2',
       'paleomodel3', 'geoplate3', 'paleoage3', 'paleolng3', 'paleolat3',
       'protected', 'stratscale', 'localsection', 'localbed', 'st

In [295]:
# Still waiting on 'niche' info from Arielli
syn_filtered = syn[['id', 'Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
syn_filtered.isna().sum()

id                     0
Rotated Lat            0
Rotated Lon            0
genus                  0
taxon_environment    310
age_max                0
age_min                0
dtype: int64

In [296]:
syn_filtered.shape, syn_filtered['id'].nunique()

((3422, 7), 3422)

In [297]:
syn_filtered.head()

Unnamed: 0,id,Rotated Lat,Rotated Lon,genus,taxon_environment,age_max,age_min
0,3422,-0.4818,-24.8936,Edaphosaurus,terrestrial,303.7,298.9
1,3421,-2.9808,-38.7628,Ophiacodon,terrestrial,290.1,283.5
2,3420,-49.4771,-10.0175,Therioherpeton,terrestrial,237.0,208.5
3,3419,5.8574,-44.6091,Ophiacodon,terrestrial,298.9,290.1
4,3418,-1.4701,-41.4649,Adelobasileus,terrestrial,227.0,208.5


In [298]:
# Still waiting on 'niche' info from Arielli
tem_filtered = tem[['id', 'Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
tem_filtered.isna().sum()

id                    0
Rotated Lat           0
Rotated Lon           0
genus                 0
taxon_environment    14
age_max               0
age_min               0
dtype: int64

In [299]:
# Still waiting on 'niche' info from Arielli
rep_filtered = rep[['id', 'Rotated Lat', 'Rotated Lon', 'genus', 'taxon_environment', 
                    'age_max', 'age_min']]
rep_filtered.isna().sum()

id                   0
Rotated Lat          0
Rotated Lon          0
genus                0
taxon_environment    0
age_max              0
age_min              0
dtype: int64

In [300]:
rep_filtered['taxon_environment'].head(20)

0     terrestrial
1     terrestrial
2     terrestrial
3     terrestrial
4     terrestrial
5     terrestrial
6     terrestrial
7     terrestrial
8     terrestrial
9     terrestrial
10    terrestrial
11    terrestrial
12    terrestrial
13    terrestrial
14    terrestrial
15    terrestrial
16    terrestrial
17    terrestrial
18    terrestrial
19    terrestrial
Name: taxon_environment, dtype: object

In [301]:
rep['environment'].head(20)

0     lacustrine - large
1     lacustrine - large
2           fissure fill
3           fissure fill
4         fluvial indet.
5     terrestrial indet.
6     terrestrial indet.
7           fissure fill
8           fissure fill
9           fissure fill
10          fissure fill
11          fissure fill
12          fissure fill
13          fissure fill
14          fissure fill
15          fissure fill
16          fissure fill
17          fissure fill
18          fissure fill
19          fissure fill
Name: environment, dtype: object

In [302]:
rep_filtered.to_csv('reptilia_processed_data/reptilia_bdnn.csv')
syn_filtered.to_csv('synapsida_processed_data/synapsida_bdnn.csv')
tem_filtered.to_csv('temnospondyli_processed_data/temnospondyli_bdnn.csv')


## Prepping DeepDive columns

In [303]:
rep_deepdive = rep[['id', 'genus', 'county', 'state', 'occurrence_no', 'age_max', 'age_min', 'cc']]
rep_deepdive.head()

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc
0,4411,Icarosaurus,Hudson,New Jersey,146084,227.0,208.5,US
1,4410,Rutiodon,Hudson,New Jersey,146085,227.0,208.5,US
2,4409,Kuehneosuchus,Somerset,England,146086,208.5,201.4,UK
3,4408,Kuehneosaurus,Somerset,England,146087,208.5,201.4,UK
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2,ZA


In [304]:
syn_deepdive = syn[['id', 'genus', 'county', 'state', 'occurrence_no','age_max', 'age_min', 'cc']]
tem_deepdive = tem[['id', 'genus', 'county', 'state', 'occurrence_no', 'age_max', 'age_min', 'cc']]

### 'County' and 'State' Cols EDA

#### Are 'County' Null Counts Reasonable?

In [305]:
rep_deepdive.isna().sum()

# 'county' has a lot of nulls, so I'm going to look back at the older datasets and see if that seems correct or
# if an error occurred when I was duplicating the rows

id                  0
genus               0
county           1495
state             212
occurrence_no       0
age_max             0
age_min             0
cc                  8
dtype: int64

In [306]:
# Making sure occurrence no has >1 values, since there are duplicate rows (duplicated in eda_duplicates.ipynb)
rep['occurrence_no'].value_counts()

1285964    300
629477      91
1586159     37
902070      33
486411      25
          ... 
830763       1
830777       1
830855       1
830859       1
N84          1
Name: occurrence_no, Length: 3145, dtype: int64

In [307]:
# Loading in my original duplication output
rep_original = pd.read_csv('occurrence_tables_1/reptilia_dup.csv')
rep_original_filtered = rep_original[['genus', 'county', 'state']]
rep_original_filtered.isna().sum()

genus        0
county    1495
state      212
dtype: int64

In [308]:
# Loading in the very first dataset (raw, no rotated lat and long)
rep_old = pd.read_csv('occurrence_tables_1/reptilia_genus_occ_all.csv', encoding = 'latin1')
rep_old_filtered = rep_old[['genus', 'county', 'state']]
rep_old_filtered.isna().sum()

genus       0
county    920
state     197
dtype: int64

#### Saving Just Duplicated Rows

In [309]:
# Saving off just the parts in the deepdive dataset that were duplicated occurrences (i.e., the rows that were duplicated in eda_duplicates.ipynb)
# Trying to see if there's a pattern in the 'county' nulls that I can fix
rep_deepdive_dups = rep_deepdive[rep_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
rep_deepdive_dups.to_csv('reptilia_processed_data/reptilia_deepdive_dups.csv')

syn_deepdive_dups = syn_deepdive[syn_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
syn_deepdive_dups.to_csv('synapsida_processed_data/synapsida_deepdive_dups.csv')

tem_deepdive_dups = tem_deepdive[tem_deepdive.duplicated(subset=['occurrence_no'], keep=False)]
tem_deepdive_dups.to_csv('temnospondyli_processed_data/temnospondyli_deepdive_dups.csv')

#### Fixing Nonsensical Strings in 'County' and 'State'

In [310]:
# Return rows in 'county' and 'state' columns that contain any non-letter characters
import re

# Function for outputting a CSV of unique values in a column that contain special characters
def find_special_chars(df, col, clade):
    df2 = df.copy()
    df2[col] = df2[col].fillna('') # Have to fill empty values with something to use regex
    non_letter = df2[df2[col].str.contains(r'[^a-zA-Z\s\-/\'\,\(\)]', na=False)] # Regex for non-letter characters, excluding hyphens and forward slashes
    unique = non_letter[col].unique()
    # Saving unique values to a CSV named after the clade and column
    unique_df = pd.DataFrame(unique)
    # unique_df.to_csv(f'{clade}_processed_data/{clade}_{col}_special_chars.csv') # Used this already during first run, so commenting out
    # ^ Commenting out so I don't overwrite the files I've already saved. This is just for the first run.
    return unique

In [311]:
rep_deepdive

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc
0,4411,Icarosaurus,Hudson,New Jersey,146084,227.000,208.5,US
1,4410,Rutiodon,Hudson,New Jersey,146085,227.000,208.5,US
2,4409,Kuehneosuchus,Somerset,England,146086,208.500,201.4,UK
3,4408,Kuehneosaurus,Somerset,England,146087,208.500,201.4,UK
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2,ZA
...,...,...,...,...,...,...,...,...
4406,5,Eohyosaurus,-,FSTATE,N80,247.200,242.0,ZA
4407,4,Mesosuchus,-,ECAPE,N81,247.200,242.0,ZA
4408,3,Mesosuchus,-,ECAPE,N82,247.200,242.0,ZA
4409,2,Mesosuchus,-,ECAPE,N83,247.200,242.0,ZA


In [312]:
# Reptilia
find_special_chars(rep_deepdive, 'county', 'reptilia'), find_special_chars(rep_deepdive, 'state', 'reptilia')

(array(['Valle Fert√É¬≠l', 'Valle F√É¬©rtil', 'Vend√É¬©e',
        'Wei√É¬üenburg-Gunzenhausen', 'S√É¬£o Gabriel', 'L√É¬∂rrach',
        'Portel√É¬¢ndia', 'Perol√É¬¢ndia', 'S√É¬£o Mateus do Sul',
        'Tatu√É¬≠', 'Rebou√É¬ßas', 'Candel√É¬°ria', 'N√É¬ºrnberger Land',
        'Schw√É¬§bisch Hall', 'S√É¬£o Pedro do Sul', 'G√É¬∂ttingen',
        'Sa√É¬¥ne-et-Loire', 'Baden-W√É¬ºrttemberg', 'Th√É¬ºringen',
        'T√É¬ºbingen', 'S√É¬¢one-et-Loire', 'H√É¬©rault',
        'S√É¬£o Jo√É¬£o do Pol√É¬™sine', 'Yuan√¢¬Ä¬ôan',
        'Tarnowskie G√É¬≥ry', 'Arroio do S¬õ', 'Linha S?o Luiz',
        'Toroqu√ø', 'Ribeir?o', 'Rinc?o dos Weiss', 'Acegu√ø',
        'Sesmaria do Pinhal 1',
        'cff Rafael prov√øvel material do MNRJ se sao os mesmo do livro tombo, Alemoa',
        'Cerrito?', 'Sesmaria do Pinhal 2', 'V√ørzea do Agudo',
        '?gua Negra', 'BR 158 federal road', 'Porto Mariante 2',
        'Sao Jose?', 'Sao Jose? (embaixo do cemit?rio coletado por Price)'],
       dtype=object),
 

In [313]:
# Temnospondyli
find_special_chars(tem_deepdive, 'county', 'temnospondyli'), find_special_chars(tem_deepdive, 'state', 'temnospondyli')

(array(['Wei√É¬üeritzkreis', 'Schw√É¬§bisch Hall', 'T√É¬ºbingen',
        'Sa√É¬¥ne-et-Loire', 'S√É¬£o Jo√É¬£o do Pol√É¬™sine',
        'S√É¬£o Gabriel', 'Valle F√É¬©rtil', 'S?o Jer¬ìnimo da Serra',
        'Toroqu¬†'], dtype=object),
 array(['Baden-W√É¬ºrttemberg', 'Sk√É¬•ne', 'Th√É¬ºringen',
        'Azad Jammu & Kashmir', 'Rakovn√É¬≠k', 'Th√É¬ºringer Wald',
        'Th√É¬ºringia', 'Paran√É¬°', 'Baden-W√É¬ºrtteberg'], dtype=object))

In [314]:
# Synapsida
find_special_chars(syn_deepdive, 'county', 'synapsida'), find_special_chars(syn_deepdive, 'state', 'synapsida')

(array(['Valle Fert√É¬≠l', 'Sa√É¬¥ne-et-Loire', 'Wei√É¬üeritzkreis',
        'Schw√É¬§bisch Hall', 'S√É¬£o Gabriel', 'Candel√É¬°ria',
        'Kotel√¢¬Ä¬ônichskii', 'T√É¬ºbingen', '√É¬úr√É¬ºmqi', 'H√É¬©rault',
        'Valle F√É¬©rtil', 'Vila Est¬Éncia Nova', 'Linha S?o Luiz',
        'Linha S?o Luiz ', 'Sesmaria do Pinhal 1', 'V¬†rzea do Agudo',
        'Botucara¬°', 'Rinc?o do Pinhal', 'Linha V¬†rzea 2',
        'Linha V¬†rzea 1', 'Rinc?o do Semi?o ', 'Porto Mariante 2',
        'Rinc?o da Porta', 'Catu¬áaba', 'BR 158 federal road',
        'Linha Fac?o', 'Chiniqu¬†'], dtype=object),
 array(['Franch-Comt√É¬©', 'R√É¬≠o Negro', 'Baden-W√É¬ºrttemberg',
        'Rakovn√É¬≠k', 'Paran√É¬°', 'H√É¬©rault'], dtype=object))

In [315]:
# Special Characters Dictionary for Reptilia
# To map special characters to their correct values

rep_state_dict = {
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Nieder√É¬∂sterreich": "Niederosterreich",
    "T√É¬°chira": "Tachira",
    "Sa√É¬Øda": "Saida",
    "Sk√É¬•ne": "Skane",
    "Goi√É¬°s": "Goias",
    "Paran√É¬°": "Parana",
    "S√É¬£o Paulo": "Sao Paulo",
    "Cear√É¬°": "Ceara",
    "Guair√É¬°": "Guaira",
    "Bourgogne-Franche-Comt√É¬©": "Bourgogne-Franche-Comte",
    "Baden-W√É¬ºrttenburg": "Baden-Wurttemberg",
    "Nieder√É¬∂stereich": "Niederosterreich",
    "Bayern (Bavaria)": "Bayern",
    "Baden-W√É¬ºrtteberg": "Baden-Wurttemberg",
    "Baden-W√É¬ºrttemburg": "Baden-Wurttemberg",
    "Bansk√É¬° Bystrica": "Banska Bystrica",
    "Graub√É¬ºnden": "Graubunden"
}

rep_county_dict = {
    "Valle Fert√É¬≠l": "Valle Fertil",
    "Valle F√É¬©rtil": "Valle Fertil",
    "Vend√É¬©e": "Vendee",
    "Wei√É¬üenburg-Gunzenhausen": "Weissenburg-Gunzenhausen",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "L√É¬∂rrach": "Lorrach",
    "Portel√É¬¢ndia": "Portelandia",
    "Perol√É¬¢ndia": "Perolandia",
    "S√É¬£o Mateus do Sul": "Sao Mateus do Sul",
    "Tatu√É¬≠": "Tatui",
    "Rebou√É¬ßas": "Reboucas",
    "Candel√É¬°ria": "Candelaria",
    "N√É¬ºrnberger Land": "Nurnberger Land",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "S√É¬£o Pedro do Sul": "Sao Pedro do Sul",
    "G√É¬∂ttingen": "Gottingen",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Th√É¬ºringen": "Thuringen",
    "T√É¬ºbingen": "Tubingen",
    "S√É¬¢one-et-Loire": "Saone-et-Loire",
    "H√É¬©rault": "Herault",
    "S√É¬£o Jo√É¬£o do Pol√É¬™sine": "Sao Joao do Polesine",
    "Yuan√¢¬Ä¬ôan": "Yuan'an",
    "Tarnowskie G√É¬≥ry": "Tarnowskie Gory",
    "Arroio do S¬õ": "Arroio do So",
    "Linha S?o Luiz": "Linha Sao Luiz",
    "Toroqu√ø": "Toroqua",
    "Ribeir?o": "Ribeirao",
    "Rinc?o dos Weiss": "Rincao dos Weiss",
    "Acegu√ø": "Acegua",
    "Sesmaria do Pinhal 1": "Sesmaria do Pinhal 1",
    "cff Rafael prov√øvel material do MNRJ se sao os mesmo do livro tombo, Alemoa": "cff Rafael provavel material do MNRJ se sao os mesmo do livro tombo, Alemoa",
    "Cerrito?": "Cerrito",
    "Sesmaria do Pinhal 2": "Sesmaria do Pinhal 2",
    "V√ørzea do Agudo": "Varzea do Agudo",
    "?gua Negra": "Agua Negra",
    "BR 158 federal road": "BR 158 federal road",
    "Porto Mariante 2": "Porto Mariante 2",
    "Sao Jose?": "Sao Jose",
    "Sao Jose? (embaixo do cemit?rio coletado por Price)": "Sao Jose"
}




In [316]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
rep_deepdive_cleaned = rep_deepdive.copy()
rep_deepdive_cleaned['state'] = rep_deepdive_cleaned['state'].replace(rep_state_dict)
rep_deepdive_cleaned['county'] = rep_deepdive_cleaned['county'].replace(rep_county_dict)

# Checking to see if the cleaning worked
print(find_special_chars(rep_deepdive_cleaned, 'county', 'reptilia'))
print(find_special_chars(rep_deepdive_cleaned, 'state', 'reptilia'))

['Sesmaria do Pinhal 1' 'Sesmaria do Pinhal 2' 'BR 158 federal road'
 'Porto Mariante 2']
[]


In [317]:
# Special Characters Dictionary for Synapsida

syn_state_dict = {
    "Franch-Comt√É¬©": "Franche-Comte",
    "R√É¬≠o Negro": "Rio Negro",
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Rakovn√É¬≠k": "Rakovnik",
    "Paran√É¬°": "Parana",
    "H√É¬©rault": "Herault"
}
syn_county_dict = {
    "Valle Fert√É¬≠l": "Valle Fertil",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "Wei√É¬üeritzkreis": "Weisseritzkreis",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "Candel√É¬°ria": "Candelaria",
    "Kotel√¢¬Ä¬ônichskii": "Kotelnichskii",
    "T√É¬ºbingen": "Tubingen",
    "√É¬úr√É¬ºmqi": "Urumqi",
    "H√É¬©rault": "Herault",
    "Valle F√É¬©rtil": "Valle Fertil",
    "Vila Est¬Éncia Nova": "Vila Estancia Nova",
    "Linha S?o Luiz": "Linha Sao Luiz",
    "Linha S?o Luiz ": "Linha Sao Luiz",
    "V¬†rzea do Agudo": "Varzea do Agudo",
    "Botucara¬°": "Botucara",
    "Rinc?o do Pinhal": "Rincao do Pinhal",
    "Linha V¬†rzea 2": "Linha Varzea 2",
    "Linha V¬†rzea 1": "Linha Varzea 1",
    "Rinc?o do Semi?o ": "Rincao do Semiao",
    "Rinc?o da Porta": "Rincao da Porta",
    "Catu¬áaba": "Catuaba",
    "Linha Fac?o": "Linha Facao",
    "Chiniqu¬†": "Chinique"
}

In [318]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
syn_deepdive_cleaned = syn_deepdive.copy()
syn_deepdive_cleaned['state'] = syn_deepdive_cleaned['state'].replace(syn_state_dict)
syn_deepdive_cleaned['county'] = syn_deepdive_cleaned['county'].replace(syn_county_dict)

# Checking to see if the cleaning worked
print(find_special_chars(syn_deepdive_cleaned, 'county', 'synapsida'))
print(find_special_chars(syn_deepdive_cleaned, 'state', 'synapsida'))

['Sesmaria do Pinhal 1' 'Linha Varzea 2' 'Linha Varzea 1'
 'Porto Mariante 2' 'BR 158 federal road']
[]


In [319]:
# Special Characters Dictionary for Temnospondyli

tem_state_dict = {
    "Baden-W√É¬ºrttemberg": "Baden-Wurttemberg",
    "Sk√É¬•ne": "Skane",
    "Th√É¬ºringen": "Thuringen",
    "Rakovn√É¬≠k": "Rakovnik",
    "Th√É¬ºringer Wald": "Thuringer Wald",
    "Th√É¬ºringia": "Thuringia",
    "Paran√É¬°": "Parana",
    "Baden-W√É¬ºrtteberg": "Baden-Wurttemberg"
}

tem_county_dict = euro_southam_location_dict = {
    "Wei√É¬üeritzkreis": "Weisseritzkreis",
    "Schw√É¬§bisch Hall": "Schwabisch Hall",
    "T√É¬ºbingen": "Tubingen",
    "Sa√É¬¥ne-et-Loire": "Saone-et-Loire",
    "S√É¬£o Jo√É¬£o do Pol√É¬™sine": "Sao Joao do Polesine",
    "S√É¬£o Gabriel": "Sao Gabriel",
    "Valle F√É¬©rtil": "Valle Fertil",
    "S?o Jer¬ìnimo da Serra": "Sao Jeronimo da Serra",
    "Toroqu¬†": "Toroqua"
}

In [320]:
# Cleaning special characters in 'state' and 'county' columns for Reptilia
tem_deepdive_cleaned = tem_deepdive.copy()
tem_deepdive_cleaned['state'] = tem_deepdive_cleaned['state'].replace(tem_state_dict)
tem_deepdive_cleaned['county'] = tem_deepdive_cleaned['county'].replace(tem_county_dict)

# Checking to see if the cleaning worked
print(find_special_chars(tem_deepdive_cleaned, 'county', 'temnospondyli'))
print(find_special_chars(tem_deepdive_cleaned, 'state', 'temnospondyli'))

[]
['Azad Jammu & Kashmir']


In [321]:
# Note that some data will be lost, since some special character entries map to the same corrected value
rep_deepdive['county'].nunique(), rep_deepdive_cleaned['county'].nunique(), rep_deepdive['state'].nunique(), rep_deepdive_cleaned['state'].nunique()

(355, 349, 209, 202)

In [322]:
syn_deepdive['county'].nunique(), syn_deepdive_cleaned['county'].nunique(), syn_deepdive['state'].nunique(), syn_deepdive_cleaned['state'].nunique()

(255, 252, 118, 118)

In [323]:
tem_deepdive['county'].nunique(), tem_deepdive_cleaned['county'].nunique(), tem_deepdive['state'].nunique(), tem_deepdive_cleaned['state'].nunique()

(222, 222, 142, 139)

In [324]:
# Checking that overall shape of the datasets hasn't changed
len(rep_deepdive), len(rep_deepdive_cleaned), len(syn_deepdive), len(syn_deepdive_cleaned), len(tem_deepdive), len(tem_deepdive_cleaned)

(4411, 4411, 3422, 3422, 2247, 2247)

#### Imputing 'County' Nulls w/ 'State

In [325]:
# First saving off a copy of the original 'county' column before we impute nulls
rep_deepdive_cleaned['county_original'] = rep_deepdive_cleaned['county']   
syn_deepdive_cleaned['county_original'] = syn_deepdive_cleaned['county']
tem_deepdive_cleaned['county_original'] = tem_deepdive_cleaned['county']

In [326]:
rep_deepdive_cleaned.head()

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc,county_original
0,4411,Icarosaurus,Hudson,New Jersey,146084,227.0,208.5,US,Hudson
1,4410,Rutiodon,Hudson,New Jersey,146085,227.0,208.5,US,Hudson
2,4409,Kuehneosuchus,Somerset,England,146086,208.5,201.4,UK,Somerset
3,4408,Kuehneosaurus,Somerset,England,146087,208.5,201.4,UK,Somerset
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2,ZA,Chris Hani


In [327]:
def impute_county_nulls(df):
    df_imputed = df.copy()
    df_imputed.loc[df_imputed['county'] == '-', 'county'] = df_imputed['state']
    df_imputed.loc[df_imputed['county'].isna(), 'county'] = df_imputed['state']
    print('County hyphens count:', len(df_imputed.loc[df_imputed['county'] == '-', 'county']))
    print('County nulls count:', len(df_imputed.loc[df_imputed['county'].isna(), 'county']))
    print (df_imputed.loc[df_imputed['county'].isna()])
    return df_imputed

In [328]:
rep_deepdive_cleaned_imputed = impute_county_nulls(rep_deepdive_cleaned)
rep_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 198
        id            genus county state occurrence_no  age_max  age_min  cc  \
56    4355   Neusticosaurus    NaN   NaN        150315    247.2   237.00  CH   
57    4354       Mixosaurus    NaN   NaN        150316    247.2   237.00  CH   
58    4353    Rhipaeosaurus    NaN   NaN        219963    266.9   264.28  RU   
94    4317    Proganochelys    NaN   NaN        283825    216.7   213.20  GL   
95    4315       Aetosaurus    NaN   NaN        283826    227.0   208.50  GL   
...    ...              ...    ...   ...           ...      ...      ...  ..   
4178   233      Macrocnemus    NaN   NaN       1600125    243.8   239.70  CH   
4179   232      Lariosaurus    NaN   NaN       1600126    243.8   239.70  CH   
4180   231    Askeptosaurus    NaN   NaN       1600127    243.8   239.70  CH   
4181   230  Helveticosaurus    NaN   NaN       1600128    243.8   239.70  CH   
4235   176   Prosantosaurus    NaN   NaN       1648022    242.0   239.70

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc,county_original
0,4411,Icarosaurus,Hudson,New Jersey,146084,227.000,208.5,US,Hudson
1,4410,Rutiodon,Hudson,New Jersey,146085,227.000,208.5,US,Hudson
2,4409,Kuehneosuchus,Somerset,England,146086,208.500,201.4,UK,Somerset
3,4408,Kuehneosaurus,Somerset,England,146087,208.500,201.4,UK,Somerset
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2,ZA,Chris Hani
...,...,...,...,...,...,...,...,...,...
4406,5,Eohyosaurus,FSTATE,FSTATE,N80,247.200,242.0,ZA,-
4407,4,Mesosuchus,ECAPE,ECAPE,N81,247.200,242.0,ZA,-
4408,3,Mesosuchus,ECAPE,ECAPE,N82,247.200,242.0,ZA,-
4409,2,Mesosuchus,ECAPE,ECAPE,N83,247.200,242.0,ZA,-


In [329]:
syn_deepdive_cleaned_imputed = impute_county_nulls(syn_deepdive_cleaned)
syn_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 180
        id            genus county state occurrence_no  age_max  age_min  cc  \
22    3400        Diictodon    NaN   NaN        229268  264.280  254.140  ZA   
31    3391   Kuehneotherium    NaN   NaN        283943  208.500  201.400  GL   
32    3390  Brachyzostrodon    NaN   NaN        283944  208.500  201.400  GL   
33    3389     Lystrosaurus    NaN   NaN        286245  251.902  247.200  AA   
34    3388     Lystrosaurus    NaN   NaN        286246  251.902  247.200  AA   
...    ...              ...    ...   ...           ...      ...      ...  ..   
3062   360     Lystrosaurus    NaN   NaN       1587697  251.902  247.200  AA   
3072   350      Notictoides    NaN   NaN       1607015  251.902  247.200  AA   
3073   349        Dicynodon    NaN   NaN       1607103  298.900  251.902  ZM   
3074   348        Dicynodon    NaN   NaN       1607105  298.900  251.902  MZ   
3101   321    Aelurognathus    NaN   NaN       1651601  259.510  254.140

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc,county_original
0,3422,Edaphosaurus,Ohio,West Virginia,147591,303.7,298.9,US,Ohio
1,3421,Ophiacodon,Archer,Texas,148250,290.1,283.5,US,Archer
2,3420,Therioherpeton,Santa Maria,Rio Grande do Sul,149619,237.0,208.5,BR,Santa Maria
3,3419,Ophiacodon,San Juan,Utah,149745,298.9,290.1,US,San Juan
4,3418,Adelobasileus,Crosby,Texas,149837,227.0,208.5,US,Crosby
...,...,...,...,...,...,...,...,...,...
3417,5,Santacruzodon,Schoenstatt,RS,N506,237.0,227.0,BR,Schoenstatt
3418,4,Santacruzodon,Schoenstatt,RS,N507,237.0,227.0,BR,Schoenstatt
3419,3,Dinodontosaurus,Pinheiro,RS,N508,242.0,237.0,BR,Pinheiro
3420,2,Dinodontosaurus,Porto Mariante 2,RS,N509,242.0,237.0,BR,Porto Mariante 2


In [330]:
tem_deepdive_cleaned_imputed = impute_county_nulls(tem_deepdive_cleaned)
tem_deepdive_cleaned_imputed

County hyphens count: 0
County nulls count: 53
        id            genus county state occurrence_no  age_max  age_min  cc  \
13    2234   Platyoposaurus    NaN   NaN        219959  266.900   264.28  RU   
14    2233   Platyoposaurus    NaN   NaN        219960  266.900   264.28  RU   
15    2232       Melosaurus    NaN   NaN        219961  266.900   264.28  RU   
16    2231      Dvinosaurus    NaN   NaN        219962  266.900   264.28  RU   
768   1479   Mastodonsaurus    NaN   NaN        929804  242.000   237.00  DE   
844   1403     Parotosuchus    NaN   NaN       1081097  247.200   242.00  AA   
845   1402        Kryostega    NaN   NaN       1081098  247.200   242.00  AA   
967   1280      Aphaneramma    NaN   NaN       1131219  251.200   248.90  SJ   
968   1279      Aphaneramma    NaN   NaN       1131223  251.200   248.90  SJ   
970   1277  Lyrocephaliscus    NaN   NaN       1131869  251.200   248.90  SJ   
971   1276  Lyrocephaliscus    NaN   NaN       1131895  251.200   248.90 

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc,county_original
0,2247,Anconastes,Rio Arriba,New Mexico,146665,305.9,298.90,US,Rio Arriba
1,2246,Broiliellus,Baylor,Texas,147563,283.5,273.01,US,Baylor
2,2245,Broiliellus,Clay,Texas,147564,298.9,286.10,US,Clay
3,2244,Eryops,Archer,Texas,148243,290.1,283.50,US,Archer
4,2243,Aspidosaurus,Archer,Texas,148244,290.1,283.50,US,Archer
...,...,...,...,...,...,...,...,...,...
2242,5,Rastosuchus,Morro do Mulato,PR,N5,266.9,264.30,BR,Morro do Mulato
2243,4,Rastosuchus,Morro do Mulato,PR,N6,266.9,264.30,BR,Morro do Mulato
2244,3,Rastosuchus,Morro do Mulato,PR,N7,266.9,264.30,BR,Morro do Mulato
2245,2,Compsocerops,Buriol,RS,N8,227.0,208.50,BR,Buriol


#### Saving a CSV for Locality Imputation Double Check

In [331]:
# Saving a copy of the dataframe for checking that no data was lost from null imputation
rep_deepdive_cleaned_imputed.head()

Unnamed: 0,id,genus,county,state,occurrence_no,age_max,age_min,cc,county_original
0,4411,Icarosaurus,Hudson,New Jersey,146084,227.0,208.5,US,Hudson
1,4410,Rutiodon,Hudson,New Jersey,146085,227.0,208.5,US,Hudson
2,4409,Kuehneosuchus,Somerset,England,146086,208.5,201.4,UK,Somerset
3,4408,Kuehneosaurus,Somerset,England,146087,208.5,201.4,UK,Somerset
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,251.902,247.2,ZA,Chris Hani


In [332]:
rep_test = rep_deepdive_cleaned_imputed.drop(columns=['age_max', 'age_min'])
rep_test.rename(columns={'county':"Locality"}, inplace=True)

syn_test = syn_deepdive_cleaned_imputed.drop(columns=['age_max', 'age_min'])
syn_test.rename(columns={'county':"Locality"}, inplace=True)

tem_test = tem_deepdive_cleaned_imputed.drop(columns=['age_max', 'age_min'])
tem_test.rename(columns={'county':"Locality"}, inplace=True)

rep_test.head()

Unnamed: 0,id,genus,Locality,state,occurrence_no,cc,county_original
0,4411,Icarosaurus,Hudson,New Jersey,146084,US,Hudson
1,4410,Rutiodon,Hudson,New Jersey,146085,US,Hudson
2,4409,Kuehneosuchus,Somerset,England,146086,UK,Somerset
3,4408,Kuehneosaurus,Somerset,England,146087,UK,Somerset
4,4407,Procolophon,Chris Hani,Eastern Cape,147521,ZA,Chris Hani


In [333]:
# Reordering the columns
cols_ordered = ['id', 'occurrence_no', 'genus', 'Locality', 'county_original', 'state', 'cc']
rep_test = rep_test[cols_ordered]
syn_test = syn_test[cols_ordered]
tem_test = tem_test[cols_ordered]
rep_test

Unnamed: 0,id,occurrence_no,genus,Locality,county_original,state,cc
0,4411,146084,Icarosaurus,Hudson,Hudson,New Jersey,US
1,4410,146085,Rutiodon,Hudson,Hudson,New Jersey,US
2,4409,146086,Kuehneosuchus,Somerset,Somerset,England,UK
3,4408,146087,Kuehneosaurus,Somerset,Somerset,England,UK
4,4407,147521,Procolophon,Chris Hani,Chris Hani,Eastern Cape,ZA
...,...,...,...,...,...,...,...
4406,5,N80,Eohyosaurus,FSTATE,-,FSTATE,ZA
4407,4,N81,Mesosuchus,ECAPE,-,ECAPE,ZA
4408,3,N82,Mesosuchus,ECAPE,-,ECAPE,ZA
4409,2,N83,Mesosuchus,ECAPE,-,ECAPE,ZA


In [334]:
rep_deepdive_cleaned_imputed.isna().sum()

id                    0
genus                 0
county              198
state               212
occurrence_no         0
age_max               0
age_min               0
cc                    8
county_original    1495
dtype: int64

In [335]:
rep_deepdive_cleaned_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4411 entries, 0 to 4410
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               4411 non-null   int64  
 1   genus            4411 non-null   object 
 2   county           4213 non-null   object 
 3   state            4199 non-null   object 
 4   occurrence_no    4411 non-null   object 
 5   age_max          4411 non-null   float64
 6   age_min          4411 non-null   float64
 7   cc               4403 non-null   object 
 8   county_original  2916 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 310.3+ KB


In [336]:
rep_test.isna().sum(), syn_test.isna().sum(), tem_test.isna().sum()

(id                    0
 occurrence_no         0
 genus                 0
 Locality            198
 county_original    1495
 state               212
 cc                    8
 dtype: int64,
 id                    0
 occurrence_no         0
 genus                 0
 Locality            180
 county_original    1115
 state               180
 cc                   27
 dtype: int64,
 id                   0
 occurrence_no        0
 genus                0
 Locality            53
 county_original    510
 state               55
 cc                   0
 dtype: int64)

In [337]:
# These CSVs have the following: nulls in 'Localtiy' are NOT dropped, 
# 'Locality' is the imputed 'county' column
# 'county_original' is the original 'county' column before imputation 
# Special characters have been replaced from 'Locality', 'state', and 'county_original' columns
rep_test.to_csv('reptilia_processed_data/reptilia_locality_check.csv')
syn_test.to_csv('synapsida_processed_data/synapsida_locality_check.csv')
tem_test.to_csv('temnospondyli_processed_data/temnospondyli_locality_check.csv')

### Locality Changes from Tiago
Sent the above *_locality_check.csv's to him, he sent them back with some changes made by hand to the 'locality' column, and some notes on further changes I should make. 
I'll load in his edits below, then make the changes he'd like

#### Merging genus and ages cols

In [338]:
# I need the 'genus', 'age_max', 'age_min' columns back for deepdive, so I'm going to add them back in
# I'm also going to add back in the 'occurrence_no' column, so that I can check to see if the merge worked correctly
rep_selected_cols = rep_deepdive_cleaned_imputed[['id', 'genus', 'age_max', 'age_min', 'occurrence_no']] 
syn_selected_cols = syn_deepdive_cleaned_imputed[['id', 'genus', 'age_max', 'age_min', 'occurrence_no']]
tem_selected_cols = tem_deepdive_cleaned_imputed[['id', 'genus', 'age_max', 'age_min', 'occurrence_no']]

##### Rep

In [339]:
rep_tiago = pd.read_csv('reptilia_locality_check_TRS.csv', index_col=0)
rep_tiago.head(), rep_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4411 entries, 0 to 4410
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   occurrence_no    4411 non-null   object
 1   locality         4268 non-null   object
 2   county_original  2918 non-null   object
 3   state            4213 non-null   object
 4   cc               4410 non-null   object
 5   notes            149 non-null    object
dtypes: object(6)
memory usage: 241.2+ KB


(  occurrence_no      locality county_original         state  cc notes
 0        146084    New Jersey          Hudson    New Jersey  US   NaN
 1        146085    New Jersey          Hudson    New Jersey  US   NaN
 2        146086       England        Somerset       England  UK   NaN
 3        146087       England        Somerset       England  UK   NaN
 4        147521  Eastern Cape      Chris Hani  Eastern Cape  ZA   NaN,
 None)

In [340]:
rep_merged = pd.merge(rep_tiago, rep_selected_cols, left_index=True, right_index=True, how='outer')
rep_merged.info(), rep_selected_cols.info(), rep_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4411 entries, 0 to 4410
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   occurrence_no_x  4411 non-null   object 
 1   locality         4268 non-null   object 
 2   county_original  2918 non-null   object 
 3   state            4213 non-null   object 
 4   cc               4410 non-null   object 
 5   notes            149 non-null    object 
 6   id               4411 non-null   int64  
 7   genus            4411 non-null   object 
 8   age_max          4411 non-null   float64
 9   age_min          4411 non-null   float64
 10  occurrence_no_y  4411 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 542.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4411 entries, 0 to 4410
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4411 non-null   i

(None, None, None)

In [341]:
# Check that occurrence no's from the original and the returned locality check from Tiago are the same
rep_merged[rep_merged['occurrence_no_x'] != rep_merged['occurrence_no_y']]


Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
0,146084,New Jersey,Hudson,New Jersey,US,,4411,Icarosaurus,227.000,208.5,146084
1,146085,New Jersey,Hudson,New Jersey,US,,4410,Rutiodon,227.000,208.5,146085
2,146086,England,Somerset,England,UK,,4409,Kuehneosuchus,208.500,201.4,146086
3,146087,England,Somerset,England,UK,,4408,Kuehneosaurus,208.500,201.4,146087
4,147521,Eastern Cape,Chris Hani,Eastern Cape,ZA,,4407,Procolophon,251.902,247.2,147521
...,...,...,...,...,...,...,...,...,...,...,...
4240,1658055,Yunnan,Luxy,Yunnan,CN,,171,Mixosaurus,247.200,242.0,1658055
4241,1658965,Nevada,Pershing,Nevada,US,,170,Benggwigwishingasuchus,247.200,242.0,1658965
4242,1659629,New Mexico,San Miguel,New Mexico,US,,169,Unguinychus,227.000,208.5,1659629
4243,1659690,Baden-Wurttemberg,Calw,Baden-Wurttemberg,DE,,168,Marcianosuchus,247.200,244.7,1659690


In [342]:
syn_tiago = pd.read_csv('synapsida_locality_check_TRS.csv', index_col=0, encoding='latin1')
syn_tiago.head(), syn_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3422 entries, 0 to 3421
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   occurrence_no    3422 non-null   object
 1   locality         3354 non-null   object
 2   county_original  2307 non-null   object
 3   state            3242 non-null   object
 4   cc               3396 non-null   object
 5   notes            51 non-null     object
dtypes: object(6)
memory usage: 187.1+ KB


(  occurrence_no           locality county_original              state  cc  \
 0        147591      West Virginia            Ohio      West Virginia  US   
 1        148250              Texas          Archer              Texas  US   
 2        149619  Rio Grande do Sul     Santa Maria  Rio Grande do Sul  BR   
 3        149745               Utah        San Juan               Utah  US   
 4        149837              Texas          Crosby              Texas  US   
 
   notes  
 0   NaN  
 1   NaN  
 2   NaN  
 3   NaN  
 4   NaN  ,
 None)

##### Syn

In [343]:
syn_merged = pd.merge(syn_tiago, syn_selected_cols, left_index=True, right_index=True, how='outer')
syn_merged.info(), syn_selected_cols.info(), syn_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3422 entries, 0 to 3421
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   occurrence_no_x  3422 non-null   object 
 1   locality         3354 non-null   object 
 2   county_original  2307 non-null   object 
 3   state            3242 non-null   object 
 4   cc               3396 non-null   object 
 5   notes            51 non-null     object 
 6   id               3422 non-null   int64  
 7   genus            3422 non-null   object 
 8   age_max          3422 non-null   float64
 9   age_min          3422 non-null   float64
 10  occurrence_no_y  3422 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 449.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3422 entries, 0 to 3421
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             3422 non-null   i

(None, None, None)

In [344]:
syn_merged[syn_merged['occurrence_no_x'] != syn_merged['occurrence_no_y']]

# Odd issue here where the 'occurrence_no' columns are not the same, even though they are

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
0,147591,West Virginia,Ohio,West Virginia,US,,3422,Edaphosaurus,303.70,298.900,147591
1,148250,Texas,Archer,Texas,US,,3421,Ophiacodon,290.10,283.500,148250
2,149619,Rio Grande do Sul,Santa Maria,Rio Grande do Sul,BR,,3420,Therioherpeton,237.00,208.500,149619
3,149745,Utah,San Juan,Utah,US,,3419,Ophiacodon,298.90,290.100,149745
4,149837,Texas,Crosby,Texas,US,,3418,Adelobasileus,227.00,208.500,149837
...,...,...,...,...,...,...,...,...,...,...,...
3107,1651624,Zambia,Gwembe,Southern,ZM,,315,Endothiodon,259.51,251.902,1651624
3108,1651627,Zambia,Gwembe,Southern,ZM,,314,Endothiodon,259.51,251.902,1651627
3109,1651628,Zambia,Gwembe,Southern,ZM,,313,Endothiodon,259.51,251.902,1651628
3110,1651629,Zambia,,Muchinga,ZM,,312,Aelurognathus,259.51,251.902,1651629


In [345]:
print(syn_merged['occurrence_no_x'].dtype)
print(syn_merged['occurrence_no_y'].dtype)

object
object


In [346]:
exact_mismatch = syn_merged[syn_merged['occurrence_no_x'].astype(str) != syn_merged['occurrence_no_y'].astype(str)]
print(exact_mismatch[['occurrence_no_x', 'occurrence_no_y']])

Empty DataFrame
Columns: [occurrence_no_x, occurrence_no_y]
Index: []


In [347]:
syn_merged['occurrence_no_x'][0], syn_merged['occurrence_no_y'][0]

('147591', 147591)

In [348]:
syn_merged['occurrence_no_y'] = syn_merged['occurrence_no_y'].astype(str)
syn_merged[syn_merged['occurrence_no_x'] != syn_merged['occurrence_no_y']]

# Fixed 


Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y


##### Tem

In [349]:
tem_tiago = pd.read_csv('temnospondyli_locality_check_TRS.csv', index_col=0)
tem_tiago.head(), tem_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2247 entries, 0 to 2246
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   occurrence_no    2247 non-null   object
 1   locality         2203 non-null   object
 2   county_original  1737 non-null   object
 3   state            2194 non-null   object
 4   cc               2247 non-null   object
 5   notes            44 non-null     object
dtypes: object(6)
memory usage: 122.9+ KB


(  occurrence_no    locality county_original       state  cc notes
 0        146665  New Mexico      Rio Arriba  New Mexico  US   NaN
 1        147563       Texas          Baylor       Texas  US   NaN
 2        147564       Texas            Clay       Texas  US   NaN
 3        148243       Texas          Archer       Texas  US   NaN
 4        148244       Texas          Archer       Texas  US   NaN,
 None)

In [350]:
tem_merged = pd.merge(tem_tiago, tem_selected_cols, left_index=True, right_index=True, how='outer')
tem_merged.info(), tem_selected_cols.info(), tem_tiago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2247 entries, 0 to 2246
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   occurrence_no_x  2247 non-null   object 
 1   locality         2203 non-null   object 
 2   county_original  1737 non-null   object 
 3   state            2194 non-null   object 
 4   cc               2247 non-null   object 
 5   notes            44 non-null     object 
 6   id               2247 non-null   int64  
 7   genus            2247 non-null   object 
 8   age_max          2247 non-null   float64
 9   age_min          2247 non-null   float64
 10  occurrence_no_y  2247 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 275.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2247 non-null   i

(None, None, None)

In [351]:
tem_merged[tem_merged['occurrence_no_x'] != tem_merged['occurrence_no_y']]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
0,146665,New Mexico,Rio Arriba,New Mexico,US,,2247,Anconastes,305.90,298.90,146665
1,147563,Texas,Baylor,Texas,US,,2246,Broiliellus,283.50,273.01,147563
2,147564,Texas,Clay,Texas,US,,2245,Broiliellus,298.90,286.10,147564
3,148243,Texas,Archer,Texas,US,,2244,Eryops,290.10,283.50,148243
4,148244,Texas,Archer,Texas,US,,2243,Aspidosaurus,290.10,283.50,148244
...,...,...,...,...,...,...,...,...,...,...,...
2228,1643411,Czechia,,Liberec,CZ,,19,Melanerpeton,298.90,293.52,1643411
2229,1646160,Volgograd,,Volgograd,RU,,18,Dvinosaurus,264.28,259.51,1646160
2230,1646173,Volgograd,,Volgograd,RU,,17,Dvinosaurus,264.28,259.51,1646173
2231,1646274,Volgograd,,Volgograd,RU,,16,Dvinosaurus,259.51,254.14,1646274


In [352]:
tem_merged['occurrence_no_x'][0], tem_merged['occurrence_no_y'][0]

('146665', 146665)

In [353]:
tem_merged['occurrence_no_y'] = tem_merged['occurrence_no_y'].astype(str)
tem_merged[tem_merged['occurrence_no_x'] != tem_merged['occurrence_no_y']]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y


#### Making 'notes' changes

In [354]:
rep_merged[rep_merged['notes'].notna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
58,219963,,,,RU,delete,4353,Rhipaeosaurus,266.900,264.28,219963
94,283825,,,,GL,delete,4317,Proganochelys,216.700,213.20,283825
95,283826,,,,GL,delete,4315,Aetosaurus,227.000,208.50,283826
96,283826,,,,GL,delete,4316,Aetosaurus,227.000,208.50,283826
97,283828,,,,GL,delete,4314,Plateosaurus,227.000,208.50,283828
...,...,...,...,...,...,...,...,...,...,...,...
4027,1529048,,,,SJ,what is SJ,384,Omphalosaurus,248.900,247.20,1529048
4070,1549937,,,,IN,delete,341,Kranosaura,227.000,208.50,1549937
4081,1561501,,,,IT,delete,330,Raibliania,237.000,227.00,1561501
4092,1574347,,,,PL,delete,319,Cladeiodon,247.200,242.00,1574347


In [355]:
syn_merged[syn_merged['notes'].notna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
31,283943,,,,GL,delete,3391,Kuehneotherium,208.5,201.4,283943
32,283944,,,,GL,delete,3390,Brachyzostrodon,208.5,201.4,283944
33,286245,,,,AA,delete,3389,Lystrosaurus,251.902,247.2,286245
34,286246,,,,AA,delete,3388,Lystrosaurus,251.902,247.2,286246
640,807186,,,,AA,delete,2782,Thrinaxodon,251.902,247.2,807186
641,807229,,,,AA,delete,2781,Thrinaxodon,251.902,247.2,807229
801,896180,,,,AA,delete,2621,Kombuisia,251.902,247.2,896180
802,896181,,,,AA,delete,2620,Kombuisia,251.902,247.2,896181
812,896297,,,,,delete,2610,Kannemeyeria,247.2,242.0,896297
813,896299,,,,,delete,2609,Dolichuranus,247.2,242.0,896299


In [356]:
tem_merged[tem_merged['notes'].notna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
13,219959,,,,RU,delete,2234,Platyoposaurus,266.9,264.28,219959
14,219960,,,,RU,delete,2233,Platyoposaurus,266.9,264.28,219960
15,219961,,,,RU,delete,2232,Melosaurus,266.9,264.28,219961
16,219962,,,,RU,delete,2231,Dvinosaurus,266.9,264.28,219962
768,929804,,,,DE,delete,1479,Mastodonsaurus,242.0,237.0,929804
844,1081097,,,,AA,delete,1403,Parotosuchus,247.2,242.0,1081097
845,1081098,,,,AA,delete,1402,Kryostega,247.2,242.0,1081098
967,1131219,,,,SJ,what is SJ?,1280,Aphaneramma,251.2,248.9,1131219
968,1131223,,,,SJ,what is SJ?,1279,Aphaneramma,251.2,248.9,1131223
970,1131869,,,,SJ,what is SJ?,1277,Lyrocephaliscus,251.2,248.9,1131869


In [357]:
# Common cleaning needed for all 3

# Dropping rows where 'notes' == 'delete'
rep_cleaned = rep_merged[rep_merged['notes'] != 'delete']
tem_cleaned = tem_merged[tem_merged['notes'] != 'delete']
syn_cleaned = syn_merged[syn_merged['notes'] != 'delete']

In [358]:
# Location-specific cleaning
rep_cleaned.loc[rep_cleaned['cc'] == 'SJ', 'locality'] = 'Svalbard'
rep_cleaned.loc[rep_cleaned['cc'] == 'SI', 'locality'] = 'Slovenia'
rep_cleaned.loc[rep_cleaned['cc'] == 'BG', 'locality'] = 'Bulgaria'
tem_cleaned.loc[tem_cleaned['cc'] == 'SJ', 'locality'] = 'Svalbard'
tem_cleaned.loc[tem_cleaned['cc'] == 'KZ', 'locality'] = 'Kazakhstan'
syn_cleaned.loc[syn_cleaned['cc'] == 'LA', 'locality'] = 'Laos'

rep_cleaned[rep_cleaned['notes'].notna()], tem_cleaned[tem_cleaned['notes'].notna()], syn_cleaned[syn_cleaned['notes'].notna()]

(     occurrence_no_x  locality county_original state  cc        notes    id  \
 691           621595  Slovenia             NaN   NaN  SI   what is SI  3720   
 2835         1094847  Bulgaria             NaN   NaN  BG  find out BG  1576   
 3474         1310774  Svalbard             NaN   NaN  SJ   what is SJ   937   
 3875         1419116  Svalbard             NaN   NaN  SJ   what is SJ   536   
 3907         1419483  Svalbard             NaN   NaN  SJ   what is SJ   504   
 3923         1420109  Svalbard             NaN   NaN  SJ   what is SJ   488   
 3925         1420150  Svalbard             NaN   NaN  SJ   what is SJ   486   
 3926         1420151  Svalbard             NaN   NaN  SJ   what is SJ   485   
 3927         1420184  Svalbard             NaN   NaN  SJ   what is SJ   484   
 4027         1529048  Svalbard             NaN   NaN  SJ   what is SJ   384   
 
                    genus  age_max  age_min occurrence_no_y  
 691            Mehliella    237.0    227.0          621

In [359]:
rep_cleaned.info(), tem_cleaned.info(), syn_cleaned.info()  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4272 entries, 0 to 4410
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   occurrence_no_x  4272 non-null   object 
 1   locality         4272 non-null   object 
 2   county_original  2918 non-null   object 
 3   state            4209 non-null   object 
 4   cc               4272 non-null   object 
 5   notes            10 non-null     object 
 6   id               4272 non-null   int64  
 7   genus            4272 non-null   object 
 8   age_max          4272 non-null   float64
 9   age_min          4272 non-null   float64
 10  occurrence_no_y  4272 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 400.5+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2229 entries, 0 to 2246
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   occurrence_no_x  2229 non-n

(None, None, None)

In [360]:
syn_cleaned[syn_cleaned['locality'].isna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
22,229268,,,,ZA,,3400,Diictodon,264.28,254.14,229268
2556,1382956,,,,ZA,,866,Cistecephalus,298.9,251.902,1382956
2557,1382957,,,,ZA,,865,Dicynodon,298.9,251.902,1382957
2558,1382958,,,,ZA,,864,Dicynodontoides,298.9,251.902,1382958
2561,1384159,,,,ZA,,861,Cistecephalus,259.51,254.14,1384159
2562,1384160,,,,ZA,,860,Dicynodon,259.51,254.14,1384160
2563,1384161,,,,ZA,,859,Cistecephalus,259.51,254.14,1384161
2569,1384904,,,,ZA,,853,Anomocephalus,273.01,259.51,1384904
2573,1385026,,,,ZA,,849,Diictodon,298.9,251.902,1385026
2574,1385141,,,,ZA,,848,Kannemeyeria,247.2,242.0,1385141


In [361]:
syn_tiago[syn_tiago['locality'].isna()]

Unnamed: 0,occurrence_no,locality,county_original,state,cc,notes
22,229268,,,,ZA,
31,283943,,,,GL,delete
32,283944,,,,GL,delete
33,286245,,,,AA,delete
34,286246,,,,AA,delete
...,...,...,...,...,...,...
2993,1442948,,,,LA,what is LA?
2994,1442949,,,,LA,what is LA?
3054,1567517,,,,ZA,
3062,1587697,,,,AA,delete


In [362]:
syn_cleaned[syn_cleaned['cc'] == 'ZA']

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min,occurrence_no_y
10,229267,Western Cape,Beaufort West,Western Cape,ZA,,3401,Diictodon,259.51,254.14,229267
11,229267,Western Cape,Beaufort West,Western Cape,ZA,,3402,Diictodon,259.51,254.14,229267
12,229267,Western Cape,Beaufort West,Western Cape,ZA,,3403,Diictodon,259.51,254.14,229267
13,229267,Western Cape,Beaufort West,Western Cape,ZA,,3404,Diictodon,259.51,254.14,229267
14,229267,Western Cape,Beaufort West,Western Cape,ZA,,3405,Diictodon,259.51,254.14,229267
...,...,...,...,...,...,...,...,...,...,...,...
3299,N388,Northern Cape,-,Northern Cape,ZA,,123,Endothiodon,259.50,254.10,N388
3300,N389,Free State,-,Free State,ZA,,122,Lystrosaurus,251.90,251.20,N389
3301,N390,Free State,-,Free State,ZA,,121,Lystrosaurus,251.90,251.20,N390
3302,N391,Eastern Cape,-,Eastern Cape,ZA,,120,Robertia,264.30,259.50,N391


In [363]:
# Saving off the cleaned data for Tiago to check again
rep_cleaned_filtered = rep_cleaned.drop(columns=['occurrence_no_y'])
syn_cleaned_filtered = syn_cleaned.drop(columns=['occurrence_no_y'])
tem_cleaned_filtered = tem_cleaned.drop(columns=['occurrence_no_y'])

rep_cleaned_filtered.to_csv('reptilia_processed_data/reptilia_locality_check_TRS_CLEANED.csv')
syn_cleaned_filtered.to_csv('synapsida_processed_data/synapsida_locality_check_TRS_CLEANED.csv')
tem_cleaned_filtered.to_csv('temnospondyli_processed_data/temnospondyli_locality_check_TRS_CLEANED.csv')

PermissionError: [Errno 13] Permission denied: 'reptilia_processed_data/reptilia_locality_check_TRS_CLEANED.csv'

In [366]:
rep_cleaned_filtered.tail(20)

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,id,genus,age_max,age_min
4391,N63,Rio Grande do Sul,Buriol,Rio Grande do Sul,BR,,20,Hyperodapedon,227.0,208.5
4392,N64,Rio Grande do Sul,Buriol,Rio Grande do Sul,BR,,19,Hyperodapedon,227.0,208.5
4393,N67,Rio Grande do Sul,Vila Kennedy,Rio Grande do Sul,BR,,18,Hyperodapedon,227.0,208.5
4394,N68,Rio Grande do Sul,-,Rio Grande do Sul,BR,,17,Prestosuchus,242.0,237.0
4395,N69,Rio Grande do Sul,Sao Jose,Rio Grande do Sul,BR,,16,Hyperodapedon,227.0,208.5
4396,N70,Rio Grande do Sul,Sao Jose,Rio Grande do Sul,BR,,15,Hyperodapedon,227.0,208.5
4397,N71,Rio Grande do Sul,Cerrito,Rio Grande do Sul,BR,,14,Hyperodapedon,227.0,208.5
4398,N72,Rio Grande do Sul,Vale do Sol,Rio Grande do Sul,BR,,13,Hyperodapedon,227.0,208.5
4399,N73,Rio Grande do Sul,-,Rio Grande do Sul,BR,,12,Hyperodapedon,227.0,208.5
4400,N74,Rio Grande do Sul,-,Rio Grande do Sul,BR,,11,Prestosuchus,242.0,237.0


In [None]:
import pandas as pd

# Assuming your dataframe is called 'df' and the column is named 'id'
# If not, replace 'df' and 'id' with your actual dataframe and column names

# Sort the dataframe by 'id' in descending order
df = df.sort_values('id', ascending=False).reset_index(drop=True)

# Create a function to find skipped numbers
def find_skipped_numbers(df):
    skipped = []
    expected = 4000  # Start with the highest expected value
    
    for index, row in df.iterrows():
        current = row['id']
        if current != expected:
            skipped.extend(range(expected, current, -1))
        expected = current - 1
    
    return skipped

# Find skipped numbers
skipped_numbers = find_skipped_numbers(df)

print("Skipped numbers:", skipped_numbers)
print("Total skipped numbers:", len(skipped_numbers))

# If you want to find the gaps (ranges of skipped numbers)
from itertools import groupby
from operator import itemgetter

def find_gaps(skipped):
    gaps = []
    for k, g in groupby(enumerate(skipped), lambda ix: ix[0] - ix[1]):
        group = list(map(itemgetter(1), g))
        gaps.append((group[0], group[-1]))
    return gaps

gaps = find_gaps(skipped_numbers)
print("\nGaps in the sequence:")
for start, end in gaps:
    if start == end:
        print(f"Single number gap: {start}")
    else:
        print(f"Gap from {start} to {end}")

### Locality Changes from Tiago Pt. 2

In [279]:
rep_cleaned_filtered.head()

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,genus,age_max,age_min
0,146084,New Jersey,Hudson,New Jersey,US,,Icarosaurus,227.0,208.5
1,146085,New Jersey,Hudson,New Jersey,US,,Rutiodon,227.0,208.5
2,146086,England,Somerset,England,UK,,Kuehneosuchus,208.5,201.4
3,146087,England,Somerset,England,UK,,Kuehneosaurus,208.5,201.4
4,147521,Eastern Cape,Chris Hani,Eastern Cape,ZA,,Procolophon,251.902,247.2


In [273]:
rep_tiago_2 = pd.read_csv('reptilia_locality_check_final.csv')
syn_tiago_2 = pd.read_csv('synapsida_locality_check_final.csv')
tem_tiago_2 = pd.read_csv('temnospondyli_locality_check_final.csv')

In [272]:
len(rep_cleaned_filtered) - len(rep_tiago_2)

6

In [274]:
len(syn_cleaned_filtered) - len(syn_tiago_2)

18

In [275]:
len(tem_cleaned_filtered) - len(tem_tiago_2)

0

In [277]:
rep_cleaned_filtered[rep_cleaned_filtered['locality'].isna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,genus,age_max,age_min


In [278]:
syn_cleaned_filtered[syn_cleaned_filtered['locality'].isna()]

Unnamed: 0,occurrence_no_x,locality,county_original,state,cc,notes,genus,age_max,age_min
22,229268,,,,ZA,,Diictodon,264.28,254.14
2556,1382956,,,,ZA,,Cistecephalus,298.9,251.902
2557,1382957,,,,ZA,,Dicynodon,298.9,251.902
2558,1382958,,,,ZA,,Dicynodontoides,298.9,251.902
2561,1384159,,,,ZA,,Cistecephalus,259.51,254.14
2562,1384160,,,,ZA,,Dicynodon,259.51,254.14
2563,1384161,,,,ZA,,Cistecephalus,259.51,254.14
2569,1384904,,,,ZA,,Anomocephalus,273.01,259.51
2573,1385026,,,,ZA,,Diictodon,298.9,251.902
2574,1385141,,,,ZA,,Kannemeyeria,247.2,242.0


# Final DeepDive Processing

#### Dropping Remaining 'County' Null Rows

In [166]:
rep_cleaned_filtered.isna().sum(), syn_cleaned_filtered.isna().sum(), tem_cleaned_filtered.isna().sum()

(occurrence_no_x       0
 locality              0
 county_original    1354
 state                63
 cc                    0
 notes              4262
 genus                 0
 age_max               0
 age_min               0
 dtype: int64,
 occurrence_no_x       0
 locality             18
 county_original    1066
 state               132
 cc                    0
 notes              3371
 genus                 0
 age_max               0
 age_min               0
 dtype: int64,
 occurrence_no_x       0
 locality              0
 county_original     492
 state                35
 cc                    0
 notes              2203
 genus                 0
 age_max               0
 age_min               0
 dtype: int64)

In [168]:
# Dropping 'state', 'occurrence_no', 'county_original', and 'cc' columns
rep_deepdive_final = rep_cleaned_filtered.drop(columns=['state', 'occurrence_no_x', 'county_original', 'cc', 'notes'])
syn_deepdive_final = syn_cleaned_filtered.drop(columns=['state', 'occurrence_no_x', 'county_original', 'cc', 'notes'])
tem_deepdive_final = tem_cleaned_filtered.drop(columns=['state', 'occurrence_no_x', 'county_original', 'cc', 'notes'])

# Creating a new 'Area' column filled with 1's
rep_deepdive_final['Area'] = 1
syn_deepdive_final['Area'] = 1
tem_deepdive_final['Area'] = 1

# Renaming columns
rep_deepdive_final.rename(columns={'genus': 'Taxon', 'locality': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)
syn_deepdive_final.rename(columns={'genus': 'Taxon', 'locality': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)
tem_deepdive_final.rename(columns={'genus': 'Taxon', 'locality': 'Locality', 'age_max': 'MaxAge', 'age_min':'MinAge'}, inplace=True)


rep_deepdive_final.isna().sum(), syn_deepdive_final.isna().sum(), tem_deepdive_final.isna().sum()

# Syn still has 18 locality nulls, waiting on Tiago's instructions regarding those

(Locality    0
 Taxon       0
 MaxAge      0
 MinAge      0
 Area        0
 dtype: int64,
 Locality    18
 Taxon        0
 MaxAge       0
 MinAge       0
 Area         0
 dtype: int64,
 Locality    0
 Taxon       0
 MaxAge      0
 MinAge      0
 Area        0
 dtype: int64)

In [169]:
print(rep_deepdive_final.head()), print(syn_deepdive_final.head()), print(tem_deepdive_final.head())

       Locality          Taxon   MaxAge  MinAge  Area
0    New Jersey    Icarosaurus  227.000   208.5     1
1    New Jersey       Rutiodon  227.000   208.5     1
2       England  Kuehneosuchus  208.500   201.4     1
3       England  Kuehneosaurus  208.500   201.4     1
4  Eastern Cape    Procolophon  251.902   247.2     1
            Locality           Taxon  MaxAge  MinAge  Area
0      West Virginia    Edaphosaurus   303.7   298.9     1
1              Texas      Ophiacodon   290.1   283.5     1
2  Rio Grande do Sul  Therioherpeton   237.0   208.5     1
3               Utah      Ophiacodon   298.9   290.1     1
4              Texas   Adelobasileus   227.0   208.5     1
     Locality         Taxon  MaxAge  MinAge  Area
0  New Mexico    Anconastes   305.9  298.90     1
1       Texas   Broiliellus   283.5  273.01     1
2       Texas   Broiliellus   298.9  286.10     1
3       Texas        Eryops   290.1  283.50     1
4       Texas  Aspidosaurus   290.1  283.50     1


(None, None, None)

In [209]:
rep_deepdive_final.info(), syn_deepdive_final.info(), tem_deepdive_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4272 entries, 0 to 4410
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Locality  4272 non-null   object 
 1   Taxon     4272 non-null   object 
 2   MaxAge    4272 non-null   float64
 3   MinAge    4272 non-null   float64
 4   Area      4272 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 200.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3373 entries, 0 to 3421
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Locality  3355 non-null   object 
 1   Taxon     3373 non-null   object 
 2   MaxAge    3373 non-null   float64
 3   MinAge    3373 non-null   float64
 4   Area      3373 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 158.1+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2229 entries, 0 to 2246
Data columns (total 5 columns):
 #   Col

(None, None, None)

In [171]:
rep_deepdive_final.describe(), syn_deepdive_final.describe(), tem_deepdive_final.describe()

(            MaxAge       MinAge    Area
 count  4272.000000  4272.000000  4272.0
 mean    241.207976   231.957375     1.0
 std      23.323699    25.440626     0.0
 min     201.400000   199.500000     1.0
 25%     227.000000   208.500000     1.0
 50%     239.700000   237.000000     1.0
 75%     254.140000   251.902000     1.0
 max     298.900000   293.520000     1.0,
             MaxAge       MinAge    Area
 count  3373.000000  3373.000000  3373.0
 mean    258.672438   251.142978     1.0
 std      18.218130    19.015578     0.0
 min     208.500000   201.400000     1.0
 25%     251.902000   247.200000     1.0
 50%     259.510000   254.140000     1.0
 75%     264.280000   259.510000     1.0
 max     309.800000   307.400000     1.0,
             MaxAge       MinAge    Area
 count  2229.000000  2229.000000  2229.0
 mean    270.385148   263.041449     1.0
 std      26.805859    28.104459     0.0
 min     208.500000   201.400000     1.0
 25%     251.200000   247.200000     1.0
 50%     264.2

In [176]:
def descriptions(df):
    print("Total number of occurrences: " + str(len(df)))
    print("Unique taxa count: " + str(len(df['Taxon'].value_counts())))
    print("Unique localities count: " + str(len(df['Locality'].value_counts())))

In [177]:
descriptions(rep_deepdive_final)

Total number of occurrences: 4272
Unique taxa count: 689
Unique localities count: 154


In [178]:
descriptions(syn_deepdive_final)

Total number of occurrences: 3373
Unique taxa count: 510
Unique localities count: 90


In [179]:
descriptions(tem_deepdive_final)

Total number of occurrences: 2229
Unique taxa count: 262
Unique localities count: 108


In [180]:
# Saving final deepdive input datasets
rep_deepdive_final.to_csv('reptilia_processed_data/reptilia_deepdive_final.csv', index=False)
syn_deepdive_final.to_csv('synapsida_processed_data/synapsida_deepdive_final.csv', index=False)
tem_deepdive_final.to_csv('temnospondyli_processed_data/temnospondyli_deepdive_final.csv', index=False)