In [71]:
import pandas as pd
import numpy as np

# Location of the sample data file loaded by this script
SOURCE = 'sampledata/ebd_US-AL-101_202204_202204_relApr-2022.txt'

# Headers of the dataset columns referenced below
COL_TAX_CATEGORY = "CATEGORY"
COL_COMMON_NAME = "COMMON NAME"
COL_SCI_NAME = "SCIENTIFIC NAME"
COL_LAT = "LATITUDE"
COL_LONG = "LONGITUDE"
COL_DATE = "OBSERVATION DATE"

# Columns retained once the extraneous ones are dropped
FINAL_CARE_COLUMNS = [
    COL_COMMON_NAME,
    COL_SCI_NAME,
    COL_LAT,
    COL_LONG,
    COL_DATE,
]

def normalize_form(df: pd.DataFrame, column_name: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Replace the values of one column with integer IDs and build the lookup table.

    Each distinct value found in ``df[column_name]`` is assigned an ID,
    starting at 1, in the order returned by ``Series.unique()``.

    Args:
        df: Source dataframe. It is not modified; a copy is returned.
        column_name: Name of the column whose values are replaced by IDs.

    Returns:
        A ``(result, value_df)`` tuple. ``result`` is a copy of ``df`` in which
        ``column_name`` holds IDs instead of the literal values. ``value_df``
        has an ``'id'`` column and a ``column_name`` column pairing each ID
        with the value it stands for.
    """
    # Build a table for the extracted values, giving each unique value an ID number
    values = df[column_name].unique()
    ids = np.arange(1, len(values) + 1)
    value_df = pd.DataFrame({'id': ids, column_name: values})

    # Translate every cell in a single pass. Replacing one value at a time
    # would re-translate cells that were already converted whenever an
    # original value equals a previously assigned ID (e.g. an integer
    # column), and it would scan the column once per distinct value.
    id_by_value = dict(zip(values, ids))
    result = df.copy()
    result[column_name] = result[column_name].map(id_by_value)
    return result, value_df

# Load the tab-separated source file
raw_df = pd.read_csv(SOURCE, sep='\t')

# Keep only the rows categorised as 'species' (a simplification) and, in the
# same selection, only the columns of interest
df = raw_df.loc[raw_df[COL_TAX_CATEGORY] == 'species', FINAL_CARE_COLUMNS]

# Swap the common names for numeric IDs and show both resulting tables
ndf, vdf = normalize_form(df, COL_COMMON_NAME)
ndf, vdf


(      COMMON NAME        SCIENTIFIC NAME   LATITUDE  LONGITUDE  \
 0               1  Corvus brachyrhynchos  32.176196 -86.352121   
 1               1  Corvus brachyrhynchos  32.358285 -86.454432   
 2               1  Corvus brachyrhynchos  32.310409 -86.101011   
 3               1  Corvus brachyrhynchos  32.105727 -86.024669   
 4               1  Corvus brachyrhynchos  32.345618 -86.032910   
 ...           ...                    ...        ...        ...   
 1394          106     Setophaga coronata  32.309383 -86.181432   
 1395          106     Setophaga coronata  32.369819 -86.180017   
 1396          107       Vireo flavifrons  32.335154 -86.037366   
 1397          107       Vireo flavifrons  32.358285 -86.454432   
 1398          108     Setophaga dominica  32.054564 -86.219386   
 
      OBSERVATION DATE  
 0          2022-04-29  
 1          2022-04-26  
 2          2022-04-23  
 3          2022-04-28  
 4          2022-04-17  
 ...               ...  
 1394       2022-04