In [15]:
import pandas as pd
import numpy as np
import os

# Where the generated CSV files are written, and the tab-separated file that is read.
OUT_DIR = "output"
SOURCE = "sampledata/ebd_US-AL-101_202204_202204_relApr-2022.txt"

# Column names in their final form, i.e. after the renaming step that swaps
# the spaces in the raw headers for underscores.
COL_TAX_CATEGORY = "CATEGORY"
COL_COMMON_NAME, COL_SCI_NAME = "COMMON_NAME", "SCIENTIFIC_NAME"
COL_LAT, COL_LONG = "LATITUDE", "LONGITUDE"
COL_DATE = "OBSERVATION_DATE"

# Columns whose values are extracted into their own lookup table and replaced by IDs.
NORMALIZE_COLUMNS = [COL_COMMON_NAME, COL_SCI_NAME]
# Columns kept in the final dataframe; every other column is dropped.
FINAL_CARE_COLUMNS = [*NORMALIZE_COLUMNS, COL_LAT, COL_LONG, COL_DATE]

def normalize_form(df, column_names):
    """Replace the values of the given columns with integer IDs.

    For every column in ``column_names`` a lookup table is built that gives
    each distinct value an ID (1, 2, 3, ... in the order ``Series.unique``
    returns the values), and that column in a copy of ``df`` is rewritten to
    hold the IDs instead of the literal values.

    Args:
        df: Source dataframe. It is left unmodified.
        column_names: Iterable of column names in ``df`` to normalize.

    Returns:
        A ``(normalized_df, value_dfs)`` tuple. ``normalized_df`` is the copy
        of ``df`` with IDs in the normalized columns. ``value_dfs`` is a dict
        mapping each column name to its lookup dataframe, which has an
        ``'id'`` column and a column named after the source column.
    """
    value_dfs = {}
    normalized_df = df.copy()

    for column_name in column_names:
        print(f"Extracing values for column {column_name}")

        # Build a table for the extracted values, giving each unique value an ID number
        values = df[column_name].unique()
        ids = np.arange(1, len(values) + 1)
        value_dfs[column_name] = pd.DataFrame({'id': ids, column_name: values})

        # Translate the whole column in one lookup pass, reading from the
        # untouched input column. Replacing one value at a time in place would
        # re-replace a value that happens to equal an ID assigned earlier
        # (e.g. a numeric column [2, 1] would end up as [2, 2]).
        id_by_value = pd.Series(ids, index=values)
        normalized_df[column_name] = df[column_name].map(id_by_value)
    return normalized_df, value_dfs

# Read the tab-separated source data
raw_df = pd.read_csv(SOURCE, sep='\t')

# Keep only the rows whose CATEGORY column is 'species' (just for simplification)
df = raw_df.loc[raw_df[COL_TAX_CATEGORY] == 'species']

# Rename the columns we use in a single pass so their names contain no spaces
df = df.rename(columns={
    name.replace('_', ' '): name
    for name in (COL_COMMON_NAME, COL_SCI_NAME, COL_DATE)
})

# Remove extraneous columns
df = df.loc[:, FINAL_CARE_COLUMNS]

# Get the normalized dataframe and the per-column lookup tables
ndf, vdfs = normalize_form(df, NORMALIZE_COLUMNS)

# Save all the dataframes as CSV.
# exist_ok=True creates the directory only when needed, without a separate
# exists() check that could race with another process creating it.
os.makedirs(OUT_DIR, exist_ok=True)

out_path = os.path.join(OUT_DIR, 'normalized.csv')
print(f"Saving main data: {out_path}")
ndf.to_csv(out_path)
for c in NORMALIZE_COLUMNS:
    out_path = os.path.join(OUT_DIR, f'{c}.csv')
    print(f"Saving {c} data: {out_path}")
    vdfs[c].to_csv(out_path)


Extracing values for column COMMON_NAME
Extracing values for column SCIENTIFIC_NAME
Saving main data: output\normalized.csv
Saving COMMON_NAME data: output\COMMON_NAME.csv
Saving SCIENTIFIC_NAME data: output\SCIENTIFIC_NAME.csv
