# Example populate the data

## Load libs

In [1]:
import sqlite3
import pandas as pd
from pathlib import Path

## Path

In [2]:
DB_PATH  = "penguin_dataset.db"
CSV_PATH = "data/penguins_lter.csv" 

## Load data

In [3]:
df = pd.read_csv(CSV_PATH)

In [4]:
# Normalize column names (lowercase, underscores)
df.columns = [c.strip().lower().replace(" ", "_").replace("(", "").replace(")", "") for c in df.columns]

## Helper functions to populate the DB

In [5]:
# Helper: first available column from a list of possible names
def pick(df, *names, default=None):
    for n in names:
        if n in df.columns:
            return df[n]
    return pd.Series([default]*len(df))

In [6]:
# Build a normalized view with the fields we need
norm = pd.DataFrame({
    "species_name":        pick(df, "species", "species_name"),
    "island":              pick(df, "island"),
    "region":              pick(df, "region", default="Palmer Archipelago"),
    "study_number":        pick(df, "year", "studyname", "study_number", default="NA"),
    "individual_id":       pick(df, "individual_id", "individualid"),
    # numeric measures (support both bill_ and culmen_ names)
    "culmen_len":          pd.to_numeric(pick(df, "bill_length_mm", "culmen_length_mm", "culmen_len"), errors="coerce"),
    "culmen_depth":        pd.to_numeric(pick(df, "bill_depth_mm", "culmen_depth_mm", "culmen_depth"), errors="coerce"),
    "flipper_len":         pd.to_numeric(pick(df, "flipper_length_mm", "flipper_len"), errors="coerce"),
    "body_mass_g":         pd.to_numeric(pick(df, "body_mass_g", "body_mass"), errors="coerce"),
    "sex":                 pick(df, "sex"),
    "sample_number":       pick(df, "sample_number", "smaple_number"),
    "comments":            pick(df, "comments", default=None),
    # if egg/stage exists:
    "egg":                 pick(df, "egg", "clutch_completion", default=None),
})

In [7]:
# --- 2) Open DB ---
conn = sqlite3.connect(DB_PATH)
conn.execute("PRAGMA foreign_keys = ON;")

<sqlite3.Cursor at 0x14e7ff15e40>

In [8]:
# Small util helpers
def one(conn, q, params=()):
    cur = conn.execute(q, params); row = cur.fetchone(); cur.close()
    return row[0] if row else None

def upsert_study(conn, study_number, region, island):
    sid = one(conn, "SELECT id FROM study WHERE study_number=? AND region=? AND island=?;",
              (str(study_number), str(region), str(island)))
    if sid:
        return sid
    conn.execute("INSERT INTO study (study_number, region, island) VALUES (?,?,?);",
                 (str(study_number), str(region), str(island)))
    return one(conn, "SELECT id FROM study WHERE study_number=? AND region=? AND island=?;",
               (str(study_number), str(region), str(island)))

def upsert_species(conn, species_name, study_number):
    spid = one(conn, "SELECT id FROM species WHERE species_name=? AND study_number=?;",
               (str(species_name), int(study_number)))
    if spid:
        return spid
    conn.execute("INSERT INTO species (species_name, study_id) VALUES (?,?);",
                 (str(species_name), int(study_number)))
    return one(conn, "SELECT id FROM species WHERE species_name=? AND study_number=?;",
               (str(species_name), int(study_number)))

def insert_penguin(conn, row, species_id):
    # Adjust the column names in the INSERT to match your generated table exactly.
    # If your generator used different names (e.g., "smaple_number"), change below accordingly.
    conn.execute("""
        INSERT INTO penguin
        (species_id, individual_id, sex, culmen_len, culmen_depth, flipper_len, body_mass_g, comments, egg, sample_number)
        VALUES (?,?,?,?,?,?,?,?,?,?);
    """, (
        int(species_id),
        (None if pd.isna(row["individual_id"]) else str(row["individual_id"])),
        (None if pd.isna(row["sex"])          else str(row["sex"])),
        (None if pd.isna(row["culmen_len"])   else float(row["culmen_len"])),
        (None if pd.isna(row["culmen_depth"]) else float(row["culmen_depth"])),
        (None if pd.isna(row["flipper_len"])  else float(row["flipper_len"])),
        (None if pd.isna(row["body_mass_g"])  else float(row["body_mass_g"])),
        (None if pd.isna(row["comments"])     else str(row["comments"])),
        (None if pd.isna(row["egg"])          else str(row["egg"])),
        (None if pd.isna(row["sample_number"])else str(row["sample_number"])),
    ))


In [None]:
norm.columns

Index(['species_name', 'island', 'region', 'study_number', 'individual_id',
       'culmen_len', 'culmen_depth', 'flipper_len', 'body_mass_g', 'sex',
       'sample_number', 'comments', 'egg'],
      dtype='object')

: 

In [9]:
inserted = 0
with conn:
    for _, r in norm.iterrows():
        sid = upsert_study(conn, r["study_number"], r["region"], r["island"])
        spid = upsert_species(conn, r["species_name"], sid)
        insert_penguin(conn, r, spid)
        inserted += 1

print(f"Inserted penguins: {inserted}")
# quick sanity query
total_penguins = one(conn, "SELECT COUNT(*) FROM penguin;")
print("Total penguins in DB now:", total_penguins)
conn.close()

OperationalError: no such column: study_number