# 1. Chicken farm schema

I created a simple schema in a Google Sheet [here](https://docs.google.com/spreadsheets/d/1Ir5d-4LADww_FJOt9VXVKb_TuFAyq2BsYR2F7R3FYl0/edit#gid=0).

# 2. Generating data

In [55]:
import pandas as pd
import random
import duckdb 
from faker import Faker 

# shared Faker instance, used further down to generate chicken names
fake = Faker()

# number of chickens/eggs to generate
n = 1000

def generate_ids(num_chickens):
    """Return ``num_chickens`` distinct random 8-digit integer IDs.

    Candidates are drawn one at a time with ``random.randint`` and collected
    in a set, so a repeated draw is simply absorbed and another one is made.
    The returned list follows the set's iteration order.
    """
    unique_ids = set()
    while True:
        if len(unique_ids) >= num_chickens:
            return list(unique_ids)
        unique_ids.add(random.randint(10000000, 99999999))

chickens = generate_ids(n)
chicken_df = pd.DataFrame({'egg_id': chickens})

In [56]:
# lineage is modelled with three 'generations': generation 3 is the most recent set of chickens,
# generation 2 are the parents of generation 3, and generation 1 are the parents of generation 2
# hens are assumed to have many offspring, hence a 10-30-60 ratio across gens 1 thru 3

num_gen1 = int(n * 0.1)
num_gen2 = int(n * 0.3)
# whatever is left after gens 1 and 2 goes to gen 3, so the counts always sum to n
num_gen3 = n - num_gen1 - num_gen2

generation_list = [
    generation
    for generation, size in ((1, num_gen1), (2, num_gen2), (3, num_gen3))
    for _ in range(size)
]
chicken_df['generation'] = generation_list

chicken_df

Unnamed: 0,egg_id,generation
0,34146305,1
1,14129157,1
2,80947206,1
3,13824010,1
4,90796045,1
...,...,...
995,72282102,3
996,28332023,3
997,20555769,3
998,87932922,3


In [57]:
# function to assign rooster or hen to each chicken
def assign_gender_with_split(chickens, gender_split, gender1, gender2):
    """Label the leading ``gender_split`` share of rows ``gender1``, the rest ``gender2``.

    Args:
        chickens: DataFrame to label; its ``gender`` column is written in place.
        gender_split: Fraction (0 to 1) of rows, counted from the top, that
            receive ``gender1``. The resulting row count is truncated toward
            zero, so ``0.5`` on an odd-length frame gives the extra row to
            ``gender2``.
        gender1: Label for the leading rows.
        gender2: Label for the remaining rows.

    Returns:
        The same ``chickens`` object, so the call can be used in an assignment.

    Raises:
        ValueError: If ``gender_split`` is outside ``[0, 1]``.
    """
    if not 0 <= gender_split <= 1:
        raise ValueError(f"gender_split must be between 0 and 1, got {gender_split!r}")

    total = len(chickens)
    # int() truncates, so a 0.5 split is exactly ``total // 2``.
    first_count = int(total * gender_split)
    chickens['gender'] = [gender1] * first_count + [gender2] * (total - first_count)
    return chickens

# split every generation 50-50 between roosters and hens, working on a copy of each slice
gen1_df, gen2_df, gen3_df = (
    assign_gender_with_split(
        chicken_df[chicken_df['generation'] == generation].copy(), 0.5, 'Rooster', 'Hen'
    )
    for generation in (1, 2, 3)
)

# stitch the generations back together with a fresh 0..n-1 index
chicken_df = pd.concat([gen1_df, gen2_df, gen3_df], ignore_index=True)
chicken_df

Unnamed: 0,egg_id,generation,gender
0,34146305,1,Rooster
1,14129157,1,Rooster
2,80947206,1,Rooster
3,13824010,1,Rooster
4,90796045,1,Rooster
...,...,...,...
995,72282102,3,Hen
996,28332023,3,Hen
997,20555769,3,Hen
998,87932922,3,Hen


In [58]:
# lookup table for the incubation hall: locations 1-30, of which 1-10 are near a window

incubation_hall_location_ids = list(range(1, 31))
is_near_window = [location_id <= 10 for location_id in incubation_hall_location_ids]

data = dict(
    incubation_hall_location_id=incubation_hall_location_ids,
    is_near_window=is_near_window,
)

incubation_hall = pd.DataFrame(data)


In [59]:
# technically this is making it possible for a mother hen to have multiple incubation locations, but i'm assuming that's fine
# every chicken outside generation 1 gets a randomly chosen hall location; generation 1 gets None
chicken_df['incubation_hall_location_id'] = [random.choice(incubation_hall_location_ids) if generation != 1 else None for generation in chicken_df['generation']]

# check to make sure that roughly 33% of chickens were incubated near a window
# (locations 1-10 are the ones flagged as near a window, so the row for
# location 10 should show a cumulative_percent close to 10/30)
duckdb.query("""
with counts as (
    select incubation_hall_location_id, count(*) as count from chicken_df where generation != 1 group by 1 order by 1
)

, windowed_sums as (
select
    *,
    sum(count) over (order by incubation_hall_location_id rows between unbounded preceding and current row) as cumulative_count,
    sum(count) over (order by incubation_hall_location_id rows between unbounded preceding and unbounded following) as total_count
from counts
)

select
    *,
    cumulative_count::float / total_count::float as cumulative_percent
from windowed_sums
 """).to_df()


Unnamed: 0,incubation_hall_location_id,count,cumulative_count,total_count,cumulative_percent
0,1.0,28,28.0,900.0,0.031111
1,2.0,33,61.0,900.0,0.067778
2,3.0,23,84.0,900.0,0.093333
3,4.0,28,112.0,900.0,0.124444
4,5.0,26,138.0,900.0,0.153333
5,6.0,25,163.0,900.0,0.181111
6,7.0,29,192.0,900.0,0.213333
7,8.0,50,242.0,900.0,0.268889
8,9.0,31,273.0,900.0,0.303333
9,10.0,27,300.0,900.0,0.333333


In [60]:
# validate that the gender split function works correctly: each generation should be half roosters, half hens
duckdb.query("select generation, gender, count(*) as count from chicken_df group by 1,2").to_df()

Unnamed: 0,generation,gender,count
0,1,Rooster,50
1,1,Hen,50
2,2,Rooster,150
3,2,Hen,150
4,3,Rooster,300
5,3,Hen,300


In [61]:
# faker supplies a full name per chicken; the first name is drawn from the
# male pool for roosters and from the female pool for everyone else
chicken_df['name'] = [
    f"{fake.first_name_male() if gender == 'Rooster' else fake.first_name_female()} {fake.last_name()}"
    for gender in chicken_df['gender']
]

chicken_df

Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name
0,34146305,1,Rooster,,Stephen Garcia
1,14129157,1,Rooster,,Thomas Hill
2,80947206,1,Rooster,,Kent Oconnor
3,13824010,1,Rooster,,Steven Simmons
4,90796045,1,Rooster,,Donald Lee
...,...,...,...,...,...
995,72282102,3,Hen,16.0,Felicia Martinez
996,28332023,3,Hen,7.0,Regina Fernandez
997,20555769,3,Hen,30.0,Judith Morales
998,87932922,3,Hen,19.0,Diana Mckay


In [44]:
# generate random feather colors for each chicken
def random_feather_color():
    """Pick one feather color at random from a fixed six-color palette."""
    palette = ('white', 'black', 'brown', 'red', 'gray', 'gold')
    return random.choice(palette)

# one independent color draw per row
chicken_df['feather_color'] = [random_feather_color() for _ in chicken_df.index]

chicken_df

Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name,feather_color
0,90327043,1,Rooster,,Mark Hernandez,white
1,25692215,1,Rooster,,Victor Wells,gold
2,75991113,1,Rooster,,Jeffrey Walls,gold
3,35881049,1,Rooster,,Ryan Green,brown
4,54519912,1,Rooster,,Joseph Anderson,black
...,...,...,...,...,...,...
995,48175098,3,Hen,8.0,Michelle Chandler,black
996,93218811,3,Hen,18.0,Mary Pope,gray
997,18358269,3,Hen,7.0,Rachel Berry,red
998,57497598,3,Hen,2.0,Mary George,black


In [62]:
# using a free API to get the billboard top 10 songs from a given date
# (stdlib urllib is used so the cell runs top-to-bottom without extra packages)
import json
import os
import urllib.parse
import urllib.request

url = "https://billboard-api2.p.rapidapi.com/hot-100"

querystring = {"date": "2023-03-01", "range": "1-10"}

headers = {
    # the key is read from the RAPIDAPI_KEY environment variable so it never
    # lands in the notebook; any key that was pasted here before should be revoked
    "X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
    "X-RapidAPI-Host": "billboard-api2.p.rapidapi.com",
}

request = urllib.request.Request(f"{url}?{urllib.parse.urlencode(querystring)}", headers=headers)

# bounded timeout so a stalled request cannot hang the notebook; HTTP error
# statuses raise urllib.error.HTTPError instead of yielding a bogus payload
with urllib.request.urlopen(request, timeout=30) as response:
    top_10 = json.load(response)

def create_song_list(json_data):
    """Turn a chart response into a list of "<title> by <artist>" strings.

    ``json_data['content']`` is expected to be a mapping whose values each
    carry a ``title`` and an ``artist`` key; the output follows the mapping's
    iteration order.
    """
    entries = json_data['content'].values()
    return [f"{entry['title']} by {entry['artist']}" for entry in entries]

song_list = create_song_list(top_10)

# one independent song draw per row
chicken_df['favorite_song'] = [random.choice(song_list) for _ in chicken_df.index]

chicken_df


Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name,favorite_song
0,34146305,1,Rooster,,Stephen Garcia,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice"
1,14129157,1,Rooster,,Thomas Hill,I'm Good (Blue) by David Guetta & Bebe Rexha
2,80947206,1,Rooster,,Kent Oconnor,Flowers by Miley Cyrus
3,13824010,1,Rooster,,Steven Simmons,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice"
4,90796045,1,Rooster,,Donald Lee,Flowers by Miley Cyrus
...,...,...,...,...,...,...
995,72282102,3,Hen,16.0,Felicia Martinez,Kill Bill by SZA
996,28332023,3,Hen,7.0,Regina Fernandez,Unholy by Sam Smith & Kim Petras
997,20555769,3,Hen,30.0,Judith Morales,Kill Bill by SZA
998,87932922,3,Hen,19.0,Diana Mckay,Kill Bill by SZA


In [63]:
# function to get parent hens and roosters for each generation
def get_parents(chicken_df, current_generation):
    """Return ``(hen_ids, rooster_ids)`` for the generation before ``current_generation``.

    Both elements are plain lists of ``egg_id`` values taken from the rows
    whose ``generation`` equals ``current_generation - 1``.
    """
    previous_generation = current_generation - 1
    is_parent = chicken_df['generation'] == previous_generation
    is_hen = chicken_df['gender'] == 'Hen'
    is_rooster = chicken_df['gender'] == 'Rooster'

    hens = chicken_df.loc[is_parent & is_hen, 'egg_id'].tolist()
    roosters = chicken_df.loc[is_parent & is_rooster, 'egg_id'].tolist()
    return hens, roosters

# function to assign parent IDs to each chicken
def assign_parents(chicken_df, generation, hens, roosters):
    """Give every chicken of ``generation`` a random parent hen and parent rooster.

    Writes the ``parent_hen_id`` and ``parent_rooster_id`` columns of
    ``chicken_df`` in place for the matching rows and returns nothing. Each
    parent is drawn independently with ``random.choice`` from ``hens`` and
    ``roosters`` respectively; all hens are drawn before any rooster.
    """
    in_generation = chicken_df['generation'] == generation
    offspring_count = len(chicken_df[in_generation])

    hen_picks = [random.choice(hens) for _ in range(offspring_count)]
    chicken_df.loc[in_generation, 'parent_hen_id'] = hen_picks

    rooster_picks = [random.choice(roosters) for _ in range(offspring_count)]
    chicken_df.loc[in_generation, 'parent_rooster_id'] = rooster_picks

# generation 2 draws its parents from generation 1
gen2_hen_parents, gen2_rooster_parents = get_parents(chicken_df, 2)
assign_parents(chicken_df, 2, gen2_hen_parents, gen2_rooster_parents)

# generation 3 draws its parents from generation 2
gen3_hen_parents, gen3_rooster_parents = get_parents(chicken_df, 3)
assign_parents(chicken_df, 3, gen3_hen_parents, gen3_rooster_parents)

chicken_df

Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name,favorite_song,parent_hen_id,parent_rooster_id
0,34146305,1,Rooster,,Stephen Garcia,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,
1,14129157,1,Rooster,,Thomas Hill,I'm Good (Blue) by David Guetta & Bebe Rexha,,
2,80947206,1,Rooster,,Kent Oconnor,Flowers by Miley Cyrus,,
3,13824010,1,Rooster,,Steven Simmons,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,
4,90796045,1,Rooster,,Donald Lee,Flowers by Miley Cyrus,,
...,...,...,...,...,...,...,...,...
995,72282102,3,Hen,16.0,Felicia Martinez,Kill Bill by SZA,45044520.0,79933740.0
996,28332023,3,Hen,7.0,Regina Fernandez,Unholy by Sam Smith & Kim Petras,91533954.0,57446627.0
997,20555769,3,Hen,30.0,Judith Morales,Kill Bill by SZA,89231986.0,56691118.0
998,87932922,3,Hen,19.0,Diana Mckay,Kill Bill by SZA,56451646.0,27271384.0


In [64]:
# find first cousins for a given chicken: the children of its parents' full siblings
def _cousins_via_parent(parent_id, own_parent_ids, df):
    """Return the egg_ids of the children of ``parent_id``'s full siblings.

    A full sibling shares both ``parent_hen_id`` and ``parent_rooster_id``
    with the parent. Every id in ``own_parent_ids`` is left out of the sibling
    set. Returns an empty list when ``parent_id`` has no row in ``df``.
    """
    parent_rows = df.loc[df['egg_id'] == parent_id]
    if parent_rows.empty:
        return []
    parent = parent_rows.iloc[0]

    # Comparisons against NaN are False, so a parent whose own parents are
    # unknown simply ends up with no siblings.
    is_full_sibling = (
        (df['parent_hen_id'] == parent['parent_hen_id'])
        & (df['parent_rooster_id'] == parent['parent_rooster_id'])
    )
    # The chicken's two parents may be full siblings of one another; without
    # this exclusion their children (the chicken itself and its siblings)
    # would be reported as cousins.
    for own_parent_id in own_parent_ids:
        is_full_sibling = is_full_sibling & (df['egg_id'] != own_parent_id)
    sibling_ids = df.loc[is_full_sibling, 'egg_id']

    has_sibling_parent = (
        df['parent_hen_id'].isin(sibling_ids) | df['parent_rooster_id'].isin(sibling_ids)
    )
    return df.loc[has_sibling_parent, 'egg_id'].to_list()


def find_cousins(chicken_id, df):
    """Return the egg_ids of the first cousins of ``chicken_id``.

    Args:
        chicken_id: ``egg_id`` of the chicken to look up.
        df: DataFrame with ``egg_id``, ``parent_hen_id`` and
            ``parent_rooster_id`` columns. Other columns are ignored, so the
            frame may already hold list-valued columns.

    Returns:
        A list of unique egg_ids, rooster-side cousins first and then hen-side
        ones. The list is empty when the chicken is not in ``df`` or no
        cousins exist. A parent missing from ``df`` only empties its own side.
    """
    chicken_rows = df.loc[df['egg_id'] == chicken_id]
    if chicken_rows.empty:
        return []

    chicken = chicken_rows.iloc[0]
    parent_hen = chicken['parent_hen_id']
    parent_rooster = chicken['parent_rooster_id']
    own_parent_ids = (parent_hen, parent_rooster)

    rooster_side = _cousins_via_parent(parent_rooster, own_parent_ids, df)
    hen_side = _cousins_via_parent(parent_hen, own_parent_ids, df)

    # dict.fromkeys de-duplicates on egg_id while keeping first-seen order.
    return list(dict.fromkeys(rooster_side + hen_side))

# cousins are only looked up for generation 3, because the grandparents of
# generation 2 chickens are unknown; everyone else gets an empty list
chicken_df['cousins'] = [
    find_cousins(egg_id, chicken_df) if generation == 3 else []
    for egg_id, generation in zip(chicken_df['egg_id'], chicken_df['generation'])
]

chicken_df

Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name,favorite_song,parent_hen_id,parent_rooster_id,cousins
0,34146305,1,Rooster,,Stephen Garcia,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,,[]
1,14129157,1,Rooster,,Thomas Hill,I'm Good (Blue) by David Guetta & Bebe Rexha,,,[]
2,80947206,1,Rooster,,Kent Oconnor,Flowers by Miley Cyrus,,,[]
3,13824010,1,Rooster,,Steven Simmons,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,,[]
4,90796045,1,Rooster,,Donald Lee,Flowers by Miley Cyrus,,,[]
...,...,...,...,...,...,...,...,...,...
995,72282102,3,Hen,16.0,Felicia Martinez,Kill Bill by SZA,45044520.0,79933740.0,[]
996,28332023,3,Hen,7.0,Regina Fernandez,Unholy by Sam Smith & Kim Petras,91533954.0,57446627.0,[]
997,20555769,3,Hen,30.0,Judith Morales,Kill Bill by SZA,89231986.0,56691118.0,"[45298982, 68894305]"
998,87932922,3,Hen,19.0,Diana Mckay,Kill Bill by SZA,56451646.0,27271384.0,[]


In [65]:
# pick one cousin at random per chicken for use in name tags (None when there are no cousins)

chicken_df['random_cousin'] = chicken_df['cousins'].map(
    lambda cousin_ids: random.choice(cousin_ids) if cousin_ids else None
)

chicken_df

Unnamed: 0,egg_id,generation,gender,incubation_hall_location_id,name,favorite_song,parent_hen_id,parent_rooster_id,cousins,random_cousin
0,34146305,1,Rooster,,Stephen Garcia,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,,[],
1,14129157,1,Rooster,,Thomas Hill,I'm Good (Blue) by David Guetta & Bebe Rexha,,,[],
2,80947206,1,Rooster,,Kent Oconnor,Flowers by Miley Cyrus,,,[],
3,13824010,1,Rooster,,Steven Simmons,"Boy's A Liar, Pt. 2 by PinkPantheress & Ice Spice",,,[],
4,90796045,1,Rooster,,Donald Lee,Flowers by Miley Cyrus,,,[],
...,...,...,...,...,...,...,...,...,...,...
995,72282102,3,Hen,16.0,Felicia Martinez,Kill Bill by SZA,45044520.0,79933740.0,[],
996,28332023,3,Hen,7.0,Regina Fernandez,Unholy by Sam Smith & Kim Petras,91533954.0,57446627.0,[],
997,20555769,3,Hen,30.0,Judith Morales,Kill Bill by SZA,89231986.0,56691118.0,"[45298982, 68894305]",68894305.0
998,87932922,3,Hen,19.0,Diana Mckay,Kill Bill by SZA,56451646.0,27271384.0,[],


# 3. Generate the name tags table

In [68]:
# one row per chicken: self-joins on chicken_df resolve both parents, all four
# grandparents and the randomly chosen cousin to their names (plus hall
# locations for parents and grandparents); left joins keep chickens whose
# relatives are unknown, leaving those columns null
name_tags = duckdb.query("""
        select
            base.name,
            base.favorite_song,
            base.generation,
            p_hen.name as parent_hen_name,
            p_hen.incubation_hall_location_id as parent_hen_location,
            p_roo.name as parent_rooster_name,
            p_roo.incubation_hall_location_id as parent_rooster_location,
            gp_hen_hen_side.name as maternal_grandma_name,
            gp_hen_hen_side.incubation_hall_location_id as maternal_grandma_location,
            gp_roo_hen_side.name as maternal_grandpa_name,
            gp_roo_hen_side.incubation_hall_location_id as maternal_grandpa_location,
            gp_hen_roo_side.name as paternal_grandma_name,
            gp_hen_roo_side.incubation_hall_location_id as paternal_grandma_location,       
            gp_roo_roo_side.name as paternal_grandpa_name,
            gp_roo_roo_side.incubation_hall_location_id as paternal_grandpa_location,
            cousins.name as cousin
        from chicken_df base
        left join chicken_df p_hen
            on base.parent_hen_id = p_hen.egg_id
        left join chicken_df p_roo
            on base.parent_rooster_id = p_roo.egg_id
        left join chicken_df gp_hen_hen_side
            on p_hen.parent_hen_id = gp_hen_hen_side.egg_id    
        left join chicken_df gp_roo_hen_side
            on p_hen.parent_rooster_id = gp_roo_hen_side.egg_id        
        left join chicken_df gp_roo_roo_side
            on p_roo.parent_rooster_id = gp_roo_roo_side.egg_id  
        left join chicken_df gp_hen_roo_side
            on p_roo.parent_hen_id = gp_hen_roo_side.egg_id 
        left join chicken_df cousins
            on base.random_cousin = cousins.egg_id             
""").to_df()

name_tags

Unnamed: 0,name,favorite_song,generation,parent_hen_name,parent_hen_location,parent_rooster_name,parent_rooster_location,maternal_grandma_name,maternal_grandma_location,maternal_grandpa_name,maternal_grandpa_location,paternal_grandma_name,paternal_grandma_location,paternal_grandpa_name,paternal_grandpa_location,cousin
0,Matthew Davila,I'm Good (Blue) by David Guetta & Bebe Rexha,2,Alison Campbell,,Anthony Hall,,,,,,,,,,
1,Brian Goodman,Die For You by The Weeknd,2,Mary Riggs,,David Roth,,,,,,,,,,
2,Mitchell Carpenter,Die For You by The Weeknd,2,Sandra Gutierrez,,Greg Cline,,,,,,,,,,
3,Chad Smith,Kill Bill by SZA,2,Sharon Peterson,,Matthew Potts,,,,,,,,,,
4,Kenneth Villanueva,I'm Good (Blue) by David Guetta & Bebe Rexha,2,Melissa Smith,,Keith Calderon,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,James Spencer,Last Night by Morgan Wallen,3,Christina Lewis,29.0,James Carpenter,25.0,Crystal Fritz,,Timothy Robinson,,Tiffany Knight,,James Smith,,Thomas Higgins
996,Joshua Huang,Flowers by Miley Cyrus,3,Erica Harris,20.0,Samuel Webb,5.0,Claudia Morgan,,David Gentry,,Joyce Sparks,,Tyler Crawford,,Jimmy Ross
997,Erin Goodman,Unholy by Sam Smith & Kim Petras,3,Courtney Lewis,25.0,Samuel Webb,5.0,Julia Lin,,William Hurley,,Joyce Sparks,,Tyler Crawford,,Jimmy Ross
998,Margaret Garcia,Die For You by The Weeknd,3,Donna Brown,20.0,Casey Gonzalez,14.0,Cynthia Smith,,Jason Dunlap,,Crystal Gaines,,Jonathan Johnson,,Adam Gibbs
