In [1]:
from excelbird import Schema
import pandas as pd

In [2]:
# Small hand-written employee table that stands in for a real input source
df_employee_raw = pd.DataFrame(
    data=[
        ["Jared", "Richards", 45, 80, "DET"],
        ["Emily", "Seitz", 87, 60, "DNA"],
        ["Nick", "Smarts", 23, 50, "DET"],
    ],
    columns=[
        "Emp FName",
        "Emp LName",
        "Hours Worked",
        "Emp Hourly Rate",
        "Domain",
    ],
)
df_employee_raw

Unnamed: 0,Emp FName,Emp LName,Hours Worked,Emp Hourly Rate,Domain
0,Jared,Richards,45,80,DET
1,Emily,Seitz,87,60,DNA
2,Nick,Smarts,23,50,DET


In [3]:
# A Schema is a dictionary subclass. Its entries are passed as keyword arguments
# (`key="value"` rather than `"key": "value"`), which guarantees that every key
# is a valid Python identifier.
schema = Schema(
    # var_name=("Input Col Name", "Output Col Name")
    last_name=("Emp LName", "Last Name"),
    first_name=("Emp FName", "First Name"),
    hours="Hours Worked",  # a single string is used as both the input and the output name
    rate=("Emp Hourly Rate", "Hourly Rate"),
)
# It is still a dictionary: its keys are the variable names declared above.
schema.keys()

dict_keys(['last_name', 'first_name', 'hours', 'rate'])

In [4]:
# Our main code can now import the schema and use it to turn the raw input
# into something friendly to work with: only the schema's columns are kept,
# renamed to its variable names (compare the result with `df_employee_raw`).
df = schema.select_inputs(df_employee_raw)
df

Unnamed: 0,last_name,first_name,hours,rate
0,Richards,Jared,45,80
1,Seitz,Emily,87,60
2,Smarts,Nick,23,50


Calling `.select_inputs()` did the following:
1. Check if all desired columns are present in input data, and let you know (by throwing an error) if one is missing
2. Select the desired columns, and ignore the rest
3. Rename them to your desired standardized variable names
4. Re-order them to the order specified in your schema

In [5]:
# When the data is ready to be written out, convert the variable names
# to the output column names declared in the schema.
df_out = schema.select_outputs(df)
df_out

Unnamed: 0,Last Name,First Name,Hours Worked,Hourly Rate
0,Richards,Jared,45,80
1,Seitz,Emily,87,60
2,Smarts,Nick,23,50


Calling `.select_outputs()` does the inverse:
1. Ensure all required columns are present. Throw a friendly error if not.
2. Select desired columns and ignore the rest
3. Rename to output names
4. Re-order columns

In [6]:
# A schema can be subset like a DataFrame, using double brackets with a list of keys.
# This returns an entirely new object, with its columns in the order requested.
schema_mini = schema[["rate", "last_name"]]

# Only the two selected columns are output
df_mini = schema_mini.select_outputs(df)
df_mini

Unnamed: 0,Hourly Rate,Last Name
0,80,Richards
1,60,Seitz
2,50,Smarts


In [7]:
# Real-world code usually combines columns from several schemas and creates
# new columns along the way. An output schema declares and enforces the result.
# Existing schemas can be passed in when constructing a new one, which makes it
# explicit where each column comes from.
schema_composite = Schema(
    schema[["hours", "first_name"]],
    schema_mini,
    daily_rate="Daily Rate",
)
# Show one (key, Column) pair per line
print("\n".join(str(pair) for pair in schema_composite.items()))

('hours', Column(input='Hours Worked', output='Hours Worked'))
('first_name', Column(input='Emp FName', output='First Name'))
('rate', Column(input='Emp Hourly Rate', output='Hourly Rate'))
('last_name', Column(input='Emp LName', output='Last Name'))
('daily_rate', Column(input='Daily Rate', output='Daily Rate'))


In [8]:
# If we try to format an output using `schema_composite` right now,
# we'll get a helpful error that lets us know we're missing a column.
# (The call is presumably left commented out so the notebook still runs top to bottom.)
# df_out = schema_composite.select_outputs(df)

In [9]:
# Add the one column the composite schema declares that `df` does not have yet.
# NOTE(review): the factor 8 presumably stands for an 8-hour working day -- confirm.
df["daily_rate"] = df["rate"] * 8

# Every declared column is now present, so the output can be produced
df_out = schema_composite.select_outputs(df)
df_out

Unnamed: 0,Hours Worked,First Name,Hourly Rate,Last Name,Daily Rate
0,45,Jared,80,Richards,640
1,87,Emily,60,Seitz,480
2,23,Nick,50,Smarts,400


## A more realistic use-case
---

In [10]:
# Ignore these lines, and pretend you're reading this data in
df_emp_raw = pd.DataFrame(
    data=[
        ["Jared", "Richards", 24, "red"],
        ["Emily", "Seitz", 55, "green"],
        ["Nick", "Smarts", 33, "blue"],
    ],
    columns=[
        "Employee First Name",
        "Employee Last Name",
        "Age",
        "Favorite Color",
    ],
)
df_roster_raw = pd.DataFrame(
    data=[
        ["Jared", "Richards", 45, 80, "DET"],
        ["Emily", "Seitz", 87, 60, "DNA"],
        ["Nick", "Smarts", 23, 50, "DET"],
    ],
    columns=[
        "Emp FName",
        "Emp LName",
        "Hours Worked",
        "Emp Hourly Rate",
        "Domain",
    ],
)
# Show both raw inputs one after the other
display(df_emp_raw, df_roster_raw)

Unnamed: 0,Employee First Name,Employee Last Name,Age,Favorite Color
0,Jared,Richards,24,red
1,Emily,Seitz,55,green
2,Nick,Smarts,33,blue


Unnamed: 0,Emp FName,Emp LName,Hours Worked,Emp Hourly Rate,Domain
0,Jared,Richards,45,80,DET
1,Emily,Seitz,87,60,DNA
2,Nick,Smarts,23,50,DET


We've got two input data sources: employee data, and roster data (see above).

They share two columns (first and last name) that we want to join on, but those columns are named differently in each source.

In our code, we would need to rename these columns before joining. We would also need to reference the exact names of the input columns, which is not only annoying (because they're long) but also inconvenient if the input column names change.

We might also write our finished data to a file that is later read by another script in the pipeline. What if we change the column names of our output data? Then every other script that reads our output will need to be changed. That's also inconvenient.

### Solution:
Declare ahead of time, in a separate file:
- A schema for each of our input datasets
- A schema for our final output. Then, another script in the pipeline that reads our output can reference this schema when reading data.

In [11]:
# Input schema for the employee dataset
sch_employee = Schema(
    last_name=("Employee Last Name", "Last Name"),
    first_name=("Employee First Name", "First Name"),
    age="Age",
)

# Input schema for the roster dataset
sch_roster = Schema(
    last_name="Emp LName",
    first_name="Emp FName",
    hours="Hours Worked",
    rate=("Emp Hourly Rate", "Hourly Rate"),
)

# Output schema: columns reused from both input schemas, plus two new ones.
# It makes clear to the reader what our script needs to do:
# join emp and roster, and add two new columns.
sch_output = Schema(
    sch_employee[["first_name", "last_name", "age"]],
    sch_roster[["rate"]],
    pay="Total Pay",
    notes="Notes",
)

In [12]:
# Here's another look at the two raw inputs, before any schema is applied
display(df_emp_raw, df_roster_raw)

Unnamed: 0,Employee First Name,Employee Last Name,Age,Favorite Color
0,Jared,Richards,24,red
1,Emily,Seitz,55,green
2,Nick,Smarts,33,blue


Unnamed: 0,Emp FName,Emp LName,Hours Worked,Emp Hourly Rate,Domain
0,Jared,Richards,45,80,DET
1,Emily,Seitz,87,60,DNA
2,Nick,Smarts,23,50,DET


In [13]:
# Apply each input schema to its raw frame. Both results use the same variable
# names for the name columns (`first_name`, `last_name`), so they can be joined
# without any manual renaming.
df_emp = sch_employee.select_inputs(df_emp_raw)
df_roster = sch_roster.select_inputs(df_roster_raw)
display(df_emp, df_roster)

Unnamed: 0,last_name,first_name,age
0,Richards,Jared,24
1,Seitz,Emily,55
2,Smarts,Nick,33


Unnamed: 0,last_name,first_name,hours,rate
0,Richards,Jared,45,80
1,Seitz,Emily,87,60
2,Smarts,Nick,23,50


In [14]:
# Join the two standardized frames on the name columns they share
# (pandas' default inner join)
df = pd.merge(left=df_emp, right=df_roster, on=["first_name", "last_name"])
df

Unnamed: 0,last_name,first_name,age,hours,rate
0,Richards,Jared,24,45,80
1,Seitz,Emily,55,87,60
2,Smarts,Nick,33,23,50


In [15]:
# Is it ready to match our output schema? Not yet: `sch_output` declares
# `pay` and `notes`, which have not been created.
# The result is deliberately NOT assigned back to `df`. This cell is only a
# check; if it is re-run once the columns exist, overwriting `df` with
# output-named columns would break the following cell, which still needs the
# `hours` and `rate` columns.
try:
    sch_output.select_outputs(df)
except Exception as e:  # the concrete exception type raised by Schema is not visible here
    # The exception is the demonstration: show its message instead of a traceback
    print(e)

Please add columns, ['pay', 'notes'] before outputting.


In [16]:
# Create the two columns the output schema was still missing
df["pay"] = df["hours"] * df["rate"]
df["notes"] = ""

# Every declared column now exists, so this time the conversion succeeds
df_output = sch_output.select_outputs(df)
df_output

Unnamed: 0,First Name,Last Name,Age,Hourly Rate,Total Pay,Notes
0,Jared,Richards,24,80,3600,
1,Emily,Seitz,55,60,5220,
2,Nick,Smarts,33,50,1150,


### Read data using the output schema of another script

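- Remember to call `.reset_inputs()` first, so that the schema's input column names match the column names written by the previous script!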

In [17]:
# Pretend we're in another script that reads in our previous output
df_raw = df_output.copy()
df_raw

Unnamed: 0,First Name,Last Name,Age,Hourly Rate,Total Pay,Notes
0,Jared,Richards,24,80,3600,
1,Emily,Seitz,55,60,5220,
2,Nick,Smarts,33,50,1150,


In [18]:
# We can use `sch_output` again, but its input column names still refer to
# the original raw sources and are out of date. Reset them first, so that
# they match the output names found in `df_raw`.
schema = sch_output.reset_inputs()

df = schema.select_inputs(df_raw)
df

Unnamed: 0,first_name,last_name,age,rate,pay,notes
0,Jared,Richards,24,80,3600,
1,Emily,Seitz,55,60,5220,
2,Nick,Smarts,33,50,1150,


## Create a schema that knows how to read its input data
---
If you have some prep-work that's always required after reading a certain dataset, it would be quite a luxury if the schema could handle this itself.

Or, what if you have multiple input sources that need to be joined immediately after being read? Instead of creating a separate schema for each, just create a single schema with the shared fields from each, and tell it how to populate each field.

### Example: Concat first and last name into full name on read
---
Let's go back to the first example. Instead of declaring `first_name` and `last_name`, we'll just declare `name`, and tell the schema how to populate it.

In [19]:
# Input source: the raw employee data created at the top of the notebook
df_employee_raw

Unnamed: 0,Emp FName,Emp LName,Hours Worked,Emp Hourly Rate,Domain
0,Jared,Richards,45,80,DET
1,Emily,Seitz,87,60,DNA
2,Nick,Smarts,23,50,DET


In [20]:
# Create a subclass of Schema with a read() function
class EmployeeSchema(Schema):
    """Schema that also knows how to load and prepare its own input data."""

    def read(self):
        """Return the raw employee data with an added "Employee Name" column.

        The name is built by joining "Emp FName" and "Emp LName" with a single
        space. The source frame is copied first, so `df_employee_raw` itself is
        left unchanged and this method can be called repeatedly.
        """
        # pretend to read in data from a source; work on a copy so the shared
        # raw frame is not modified in place
        df = df_employee_raw.copy()
        # Create name column
        df["Employee Name"] = df["Emp FName"] + " " + df["Emp LName"]
        return df


schema = EmployeeSchema(
    name="Employee Name",
    hours="Hours Worked",
    rate=("Emp Hourly Rate", "Hourly Rate"),
)

# Load the prepared data, then standardize it with the same schema
df = schema.read()
df = schema.select_inputs(df)
df

Unnamed: 0,name,hours,rate
0,Jared Richards,45,80
1,Emily Seitz,87,60
2,Nick Smarts,23,50
