In [1]:
import pandas as pd 
from datetime import datetime
from datetime import date
import json
import pickle

def calculate_age(born, today=None):
    """Return the age in completed years of someone born on ``born``.

    Args:
        born: Birth date; any object exposing ``year``, ``month`` and ``day``
            attributes (``datetime.date``, ``datetime.datetime``,
            ``pandas.Timestamp``).
        today: Date the age is measured against. When omitted, the current
            date from ``date.today()`` is used, so the default call depends
            on state that lives outside the function and its arguments.

    Returns:
        The number of completed years between ``born`` and ``today``.
    """
    if today is None:
        today = date.today()
    # The tuple comparison is True (counted as 1) while this year's birthday
    # is still ahead, in which case the current year must not be counted.
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_still_ahead

def _roster(rows):
    """Build an athletes frame from (name, sport, wins, date_of_birth) rows."""
    columns = ("name", "sport", "wins", "date_of_birth")
    # Transpose the rows into one list per column so the frame is built from
    # a column -> list mapping.
    return pd.DataFrame({column: list(values) for column, values in zip(columns, zip(*rows))})


# Group A; a sport of None marks an athlete without a recorded sport.
A_athletes = _roster([
    ("George", "Tennis", 20, "1989-02-01"),
    ("Veronica", None, 76, "1999-10-11"),
    ("Marc", "Golf", 12, "2011-07-26"),
    ("Sally", "Golf", 33, "2004-04-17"),
])

# Group B; same columns as group A.
B_athletes = _roster([
    ("Salma", "Golf", 2, "1988-06-01"),
    ("Viktor", "Tennis", 33, "1989-10-11"),
    ("Raul", None, 25, "2010-07-26"),
    ("Nadia", "Skate", 71, "2001-04-17"),
])
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,George,Tennis,20,1989-02-01
1,Veronica,,76,1999-10-11
2,Marc,Golf,12,2011-07-26
3,Sally,Golf,33,2004-04-17


# Data Transformations in Machine Learning

![SparkML](sparkml.webp)

Rodrigo Agundez - 06 April 2023

## The need for a special paradigm for data transformations in Machine Learning comes from:

1. The high degree of flexibility required from functional transformations, non-functional transformations and their interoperability.
1. The need for non-functional transformations to keep state:

    - Across time 
    
        An aggregated quantity used for normalization that updates every month.
        
    - Across space 
    
        A collection of categories that is computed in a batch job and later used at inference in a backend service.

#  1. Highly-demanding flexibility for transformations

## All data transformations are of 2 types:

- ### Functional

    All information needed is contained within the function operations and data to be transformed.
    
- ### Non-functional

    It needs information from outside the function operations and the data to be transformed.

# Example: Lowercase strings

In [2]:
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,George,Tennis,20,1989-02-01
1,Veronica,,76,1999-10-11
2,Marc,Golf,12,2011-07-26
3,Sally,Golf,33,2004-04-17


In [3]:
# Lowercasing needs nothing beyond the values themselves, so both text
# columns go through the same loop.
for text_column in ("name", "sport"):
    A_athletes[text_column] = A_athletes[text_column].str.lower()
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,george,tennis,20,1989-02-01
1,veronica,,76,1999-10-11
2,marc,golf,12,2011-07-26
3,sally,golf,33,2004-04-17


## Functional

# Example: Extract month

In [4]:
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,george,tennis,20,1989-02-01
1,veronica,,76,1999-10-11
2,marc,golf,12,2011-07-26
3,sally,golf,33,2004-04-17


In [5]:
# Parse the date strings once and reuse the parsed series for the month.
birth_dates = pd.to_datetime(A_athletes["date_of_birth"])
A_athletes["date_of_birth"] = birth_dates
A_athletes["month"] = birth_dates.dt.month
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month
0,george,tennis,20,1989-02-01,2
1,veronica,,76,1999-10-11,10
2,marc,golf,12,2011-07-26,7
3,sally,golf,33,2004-04-17,4


## Functional

## Example: Imputation of empty values


In [6]:
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month
0,george,tennis,20,1989-02-01,2
1,veronica,,76,1999-10-11,10
2,marc,golf,12,2011-07-26,7
3,sally,golf,33,2004-04-17,4


In [7]:
# Athletes without a recorded sport receive the constant label "unknown".
sport_is_missing = A_athletes["sport"].isnull()
A_athletes.loc[sport_is_missing, "sport"] = "unknown"
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month
0,george,tennis,20,1989-02-01,2
1,veronica,unknown,76,1999-10-11,10
2,marc,golf,12,2011-07-26,7
3,sally,golf,33,2004-04-17,4


## Functional

## Example: Age from date of birth

In [10]:
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month,month_norm
0,george,tennis,20,1989-02-01,2,0.166667
1,veronica,unknown,76,1999-10-11,10,0.833333
2,marc,golf,12,2011-07-26,7,0.583333
3,sally,golf,33,2004-04-17,4,0.333333


In [11]:
# calculate_age runs once per birth date; its result depends on the day the
# cell is executed.
A_athletes["age"] = A_athletes["date_of_birth"].map(calculate_age)
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month,month_norm,age
0,george,tennis,20,1989-02-01,2,0.166667,34
1,veronica,unknown,76,1999-10-11,10,0.833333,23
2,marc,golf,12,2011-07-26,7,0.583333,11
3,sally,golf,33,2004-04-17,4,0.333333,18


## Non-functional - It needs information about today!

## Example: Normalizing Age of Athletes B based on Athletes A

In [12]:
# State taken from group A: the largest age, used later as the divisor when
# normalising the ages of group B.
max_age = A_athletes["age"].max()
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,Salma,Golf,2,1988-06-01
1,Viktor,Tennis,33,1989-10-11
2,Raul,,25,2010-07-26
3,Nadia,Skate,71,2001-04-17


In [13]:
b_birth_dates = pd.to_datetime(B_athletes["date_of_birth"])  # functional
B_athletes["date_of_birth"] = b_birth_dates
B_athletes["age"] = b_birth_dates.apply(calculate_age)  # non-functional: depends on today's date
B_athletes["age_norm"] = B_athletes["age"].div(max_age)  # non-functional: max_age comes from group A
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,age,age_norm
0,Salma,Golf,2,1988-06-01,34,1.0
1,Viktor,Tennis,33,1989-10-11,33,0.970588
2,Raul,,25,2010-07-26,12,0.352941
3,Nadia,Skate,71,2001-04-17,21,0.617647


## Non-functional: It needs the maximum age of athletes A

## Example:  Encoding Sport of Athletes B based on Athletes A

In [14]:
# Give every distinct sport of group A an integer code. The sports are
# de-duplicated before numbering so the codes are contiguous and do not depend
# on how many athletes share a sport (numbering the raw sorted column let
# repeated sports overwrite each other and skip codes). Codes start at 1,
# which reproduces the mapping this notebook displays for group A.
sports_idx = {sport: code for code, sport in enumerate(sorted(set(A_athletes["sport"])), start=1)}
sports_idx

{'golf': 1, 'tennis': 2, 'unknown': 3}

In [15]:
# Encode group A with the mapping built from its own sport column.
A_athletes["sport_idx"] = A_athletes["sport"].map(sports_idx)
A_athletes

Unnamed: 0,name,sport,wins,date_of_birth,month,month_norm,age,sport_idx
0,george,tennis,20,1989-02-01,2,0.166667,34,2
1,veronica,unknown,76,1999-10-11,10,0.833333,23,3
2,marc,golf,12,2011-07-26,7,0.583333,11,1
3,sally,golf,33,2004-04-17,4,0.333333,18,1


In [16]:
# Repeat on group B the cleaning steps that were applied to group A.
B_athletes["sport"] = B_athletes["sport"].str.lower()
b_sport_missing = B_athletes["sport"].isnull()
B_athletes.loc[b_sport_missing, "sport"] = "unknown"
# A plain dict lookup leaves sports that are not keys of sports_idx without a code.
B_athletes["sport_idx"] = B_athletes["sport"].map(sports_idx)
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,age,age_norm,sport_idx
0,Salma,golf,2,1988-06-01,34,1.0,1.0
1,Viktor,tennis,33,1989-10-11,33,0.970588,2.0
2,Raul,unknown,25,2010-07-26,12,0.352941,3.0
3,Nadia,skate,71,2001-04-17,21,0.617647,


In [17]:
def _encode_sport(sport):
    """Return the code for ``sport``, or the code of "unknown" when it has none."""
    return sports_idx.get(sport, sports_idx["unknown"])


B_athletes["sport_idx"] = B_athletes["sport"].map(_encode_sport)
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,age,age_norm,sport_idx
0,Salma,golf,2,1988-06-01,34,1.0,1
1,Viktor,tennis,33,1989-10-11,33,0.970588,2
2,Raul,unknown,25,2010-07-26,12,0.352941,3
3,Nadia,skate,71,2001-04-17,21,0.617647,3


![Transformations](transformations.png)

#  2. Keep state across time and space

![time_and_space](time_and_space.png)

## Example:  Encoding Sport of Athletes B based on Athletes A

In [18]:
# Start again from the raw group B records, written one athlete per row.
b_columns = ("name", "sport", "wins", "date_of_birth")
b_rows = [
    ("Salma", "Golf", 2, "1988-06-01"),
    ("Viktor", "Tennis", 33, "1989-10-11"),
    ("Raul", None, 25, "2010-07-26"),
    ("Nadia", "Skate", 71, "2001-04-17"),
]
# Transposing the rows yields one list per column for the constructor.
B_athletes = pd.DataFrame({column: list(values) for column, values in zip(b_columns, zip(*b_rows))})

In [19]:
# Give every distinct sport of group A an integer code. De-duplicating before
# numbering keeps the codes contiguous and independent of how many athletes
# share a sport (numbering the raw sorted column let repeated sports overwrite
# each other and skip codes). Codes start at 1, which reproduces the mapping
# this notebook displays for group A.
sports_idx = {sport: code for code, sport in enumerate(sorted(set(A_athletes["sport"])), start=1)}
# Persist the mapping so it can be read back from the file later.
with open("categories.json", "w") as fp:
    json.dump(sports_idx, fp)
sports_idx

{'golf': 1, 'tennis': 2, 'unknown': 3}

## Example:  Encoding Sport of Athletes B based on Athletes A

In [20]:
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,Salma,Golf,2,1988-06-01
1,Viktor,Tennis,33,1989-10-11
2,Raul,,25,2010-07-26
3,Nadia,Skate,71,2001-04-17


In [21]:
# Read the stored mapping back from disk.
with open("categories.json", "r") as fp:
    sports_idx = json.loads(fp.read())
sports_idx

{'golf': 1, 'tennis': 2, 'unknown': 3}

In [22]:
# The sport values are still capitalised here ("Golf") while the mapping keys
# are lowercase ("golf"), so no value finds a code.
B_athletes["sport_idx"] = B_athletes["sport"].map(sports_idx)
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,sport_idx
0,Salma,Golf,2,1988-06-01,
1,Viktor,Tennis,33,1989-10-11,
2,Raul,,25,2010-07-26,
3,Nadia,Skate,71,2001-04-17,


## Example:  Encoding Sport of Athletes B based on Athletes A

In [23]:
lowercased_sport = B_athletes["sport"].str.lower()
B_athletes["sport"] = lowercased_sport
with open("categories.json", "r") as fp:
    sports_idx = json.load(fp)
# The JSON file carries the codes only; missing sports and sports without a
# key in the mapping stay empty.
B_athletes["sport_idx"] = lowercased_sport.map(sports_idx)
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,sport_idx
0,Salma,golf,2,1988-06-01,1.0
1,Viktor,tennis,33,1989-10-11,2.0
2,Raul,,25,2010-07-26,
3,Nadia,skate,71,2001-04-17,


In [24]:
class SportsCategories:
    """Bundle a category mapping with the name of its fallback category.

    Instances of this class are pickled by this notebook; loading such a
    pickle requires the class to be resolvable under the same module and name.
    """

    def __init__(self, categories: dict, na_category: str):
        """Store the mapping and the fallback key unchanged.

        Args:
            categories: Mapping from category name to its integer code.
            na_category: Key of ``categories`` whose code is used for values
                that have no code of their own.
        """
        self.categories = categories
        self.na_category = na_category
        
# Keep the codes and the fallback category name together in one object and
# serialise that object to disk.
sport_cat = SportsCategories(categories=sports_idx, na_category="unknown")
with open("sport_categories.pkl", "wb") as fp:
    fp.write(pickle.dumps(sport_cat))

## Example:  Encoding Sport of Athletes B based on Athletes A

In [25]:
B_athletes["sport"] = B_athletes["sport"].str.lower()
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source, such as the one written by this notebook.
with open("sport_categories.pkl", "rb") as fp:
    sport_cat = pickle.loads(fp.read())

In [26]:
# Look up every sport, then give whatever found no code the code of the
# fallback category stored alongside the mapping.
looked_up = B_athletes["sport"].map(sport_cat.categories)
fallback_code = sport_cat.categories[sport_cat.na_category]
B_athletes["sport_idx"] = looked_up.fillna(fallback_code)
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth,sport_idx
0,Salma,golf,2,1988-06-01,1.0
1,Viktor,tennis,33,1989-10-11,2.0
2,Raul,,25,2010-07-26,3.0
3,Nadia,skate,71,2001-04-17,3.0


## Example:  Encoding Sport of Athletes B based on Athletes A

In [36]:
# Clear the interactive namespace (-f skips the confirmation prompt); every
# name defined so far is gone, so pandas has to be imported again.
%reset -f
import pandas as pd

# Rebuild the raw group B records from scratch.
B_athletes = pd.DataFrame({"name": ["Salma", "Viktor", "Raul", "Nadia"], "sport": ["Golf", "Tennis", None, "Skate"], "wins": [2, 33, 25, 71], "date_of_birth": ["1988-06-01", "1989-10-11", "2010-07-26", "2001-04-17"]})
B_athletes

Unnamed: 0,name,sport,wins,date_of_birth
0,Salma,Golf,2,1988-06-01
1,Viktor,Tennis,33,1989-10-11
2,Raul,,25,2010-07-26
3,Nadia,Skate,71,2001-04-17


In [37]:
import pickle

B_athletes["sport"] = B_athletes["sport"].str.lower()
# This load is expected to fail: the pickle refers to the SportsCategories
# class by name, and that class no longer exists in __main__ after the reset.
with open("sport_categories.pkl", "rb") as fp:
    sport_cat = pickle.load(fp)

AttributeError: Can't get attribute 'SportsCategories' on <module '__main__'>

# [SciKit-Learn Pipelines](02-scikit_learn_pipelines.ipynb)