In [0]:
%pip install databricks-feature-engineering scikit-learn codecarbon --quiet
# Restart the Python process so that later cells import the packages installed above.
dbutils.library.restartPython()

Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
# Load the raw athletes CSV (header row, inferred column types) and preview it
athletes_csv_path = "dbfs:/Volumes/workspace/default/mlops/athletes.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(athletes_csv_path)
)
display(df.limit(5))

athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,helen,grace,filthy50,fgonebad,run400,run5k,candj,snatch,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong
2554.0,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,,,,,,,220.0,,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I have a coach who determines my programming|I record my workouts|,I played youth or high school level sports|I regularly play recreational sports|,I began CrossFit with a coach (e.g. at an affiliate)|I have attended one or more specialty courses|I have had a life changing experience due to CrossFit|,I do multiple workouts in a day 2x a week|,4+ years|
3517.0,Derek Abdella,,,,Male,42.0,70.0,190.0,,,,,0.0,,,,,,,,,I have a coach who determines my programming|I record my workouts|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affiliate)|I have attended one or more specialty courses|I have had a life changing experience due to CrossFit|,I do multiple workouts in a day 2x a week|,4+ years|
4691.0,,,,,,,,,,,,,,,,,,,,,,,,,,
5164.0,Abo Brandon,Southern California,LAX CrossFit,LAX CrossFit,Male,40.0,67.0,,211.0,645.0,300.0,,196.0,,,245.0,200.0,375.0,325.0,25.0,I eat 1-3 full cheat meals per week|,I workout mostly at a CrossFit Affiliate|I have a coach who determines my programming|I record my workouts|,I played youth or high school level sports|,I began CrossFit by trying it alone (without a coach)|I began CrossFit with a coach (e.g. at an affiliate)|I have completed the CrossFit Level 1 certificate course|I have attended one or more specialty courses|I have had a life changing experience due to CrossFit|I train other people|,I usually only do 1 workout a day|,4+ years|
5286.0,Bryce Abbey,,,,Male,32.0,65.0,149.0,206.0,465.0,,1053.0,,,1081.0,205.0,150.0,,325.0,50.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I incorporate CrossFit.com workouts|I write my own programming|I record my workouts|,I played college sports|,I began CrossFit by trying it alone (without a coach)|I have completed the CrossFit Level 1 certificate course|I train other people|,I usually only do 1 workout a day|I strictly schedule my rest days|,1-2 years|


In [0]:
# Restrict the data to the id, the model inputs and the target,
# discarding every row that is missing any of them.
required_columns = ["athlete_id", "age", "height", "weight", "candj", "snatch", "deadlift"]
df_clean = df.na.drop(subset=required_columns).select(*required_columns)
display(df_clean.limit(5))

athlete_id,age,height,weight,candj,snatch,deadlift
6491.0,37.0,73.0,230.0,265.0,200.0,435.0
6610.0,21.0,72.0,175.0,0.0,0.0,0.0
7463.0,30.0,72.0,175.0,0.0,0.0,0.0
8242.0,40.0,68.0,177.0,225.0,185.0,365.0
11416.0,31.0,65.0,150.0,290.0,225.0,465.0


In [0]:
# Create Feature Store Tables (v1 & v2)
from databricks.feature_store import FeatureStoreClient
from pyspark.sql.functions import col

fs = FeatureStoreClient()

# Create the database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS workspace.mlops")


def _publish_feature_table(name, features, description):
    """Create the feature table `name`, or merge into it when it already exists.

    Calling `fs.create_table` unconditionally is expected to fail once the
    table exists, which makes the cell fail on any re-run of the notebook.
    This helper keeps the first run unchanged and turns later runs into a
    merge write into the existing table.

    Args:
        name: Fully qualified table name (catalog.schema.table).
        features: Spark DataFrame with `athlete_id` plus the feature columns.
        description: Description stored with the table when it is created.

    Returns:
        The FeatureTable object describing `name`.
    """
    if spark.catalog.tableExists(name):
        # NOTE(review): a merge presumably needs athlete_id to be unique in
        # `features` -- confirm against the source data.
        fs.write_table(name=name, df=features, mode="merge")
        return fs.get_table(name)
    return fs.create_table(
        name=name,
        primary_keys=["athlete_id"],
        df=features,
        description=description,
    )


# Feature version 1: raw features
features_v1 = df_clean.select("athlete_id", "age", "height", "weight", "candj", "snatch")
_publish_feature_table(
    "workspace.mlops.athletes_features_v1",
    features_v1,
    "Athletes v1: raw features",
)

# Feature version 2: replaces candj/snatch with the derived 'power_metric' (their sum)
features_v2 = features_v1.withColumn("power_metric", col("candj") + col("snatch")) \
                         .select("athlete_id", "age", "height", "weight", "power_metric")
# Last expression of the cell: the v2 FeatureTable is shown as the cell output.
_publish_feature_table(
    "workspace.mlops.athletes_features_v2",
    features_v2,
    "Athletes v2: includes derived power_metric",
)

<FeatureTable: name='workspace.mlops.athletes_features_v2', table_id='1b42b366-fd88-4633-9e0d-c8a174d888af', description='Athletes v2: includes derived power_metric', primary_keys=['athlete_id'], partition_columns=[], features=['athlete_id', 'age', 'height', 'weight', 'power_metric'], creation_timestamp=1752637212561, online_stores=[], notebook_producers=[], job_producers=[], table_data_sources=[], path_data_sources=[], custom_data_sources=[], timestamp_keys=[], tags={}>

In [0]:
from databricks.feature_store import FeatureStoreClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from codecarbon import EmissionsTracker
import mlflow
import pandas as pd

In [0]:
# Define Training Function
def train_model(version: str, n_estimators: int, max_depth: int):
    """Train one random-forest deadlift regressor and log the run to MLflow.

    Reads ``workspace.mlops.athletes_features_{version}`` from the Feature
    Store, joins it on ``athlete_id`` with the ``deadlift`` target taken from
    the raw CSV, fits an impute -> scale -> RandomForestRegressor pipeline on
    an 80/20 train/test split, and logs parameters, metrics, the emissions
    estimate and the fitted pipeline to a new MLflow run.

    Args:
        version: Feature-table suffix, e.g. ``"v1"`` or ``"v2"``.
        n_estimators: Number of trees passed to the random forest.
        max_depth: Maximum tree depth passed to the random forest.

    Returns:
        dict with keys ``rmse``, ``r2`` and ``emissions_kg``; ``emissions_kg``
        is None when the tracker did not report a value.
    """
    fs = FeatureStoreClient()
    table_name = f"workspace.mlops.athletes_features_{version}"

    # Read features from Feature Store
    features_df = fs.read_table(table_name).toPandas()

    # Read target values
    raw_df = spark.read.csv("dbfs:/Volumes/workspace/default/mlops/athletes.csv", header=True, inferSchema=True)
    target_df = raw_df.select("athlete_id", "deadlift").dropna().toPandas()

    # Join features with target (inner join: only athletes present in both)
    data = pd.merge(features_df, target_df, on="athlete_id")
    X = data.drop(columns=["athlete_id", "deadlift"])
    y = data["deadlift"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build ML pipeline
    pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
    ])

    with mlflow.start_run(run_name=f"{version}_n{n_estimators}_d{max_depth}"):
        # Track emissions around fit/predict only, and always stop the tracker
        # so a failed fit does not leave it running.
        tracker = EmissionsTracker()
        tracker.start()
        try:
            pipeline.fit(X_train, y_train)
            preds = pipeline.predict(X_test)
        finally:
            emissions = tracker.stop()

        # Take the root of the MSE explicitly: the `squared=False` keyword is
        # deprecated and no longer accepted by recent scikit-learn releases.
        rmse = float(mean_squared_error(y_test, preds) ** 0.5)
        r2 = r2_score(y_test, preds)

        # Log to MLflow
        mlflow.log_param("feature_version", version)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        if emissions is not None:
            # The tracker may report nothing; only log a real measurement.
            mlflow.log_metric("emissions_kg", emissions)
        mlflow.sklearn.log_model(pipeline, "model")

        co2_label = f"{emissions:.4f}" if emissions is not None else "n/a"
        # `version` already carries its "v" prefix, so it is printed as-is.
        print(f"✅ {version} | n={n_estimators} d={max_depth} | RMSE={rmse:.2f} | R2={r2:.2f} | CO2={co2_label} kg")

    return {"rmse": rmse, "r2": r2, "emissions_kg": emissions}

In [0]:
# Run All 4 Experiments
# Each feature version is paired with each (n_estimators, max_depth) setting,
# in the order: v1 small, v1 large, v2 small, v2 large.
configs = [
    (feature_version, n_trees, depth)
    for feature_version in ("v1", "v2")
    for n_trees, depth in ((100, 5), (200, 10))
]

for config in configs:
    train_model(*config)

[codecarbon INFO @ 03:50:52] [setup] RAM Tracking...
[codecarbon INFO @ 03:50:52] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 03:50:53] CPU Model on constant consumption mode: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
[codecarbon INFO @ 03:50:53] [setup] GPU Tracking...
[codecarbon INFO @ 03:50:53] No GPU found.
[codecarbon INFO @ 03:50:53] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 03:50:53] >>> Tracker's metadata:
[codecarbon INFO @ 03:50:53]   Platform system: Linux-5.15.0-1072-aws-x86_64-with-glibc2.35
[codecarbon INFO @ 03:50:53]   Python version: 3.11.10
[codecarbon INFO @ 03:50:53]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 03:50:53]   Available RAM : 15.334 GB
[codecarbon 

✅ vv1 | n=100 d=5 | RMSE=45177.14 | R2=0.56 | CO2=0.0000 kg


[codecarbon INFO @ 03:51:09] [setup] RAM Tracking...
[codecarbon INFO @ 03:51:09] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 03:51:10] CPU Model on constant consumption mode: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
[codecarbon INFO @ 03:51:10] [setup] GPU Tracking...
[codecarbon INFO @ 03:51:10] No GPU found.
[codecarbon INFO @ 03:51:10] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 03:51:10] >>> Tracker's metadata:
[codecarbon INFO @ 03:51:10]   Platform system: Linux-5.15.0-1072-aws-x86_64-with-glibc2.35
[codecarbon INFO @ 03:51:10]   Python version: 3.11.10
[codecarbon INFO @ 03:51:10]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 03:51:10]   Available RAM : 15.334 GB
[codecarbon 

✅ vv1 | n=200 d=10 | RMSE=48877.99 | R2=0.48 | CO2=0.0001 kg


[codecarbon INFO @ 03:51:39] [setup] RAM Tracking...
[codecarbon INFO @ 03:51:39] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 03:51:40] CPU Model on constant consumption mode: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
[codecarbon INFO @ 03:51:40] [setup] GPU Tracking...
[codecarbon INFO @ 03:51:40] No GPU found.
[codecarbon INFO @ 03:51:40] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 03:51:40] >>> Tracker's metadata:
[codecarbon INFO @ 03:51:40]   Platform system: Linux-5.15.0-1072-aws-x86_64-with-glibc2.35
[codecarbon INFO @ 03:51:40]   Python version: 3.11.10
[codecarbon INFO @ 03:51:40]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 03:51:40]   Available RAM : 15.334 GB
[codecarbon 

✅ vv2 | n=100 d=5 | RMSE=71165.05 | R2=-0.10 | CO2=0.0000 kg


[codecarbon INFO @ 03:51:56] [setup] RAM Tracking...
[codecarbon INFO @ 03:51:56] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 03:51:57] CPU Model on constant consumption mode: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
[codecarbon INFO @ 03:51:57] [setup] GPU Tracking...
[codecarbon INFO @ 03:51:57] No GPU found.
[codecarbon INFO @ 03:51:57] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 03:51:57] >>> Tracker's metadata:
[codecarbon INFO @ 03:51:57]   Platform system: Linux-5.15.0-1072-aws-x86_64-with-glibc2.35
[codecarbon INFO @ 03:51:57]   Python version: 3.11.10
[codecarbon INFO @ 03:51:57]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 03:51:57]   Available RAM : 15.334 GB
[codecarbon 

✅ vv2 | n=200 d=10 | RMSE=71368.59 | R2=-0.11 | CO2=0.0001 kg
