In [0]:
from pyspark.sql import SparkSession
import pandas as pd
# Attach to the active Spark session, or start one if none exists yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Read the labeled step-test table and materialize it as a pandas DataFrame
# (toPandas() collects every row onto the driver).
df_spark = spark.table("labeled_step_test")
df = df_spark.toPandas()

# Preview the first rows as the cell output.
df.head()

In [0]:
# Column configuration for the feature pipeline.

# Numeric inputs.
feature_cols_numeric = [
    "distance_cm",
]

# Categorical inputs.
feature_cols_categorical = [
    "sensor_type",
    "device_id",
]

# Target column.
label_col = "step_label"

In [0]:
from sklearn.model_selection import train_test_split
# Feature matrix: numeric columns first, then categorical columns.
X = df[[*feature_cols_numeric, *feature_cols_categorical]]
# Target vector.
y = df[label_col]

# 80/20 split, seeded for reproducibility and stratified on the label so both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Standardize numeric features: subtract the mean and divide by the standard
# deviation (both are StandardScaler's defaults, spelled out here).
numeric_transformer = StandardScaler(with_mean=True, with_std=True)

In [0]:
# One-hot encode categorical features. With handle_unknown="ignore", a category
# that was not seen during fit is encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
)

In [0]:
# Route each column group to its transformer. Columns not listed in either
# group are dropped (ColumnTransformer's default, made explicit here).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, feature_cols_numeric),
        ("cat", categorical_transformer, feature_cols_categorical),
    ],
    remainder="drop",
)

In [0]:
from sklearn.pipeline import Pipeline
# Single-step pipeline wrapping the column preprocessor, so the whole feature
# transformation can be fitted, applied, and serialized as one object.
pipeline = Pipeline([("preprocess", preprocessor)])

In [0]:
# Fit on the training split only (so no statistics leak from the test split)
# and transform it in the same call.
X_train_transformed = pipeline.fit_transform(X_train)

# Apply the already-fitted pipeline to the held-out split.
X_test_transformed = pipeline.transform(X_test)

In [0]:
import joblib
import os

# 1. Define a path that works in Databricks (using the /tmp/ folder is easiest for labs).
#    NOTE(review): /tmp is presumably driver-local, ephemeral storage -- confirm the
#    artifacts do not need to outlive the cluster before relying on this location.
base_path = "/tmp/etl_pipeline/"

# 2. Create the directory so you don't get the 'No such file' error.
#    exist_ok=True keeps the cell safe to re-run and avoids the race between
#    checking for the directory and creating it.
directory_existed = os.path.isdir(base_path)
os.makedirs(base_path, exist_ok=True)
if not directory_existed:
    print(f"Created directory: {base_path}")

# 3. Save every artifact under base_path. The mapping is file name -> object,
#    so adding or renaming an output is a one-line change.
artifacts = {
    "stedi_feature_pipeline.pkl": pipeline,
    "X_train_transformed.pkl": X_train_transformed,
    "X_test_transformed.pkl": X_test_transformed,
    "y_test.pkl": y_test,
    "y_train.pkl": y_train,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(base_path, filename))

# Report the directory actually used, so the message stays correct if base_path changes.
print(f"All files saved successfully to {base_path}!")

## Ethics Reflection

Using a consistent, reproducible feature pipeline helps prevent unfairness by ensuring that every data point, regardless of its source, is treated with the exact same mathematical logic. In machine learning, "hidden bias" often creeps in when we process different groups of data inconsistently, but a pipeline locks our preprocessing (like scaling and encoding) into a stable standard. This technical consistency mirrors the spiritual principle of equity, as taught in the scriptures: God is "no respecter of persons" (Acts 10:34) and operates by unchanging laws. By building reliable pipelines, we ensure our models do not favor certain device types or demographics due to sloppy or varied data handling. Just as consistent spiritual habits build a stable foundation, consistent data habits build trustworthy and fair AI systems.