In [0]:
# Install the databricks-feature-engineering package
# The import below is used as an availability probe: if it succeeds nothing is
# installed; if it raises ImportError the package is installed with %pip and
# the Python process is restarted through dbutils.
try:
    import databricks.feature_engineering
    print("Package already installed.")
except ImportError:
    print("Installing package...")
    # NOTE(review): the install is unpinned, so the resolved version can differ
    # between runs — consider pinning a version.
    %pip install databricks-feature-engineering
    # Presumably needed so the freshly installed package becomes importable;
    # Python state defined before this call is presumably discarded — confirm.
    dbutils.library.restartPython()

In [0]:
# Note: Use ML-supported clusters (labeled as "X.Y LTS ML" e.g., "13.3 LTS ML") for machine learning workloads.
# Import required libraries
from pyspark.sql.functions import current_date, year, col
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from databricks.feature_engineering import FeatureEngineeringClient

In [0]:
# Read the housing CSV into a Spark DataFrame, treating the first line as the
# header and letting Spark infer the column types.
# NOTE(review): the base path is an absolute, user-specific workspace location;
# consider moving it to a configuration cell.
base_file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/model_evaluation/regression/data_files/"
housing_csv_path = base_file_path + "housing.csv"
housing_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(housing_csv_path)
)

# Show a five-row sample so the schema and values can be inspected.
print("Housing Data Preview:")
housing_df.limit(5).display()

In [0]:
# Drop rows that have a null in any feature column.
#
# The target label ("Price") is left out of the null check, so a missing label
# alone does not discard a row. The filter is applied to housing_df itself
# because that is the DataFrame the later cells transform; previously only the
# unused features_df copy was cleaned, so null rows were never removed from the
# data that is actually assembled and scaled. dropna is idempotent, so
# re-running this cell is safe.
feature_cols = housing_df.drop("Price").columns
housing_df = housing_df.dropna(subset=feature_cols)

# Feature-only view (label removed, no nulls), kept under its original name.
features_df = housing_df.drop("Price")

In [0]:
# Derive House_Age as the current calendar year minus Year_Built.
# The year comes from current_date(), so the value depends on the date at which
# the query is evaluated.
current_year = year(current_date())
housing_df = housing_df.withColumn("House_Age", current_year - col("Year_Built"))

In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array

# Columns to scale
numeric_cols = ["Square_Feet", "Num_Bedrooms", "Num_Bathrooms", "Num_Floors",
                "Garage_Size", "Location_Score", "Distance_to_Center", "House_Age"]

# Cast all numeric columns to float. Casting an already-float column is a
# no-op, so this reassignment stays safe when the cell is run again.
for c in numeric_cols:
    housing_df = housing_df.withColumn(c, col(c).cast("float"))

# Step 1: Assemble the numeric columns into a single vector column.
# The result is stored under a new name instead of overwriting housing_df:
# VectorAssembler refuses to transform a DataFrame that already contains its
# output column, so overwriting housing_df made this cell fail on a second run.
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features_vector")
housing_df_assembled = assembler.transform(housing_df)

# Step 2: Fit and apply StandardScaler.
# withStd=True / withMean=False: each feature is divided by its standard
# deviation but is not centered.
scaler = StandardScaler(inputCol="features_vector", outputCol="scaled_vector", withStd=True, withMean=False)
scaler_model = scaler.fit(housing_df_assembled)
housing_df_scaled = scaler_model.transform(housing_df_assembled)

# Step 3: Convert the scaled vector to an array so elements can be indexed.
housing_df_scaled = housing_df_scaled.withColumn("scaled_array", vector_to_array("scaled_vector"))

# Step 4: Create one scaled_<name> column per input column. Array position i
# corresponds to numeric_cols[i] because the assembler was given that order.
# A single select adds all columns in one projection rather than one
# withColumn call per column.
scaled_col_names = [f"scaled_{c}" for c in numeric_cols]
housing_df_scaled = housing_df_scaled.select(
    "*",
    *[col("scaled_array")[i].alias(name) for i, name in enumerate(scaled_col_names)],
)

# Display final result
display(housing_df_scaled.select("House_ID", *scaled_col_names).limit(3))


In [0]:
# Columns written to the feature store: the House_ID key, the two Has_* flag
# columns, and the scaled_<name> column for every entry of numeric_cols.
id_and_flag_cols = ["House_ID", "Has_Garden", "Has_Pool"]
scaled_feature_cols = ["scaled_" + name for name in numeric_cols]
final_cols = id_and_flag_cols + scaled_feature_cols

housing_features_final = housing_df_scaled.select(*final_cols)

# Preview the first five rows of the table that will be saved.
print("Final Features for Feature Store:")
housing_features_final.limit(5).display()

In [0]:
# Save Features to Databricks Feature Store
#
# Creates the realestate.ml schema if it is missing, removes any previous copy
# of the feature table, then creates the table from housing_features_final with
# House_ID as the primary key.

fe = FeatureEngineeringClient()

spark.sql("CREATE SCHEMA IF NOT EXISTS realestate.ml")

feature_table_name = "realestate.ml.housing_features"

# Drop existing table (if any). This stays best-effort: a failure here
# (presumably most often because the table does not exist yet) must not stop
# the cell. The error is reported instead of being silently discarded, and only
# Exception is caught so KeyboardInterrupt / SystemExit still propagate.
try:
    fe.drop_table(name=feature_table_name)
except Exception as drop_error:
    print(f"Skipping drop of {feature_table_name}: {drop_error}")

# Create feature table
fe.create_table(
    name=feature_table_name,
    primary_keys=["House_ID"],
    df=housing_features_final,
    description="Housing features for price prediction"
)


In [0]:
# Read the saved feature table back and show its first five rows.
stored_features = spark.table("realestate.ml.housing_features")
display(stored_features.limit(5))