# Note: Use ML-supported clusters (labeled as "X.Y LTS ML" e.g., "13.3 LTS ML") for machine learning workloads.

In [0]:
# Import required libraries
from pyspark.sql.functions import current_date, year, col
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array

In [0]:
# Load Housing Data from CSV
# base_file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/feature_eng/data_files/"
# housing_df = spark.read.csv(base_file_path + "housing.csv", header=True, inferSchema=True)

# Display first 5 rows for understanding
# print("Housing Data Preview:")
# housing_df.limit(5).display()

In [0]:
# Fallback: build the housing dataset in memory when reading from CSV is not working
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType

# Column layout as (name, Spark type) pairs, in the same order as the values in each row
_housing_columns = [
    ("House_ID", IntegerType()),
    ("Square_Feet", DoubleType()),
    ("Num_Bedrooms", IntegerType()),
    ("Num_Bathrooms", IntegerType()),
    ("Num_Floors", IntegerType()),
    ("Year_Built", IntegerType()),
    ("Has_Garden", IntegerType()),
    ("Has_Pool", IntegerType()),
    ("Garage_Size", IntegerType()),
    ("Location_Score", DoubleType()),
    ("Distance_to_Center", DoubleType()),
    ("Price", DoubleType()),
]

# Every column is declared nullable
schema = StructType(
    [StructField(column, spark_type, True) for column, spark_type in _housing_columns]
)

# Sample rows (append more tuples as needed)
data = [
    (1, 143.63502971184062, 1, 3, 3, 1967, 1, 1, 48, 8.297631202876449, 5.935733640397012, 602134.816746586),
    (2, 287.67857660247904, 1, 2, 1, 1949, 0, 1, 37, 6.061465649334798, 10.827392203145374, 591425.1353862194),
    (3, 232.99848545285127, 1, 3, 2, 1923, 1, 0, 14, 2.9114424778517902, 6.904599073399449, 464478.6968798775),
    (4, 199.66462104925915, 5, 2, 2, 1918, 0, 0, 17, 2.0709491817657124, 8.284018511436607, 583105.655996478),
    (5, 89.00466011060914, 4, 3, 3, 1999, 1, 0, 34, 1.523277856626788, 14.648277296253372, 619879.1425227895),
]

# Materialise the rows as a Spark DataFrame using the explicit schema
housing_df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame in the notebook output
display(housing_df)


In [0]:
# Build the feature-only frame: remove the target label column ("Price") first,
# then discard every row that has a null in any of the remaining columns
features_df = housing_df.drop("Price").dropna()

In [0]:
# Numeric columns to standardise. "House_Age" is not part of the raw housing
# schema; it is derived from "Year_Built" just below.
numeric_cols = ["Square_Feet", "Num_Bedrooms", "Num_Bathrooms", "Num_Floors",
                "Garage_Size", "Location_Score", "Distance_to_Center", "House_Age"]

# Derive House_Age (current calendar year minus Year_Built) so that every name in
# numeric_cols exists in the input; VectorAssembler fails on a missing input column.
# NOTE: the value depends on the current date, so it changes when the notebook
# is re-run in a later year.
housing_with_age = housing_df.withColumn(
    "House_Age", year(current_date()) - col("Year_Built")
)

# NOTE(review): the pipeline is fitted on housing_df (plus House_Age), not on the
# null-free features_df built in the earlier cell. VectorAssembler's default
# handleInvalid="error" raises if a numeric column contains null -- confirm
# which frame is intended as the input.

# Pack the numeric columns into one vector, then centre and scale to unit std
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features_vec")
scaler = StandardScaler(inputCol="features_vec", outputCol="scaled_features", withMean=True, withStd=True)

pipeline = Pipeline(stages=[assembler, scaler])
scaler_model = pipeline.fit(housing_with_age)
housing_df_scaled = scaler_model.transform(housing_with_age)

# Convert the ML vector into a plain array column so elements can be indexed
housing_df_scaled = housing_df_scaled.withColumn("scaled_features_array", vector_to_array("scaled_features"))

# Add scaled columns back: array position i corresponds to numeric_cols[i]
for i, col_name in enumerate(numeric_cols):
    housing_df_scaled = housing_df_scaled.withColumn(f"scaled_{col_name}", col("scaled_features_array")[i])

In [0]:
# Columns kept for the feature store: the House_ID key, the Has_Garden / Has_Pool
# columns as-is, and the scaled counterpart of every numeric column
final_cols = ["House_ID", "Has_Garden", "Has_Pool"]
final_cols += [f"scaled_{name}" for name in numeric_cols]

housing_features_final = housing_df_scaled.select(*final_cols)

# Preview the first five rows of the final feature table
print("Final Features for Feature Store:")
housing_features_final.limit(5).display()

In [0]:
# Save Features to Databricks Feature Store

# Imported in this cell because nothing else in the notebook brings
# FeatureEngineeringClient into scope (it was previously an undefined name).
from databricks.feature_engineering import FeatureEngineeringClient

fe = FeatureEngineeringClient()

# presumably a Unity Catalog three-level name (<catalog>.<schema>.<table>) -- confirm
feature_table_name = "housing.feature_store.housing_features"

# Drop existing table (if any). This stays best-effort: a failure (for example
# because the table does not exist yet) must not stop the notebook, but the
# reason is reported instead of being silently discarded. Catching Exception
# rather than using a bare except lets KeyboardInterrupt/SystemExit propagate.
try:
    fe.drop_table(name=feature_table_name)
except Exception as exc:
    print(f"Skipping drop of {feature_table_name}: {exc}")

# Create feature table keyed by House_ID and populate it from the final features
fe.create_table(
    name=feature_table_name,
    primary_keys=["House_ID"],
    df=housing_features_final,
    description="Housing features for price prediction"
)
