In [0]:
# Install the databricks-feature-engineering package if not already installed
try:
    import databricks.feature_engineering
    print("Package already installed.")
except ImportError:
    print("Installing package...")
    %pip install databricks-feature-engineering
    dbutils.library.restartPython()

In [0]:
# Import necessary libraries
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.ml.feature import StringIndexer
import warnings

In [0]:
# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")

# Read the housing-loan CSV: the first row is the header and column types
# are inferred from the data.
file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/model_evaluation/classification/data_files/housing_loan.csv"
loan_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# Preview the first five rows.
display(loan_df.limit(5))

In [0]:
# Build the feature frame: remove the "IsDefault" target column, then discard
# every row that still holds a null in any of the remaining columns.
features_df = loan_df.drop("IsDefault").dropna()

In [0]:


# Categorical columns that are replaced by numeric label indices.
Categoricals = ['HasCoSigner', 'HasDependents', 'HasMortgage',
                'MaritalStatus', 'EmploymentType', 'Education']

# Index only the columns that do not have a "<name>_idx" output yet, so that
# re-running this cell on an already-indexed features_df is a no-op instead of
# failing on the dropped input columns. A column that is genuinely missing
# still raises from StringIndexer.
pending = [name for name in Categoricals
           if f"{name}_idx" not in features_df.columns]

if pending:
    # One multi-column StringIndexer replaces one fit/transform per column;
    # each column is still indexed independently into "<name>_idx".
    indexer = StringIndexer(
        inputCols=pending,
        outputCols=[f"{name}_idx" for name in pending],
    )
    features_df = indexer.fit(features_df).transform(features_df)

# Drop the original string columns; the indexed values stay available under
# their "<name>_idx" names (they are not renamed back).
features_df = features_df.drop(*Categoricals)

features_df.limit(5).display()

In [0]:
# Save the prepared features to the feature store.
fe = FeatureEngineeringClient()

feature_table_name = "realestate.ml.loan_features"

# Best-effort removal of a previous version of the table. The drop stays
# non-fatal (e.g. when the table does not exist yet), but the reason is
# reported instead of being silently swallowed, and KeyboardInterrupt /
# SystemExit are no longer caught.
try:
    fe.drop_table(name=feature_table_name)
except Exception as exc:
    print(f"Skipping drop of {feature_table_name}: {exc}")

# Create the feature table from features_df, keyed by LoanID.
fe.create_table(
    name=feature_table_name,
    primary_keys=["LoanID"],
    df=features_df,
    description="Processed housing loan features for classification"
)


In [0]:
# Confirm the save by reading the table back and previewing five rows.
# Reads the name from feature_table_name (set in the save cell) so this check
# always targets the table that was just written.
display(spark.table(feature_table_name).limit(5))