In [0]:
# Install the databricks-feature-engineering package
try:
    import databricks.feature_engineering
    print("Package already installed.")
except ImportError:
    print("Installing package...")
    %pip install databricks-feature-engineering
    dbutils.library.restartPython()

In [0]:
# Note: Use ML-supported clusters (labeled as "X.Y LTS ML" e.g., "13.3 LTS ML") for machine learning workloads.

from pyspark.sql.functions import avg, sum, count, when, current_date, datediff, col, split, regexp_replace, expr
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from databricks.feature_engineering import FeatureEngineeringClient

In [0]:
# Read the three demo source files: customers, transactions and interactions.
# CSV is used here for the demo only; typically the data would come from tables.
# No schema is passed and schema inference is not enabled, so the columns are
# loaded as strings and later numeric functions rely on Spark's implicit casts.
base_file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/feature_eng/data_files/"
customers_df = spark.read.csv(f"{base_file_path}customers.csv", header=True)
transactions_df = spark.read.csv(f"{base_file_path}transactions.csv", header=True)
interactions_df = spark.read.csv(f"{base_file_path}interactions.csv", header=True)

# Preview the first five rows of every DataFrame to get a feel for the data
for preview_title, preview_df in (
    ("Customers Data:", customers_df),
    ("Transactions Data:", transactions_df),
    ("Interactions Data:", interactions_df),
):
    print(preview_title)
    preview_df.limit(5).display()


In [0]:
# Function to rename columns to be consistent and compatible with various ML algorithms and Databricks Feature Store
# Define Function
def renameColumns(df):
    """Return ``df`` with spaces, slashes and hyphens removed from its column names.

    Every ' ', '/' and '-' in a column name is replaced with '_'. All columns
    are renamed positionally in one ``toDF`` call, so the query plan gains a
    single projection regardless of the number of columns, and renaming one
    column can never affect another column with a similar name.

    Args:
        df: DataFrame (any object exposing ``columns`` and ``toDF``) to rename.

    Returns:
        A new DataFrame with the same data and column order, carrying the
        cleaned column names. ``df`` itself is left unchanged.
    """
    # One translation table covers all three characters in a single pass per name.
    to_underscore = str.maketrans({" ": "_", "/": "_", "-": "_"})
    clean_names = [name.translate(to_underscore) for name in df.columns]
    return df.toDF(*clean_names)


# Apply the column-name cleanup to each source DataFrame
customers_df, transactions_df, interactions_df = [
    renameColumns(source_df)
    for source_df in (customers_df, transactions_df, interactions_df)
]

In [0]:

# Impute missing age / credit_score values with the mean of the customers that
# do have a value. Both means are computed in one aggregation pass, and int()
# truncates them to whole numbers before they are used as fill values.
mean_row = customers_df.select(
    avg("age").alias("age"),
    avg("credit_score").alias("credit_score"),
).first()

# avg() yields NULL when a column has no values to average (empty or all-NULL
# input); fail with a clear message instead of a TypeError from int(None).
empty_columns = [name for name in ("age", "credit_score") if mean_row[name] is None]
if empty_columns:
    raise ValueError(
        f"Cannot impute {empty_columns}: customers_df has no non-null values to average"
    )

avg_age = int(mean_row["age"])
avg_score = int(mean_row["credit_score"])

customers_df = customers_df.fillna({
    "age": avg_age,
    "credit_score": avg_score
})

# Skip rows where an important field such as income_range is missing
customers_df = customers_df.filter(customers_df["income_range"].isNotNull())

In [0]:

# Per-customer transaction aggregates: total amount spent and number of transactions.
# sum / count are the pyspark.sql.functions versions imported at the top of the
# notebook, not the Python builtins.
txn_agg = transactions_df.groupBy("customer_id").agg(
    sum("amount").alias("total_spent"),
    count("transaction_id").alias("txn_count")
)

# The left join keeps customers that have no transactions. Only the two new
# aggregate columns are zero-filled for them, so NULLs in unrelated numeric
# columns are never silently overwritten by a blanket fillna(0).
customers_features = (
    customers_df
    .join(txn_agg, on="customer_id", how="left")
    .fillna(0, subset=["total_spent", "txn_count"])
)

In [0]:

# Per-customer interaction aggregates: average satisfaction score and the number
# of interactions whose reason is exactly "Complaint".
interaction_agg = interactions_df.groupBy("customer_id").agg(
    avg("satisfaction_score").alias("avg_satisfaction"),
    sum(when(interactions_df.reason == "Complaint", 1).otherwise(0)).alias("complaint_count")
)

# The left join keeps customers that have no interactions. Only the two new
# aggregate columns are zero-filled, so NULLs in unrelated numeric columns are
# never silently overwritten by a blanket fillna(0).
# NOTE(review): an avg_satisfaction of 0 for customers without interactions is
# indistinguishable from a genuinely low score — confirm this is intended.
customers_features = (
    customers_features
    .join(interaction_agg, on="customer_id", how="left")
    .fillna(0, subset=["avg_satisfaction", "complaint_count"])
)

In [0]:
# Derive income_avg as the midpoint of the two bounds in income_range.
# Example: "2000-4000" becomes 3000 ((2000 + 4000) / 2)
# Commas are stripped from each bound before it is cast to double.
income_bounds = split(col("income_range"), "-")
income_low = regexp_replace(income_bounds.getItem(0), ",", "").cast("double")
income_high = regexp_replace(income_bounds.getItem(1), ",", "").cast("double")

customers_features = (
    customers_features
    .withColumn("income_avg", (income_low + income_high) / 2)
    .drop("income_range")
)

In [0]:
# customer_tenure = number of days between join_date and the current date.
# The value therefore depends on the day the notebook is run.
tenure_days = datediff(current_date(), col("join_date"))
customers_features = (
    customers_features
    .withColumn("customer_tenure", tenure_days)
    .drop("join_date")
)

In [0]:
# Remove the columns that carry no signal for the prediction being made
drop_cols = [
    "first_name",
    "last_name",
    "email",
]
customers_features = customers_features.drop(*drop_cols)

In [0]:
# Ordinal encoding — used when the categories have a natural order.
# card_tier: Basic (0) < Silver (1) < Gold (2) < Platinum (3).
# Any other value (including NULL) is encoded as NULL.
tier = col("card_tier")
tier_rank = (
    when(tier == "Basic", 0)
    .when(tier == "Silver", 1)
    .when(tier == "Gold", 2)
    .when(tier == "Platinum", 3)
    .otherwise(None)
)

customers_features = (
    customers_features
    .withColumn("card_tier_encoded", tier_rank.cast("double"))
    .drop("card_tier")
)

In [0]:
# One-hot encoding — used when the categories have no meaningful order.
# gender (Male/Female/Other) becomes three 0.0/1.0 indicator columns; a value
# outside those three labels gets 0.0 in every indicator.
gender = col("gender")

customers_features = (
    customers_features
    .withColumn("gender_female", when(gender == "Female", 1).otherwise(0).cast("double"))
    .withColumn("gender_male", when(gender == "Male", 1).otherwise(0).cast("double"))
    .withColumn("gender_other", when(gender == "Other", 1).otherwise(0).cast("double"))
    .drop("gender")
)

display(customers_features.limit(3))

In [0]:
# Scale numeric features like age (~0-100) and income (~10000s) to a similar scale
# to avoid ML giving more weight to larger-scale features (e.g., income).

columns_to_scale = [
    "age", "credit_score", "income_avg", "customer_tenure",
    "total_spent", "txn_count", "avg_satisfaction", "complaint_count"
]

# Cast every column to float in a single projection. withColumns adds one plan
# node for all columns, whereas calling withColumn in a loop adds one per column.
customers_features = customers_features.withColumns(
    {name: col(name).cast("float") for name in columns_to_scale}
)

# NOTE(review): VectorAssembler is created without handleInvalid, so a NULL in any
# of these columns (e.g. income_avg when income_range has no '-') is expected to
# fail the pipeline — confirm the inputs are non-null at this point.
assembler = VectorAssembler(inputCols=columns_to_scale, outputCol="features_vector")

# withStd=True / withMean=False: values are divided by their standard deviation
# but are not centered around zero.
scaler = StandardScaler(inputCol="features_vector", outputCol="scaled_vector", withStd=True, withMean=False)

pipeline = Pipeline(stages=[assembler, scaler])
model = pipeline.fit(customers_features)
customers_features_scaled = model.transform(customers_features)

# Convert vector column to array column using built-in function (faster, no udf)
customers_features_scaled = customers_features_scaled.withColumn(
    "scaled_array",
    vector_to_array("scaled_vector")
)

# Expand the array into one scaled_<name> column per feature, again in a single
# projection; position i in the array matches position i in columns_to_scale.
customers_features_scaled = customers_features_scaled.withColumns(
    {
        f"scaled_{name}": col("scaled_array")[position]
        for position, name in enumerate(columns_to_scale)
    }
)

display(customers_features_scaled.limit(3))

In [0]:
# Columns kept in the final feature set. The target label (is_churn) is
# deliberately left out.
final_cols = [
    "customer_id",
    "card_tier_encoded",
    "gender_female", "gender_male", "gender_other"
]

# Names of the individual scaled columns, in the same order as columns_to_scale
scaled_cols = ["scaled_" + name for name in columns_to_scale]

# customer_id + encoded categorical features + every scaled numeric feature
customers_features_final = customers_features_scaled.select(*final_cols, *scaled_cols)


In [0]:

# Make sure the target schema exists before touching the feature table
spark.sql("CREATE SCHEMA IF NOT EXISTS oms_analytics.feature_store")

fe = FeatureEngineeringClient()

feature_table_name = "oms_analytics.feature_store.customers_features"

# Best-effort drop so the table can be recreated from scratch. A failure here
# (for example because the table does not exist yet) is reported instead of being
# silently swallowed, and catching Exception rather than using a bare except
# keeps KeyboardInterrupt / SystemExit working.
try:
    fe.drop_table(name=feature_table_name)
except Exception as drop_error:
    print(f"Skipping drop of {feature_table_name}: {drop_error}")

# Register the final feature DataFrame as a feature table keyed by customer_id
feature_table_description = """
    Preprocessed customer features for:
    - Churn prediction (Identify customers who might leave) 
    - Lifetime value (LTV) modeling 
    - Customer segmentation

    Includes: demographics, financials, engagement metrics, and satisfaction scores.
    Primary key: customer_id (unique identifier)        
    """

fe.create_table(
    name=feature_table_name,
    primary_keys=["customer_id"],
    df=customers_features_final,
    description=feature_table_description,
)

In [0]:
# Read the stored feature table back and show its first five rows
feature_table_preview = spark.table("oms_analytics.feature_store.customers_features").limit(5)
display(feature_table_preview)