In [0]:
# Base location of the input CSV files; every read below is built from this prefix
base_file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/feature_eng/data_files/"

# Read the three source CSV files, using the first line of each file as the column names
customers_df = spark.read.option("header", True).csv(base_file_path + "customers.csv")
transactions_df = spark.read.option("header", True).csv(base_file_path + "transactions.csv")
interactions_df = spark.read.option("header", True).csv(base_file_path + "interactions.csv")

# Preview the first 5 rows of each DataFrame, each under its own heading
for heading, preview_source in (
    ("Customers Data:", customers_df),
    ("Transactions Data:", transactions_df),
    ("Interactions Data:", interactions_df),
):
    print(heading)
    preview_source.limit(5).display()


In [0]:
# Function to rename columns to be consistent and compatible with various ML algorithms and Databricks Feature Store
def renameColumns(df):
    """Return ``df`` with spaces, slashes and hyphens in column names replaced by ``_``.

    All columns are renamed positionally in a single ``toDF`` call. This avoids
    stacking one ``withColumnRenamed`` projection per column, and it does not rely
    on name resolution (which is case-insensitive by default in Spark), so two
    columns whose names differ only by case cannot be merged into one name.

    Args:
        df: A DataFrame-like object exposing ``columns`` and ``toDF(*names)``.

    Returns:
        A new DataFrame with the same data and column order, and cleaned names.
        Names that contain none of the three characters are left unchanged.
    """
    # Single translation table for the three characters that get replaced
    to_underscore = str.maketrans({" ": "_", "/": "_", "-": "_"})
    clean_names = [column.translate(to_underscore) for column in df.columns]
    return df.toDF(*clean_names)


# Run the renaming function on each source DataFrame and rebind the cleaned results
customers_df, transactions_df, interactions_df = (
    renameColumns(source_df)
    for source_df in (customers_df, transactions_df, interactions_df)
)

In [0]:
# Imports
from pyspark.sql.functions import avg, col

# The CSV files are read without a schema, so these columns arrive as strings.
# fillna() never changes a column's type, so cast to double first; otherwise
# age and credit_score would still be string-typed after imputation.
customers_df = (
    customers_df
    .withColumn("age", col("age").cast("double"))
    .withColumn("credit_score", col("credit_score").cast("double"))
)

# Calculate the average values for age and credit_score to be used where they are missing
# (one aggregation instead of two separate Spark jobs)
averages = customers_df.agg(
    avg("age").alias("age"),
    avg("credit_score").alias("credit_score"),
).first()
avg_age = averages["age"]
avg_score = averages["credit_score"]

# avg() returns None when a column has no non-null values, and None is not a valid
# fillna() replacement, so only impute the columns that actually have an average.
fill_values = {
    name: value
    for name, value in (("age", avg_age), ("credit_score", avg_score))
    if value is not None
}

# Fill null values for age and credit_score with their respective averages
if fill_values:
    customers_df = customers_df.fillna(fill_values)

# Skip rows where income_range is null
customers_df = customers_df.filter(customers_df["income_range"].isNotNull())

In [0]:
from pyspark.sql.functions import sum as spark_sum, count

# Per-customer transaction aggregates: total amount and number of transactions
txn_agg = transactions_df.groupBy("customer_id").agg(
    spark_sum("amount").alias("total_spent"),
    count("transaction_id").alias("txn_count"),
)

# Left join keeps customers that have no transactions; their aggregates come back
# null and are set to 0. The fill is limited to the two aggregate columns so that
# nulls in other numeric customer attributes are never silently turned into 0.
customers_features = customers_df.join(txn_agg, on="customer_id", how="left").fillna(
    0, subset=["total_spent", "txn_count"]
)

In [0]:
# NOTE(review): importing `sum` here shadows Python's builtin sum() for the rest of
# the notebook. It is kept as-is in case later cells rely on it; prefer the
# `spark_sum` alias in new code.
from pyspark.sql.functions import avg, sum, when

# Per-customer interaction aggregates: mean satisfaction score and the number of
# interactions whose reason is exactly "Complaint"
interaction_agg = interactions_df.groupBy("customer_id").agg(
    avg("satisfaction_score").alias("avg_satisfaction"),
    sum(when(interactions_df.reason == "Complaint", 1).otherwise(0)).alias("complaint_count"),
)

# Start from `customers_features` (the result of the transaction join). The previous
# code read `customers_features_df` on the right-hand side before it was ever
# defined, which raises NameError on a fresh run.
# Customers with no interactions get 0 for both new columns; the fill is limited
# to those columns so no other attribute is altered.
customers_features_df = customers_features.join(
    interaction_agg, on="customer_id", how="left"
).fillna(0, subset=["avg_satisfaction", "complaint_count"])