In [0]:
# Import required libraries
from pyspark.sql.functions import col, mean, sum, desc
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# ---------------------------------
# Step 1: Load Silver Layer Data
# ---------------------------------
# Both inputs are Delta tables, which carry their own schema. The CSV reader
# options `inferSchema` and `header` that used to be set here have no effect
# on a Delta source and wrongly suggested the data was parsed from text, so
# they are no longer passed.
train_df = spark.read.format('delta').load('dbfs:/mnt/walmartsilver/Final Silver/Train Silver/')
test_df = spark.read.format('delta').load('dbfs:/mnt/walmartsilver/Final Silver/Test Silver/')


In [0]:
# Columns that are converted to double precision in both datasets
columns_to_cast = [
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
    "Temperature",
    "Fuel_Price",
    "Size",
]

# Apply the same cast to the training and the test frame, one column at a time
for name in columns_to_cast:
    as_double = col(name).cast("double")
    train_df = train_df.withColumn(name, as_double)
    test_df = test_df.withColumn(name, as_double)

# Print both schemas so the resulting data types can be checked by eye
for heading, frame in (
    ("Training Data Schema:", train_df),
    ("Testing Data Schema:", test_df),
):
    print(heading)
    frame.printSchema()

In [0]:
# ---------------------------------
# Step 2: Prepare Data for Modeling
# ---------------------------------
# Predictor columns, in the order they are packed into the feature vector
feature_columns = [
    "Temperature",
    "Fuel_Price",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Unemployment",
    "Size",
    "IsHoliday",
]

# A single assembler is shared so train and test get identical feature vectors
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Training set: feature vector plus the label
train_assembled = assembler.transform(train_df)
train_data = train_assembled.select("features", "Weekly_Sales")
train_data.show(5)

# Test set: feature vector plus the columns carried through to the final output
test_assembled = assembler.transform(test_df)
test_data = test_assembled.select("features", "Store", "Dept", "Date", "IsHoliday")
test_data.show(5)

In [0]:
# ---------------------------------
# Step 3: Train the Regression Model
# ---------------------------------
# Configure a linear regression on the assembled features
lr = LinearRegression(
    featuresCol="features",
    labelCol="Weekly_Sales",
    predictionCol="prediction",
)

# Fit on the full training set
lr_model = lr.fit(train_data)

# Report the fit statistics of the training summary (computed on the training data)
training_summary = lr_model.summary
print("Model Training Summary:")
for metric_label, metric_value in (
    ("RMSE", training_summary.rootMeanSquaredError),
    ("R2", training_summary.r2),
):
    print(f"{metric_label}: {metric_value}")

In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# ---------------------------------
# Split train data into training and validation sets
# ---------------------------------
# 80/20 split with a fixed seed so the same rows land in each subset on re-run
train_subset, validation_subset = train_data.randomSplit([0.8, 0.2], seed=42)

# Initialize Random Forest Regressor.
# The forest samples rows and features at random; without an explicit seed the
# fitted model (and every metric and prediction derived from it) changes from
# run to run even though the split above is seeded. Pin it as well.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Weekly_Sales",
    numTrees=50,
    maxDepth=10,
    seed=42,
)

# Train the model on train subset
rf_model = rf.fit(train_subset)

# Predict on validation set
validation_predictions = rf_model.transform(validation_subset)

# Evaluate the model on the held-out validation data
evaluator_rmse = RegressionEvaluator(labelCol="Weekly_Sales", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="Weekly_Sales", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(validation_predictions)
r2 = evaluator_r2.evaluate(validation_predictions)

print("Random Forest Model Evaluation on Validation Set:")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

In [0]:
# ---------------------------------
# Predict on test dataset (without Weekly_Sales)
# ---------------------------------
# Score the test rows with the random forest fitted above
test_predictions = rf_model.transform(test_data)

# Keep the identifying columns and expose the model output under a descriptive name
predicted_sales = col("prediction").alias("Predicted_Weekly_Sales")
final_predictions = test_predictions.select(
    "Store",
    "Dept",
    "Date",
    "IsHoliday",
    predicted_sales,
)

# Preview the first rows
final_predictions.show(10)

In [0]:
from pyspark.sql.functions import col, when, lit

# ---------------------------
# Step 1: Simulate Current Stock Levels
# ---------------------------
# Simulation parameters, named so they can be tuned in one place.
# Both values are arbitrary and for demonstration only.
INITIAL_STOCK_UNITS = 500      # stock assumed on hand for every prediction row
REORDER_THRESHOLD_UNITS = 100  # remaining stock below this triggers a reorder

# Assign the same simulated stock level to every row
current_stock_df = final_predictions.withColumn("Current_Stock", lit(INITIAL_STOCK_UNITS))

# ---------------------------
# Step 2: Calculate Remaining Stock and Reorder Flag
# ---------------------------
# Subtract predicted weekly sales from current stock to estimate remaining stock.
# NOTE(review): Predicted_Weekly_Sales is in whatever unit Weekly_Sales uses,
# while stock is described in units — confirm the two are comparable.
stock_analysis_df = current_stock_df.withColumn(
    "Remaining_Stock", col("Current_Stock") - col("Predicted_Weekly_Sales")
)

# Flag rows whose remaining stock falls below the reorder threshold
stock_analysis_df = stock_analysis_df.withColumn(
    "Reorder_Flag",
    when(col("Remaining_Stock") < REORDER_THRESHOLD_UNITS, "Yes").otherwise("No"),
)

# ---------------------------
# Step 3: Select Final Stock Analysis Columns
# ---------------------------
# This cell only shapes and previews the result; nothing is written here.
final_stock_analysis = stock_analysis_df.select(
    "Store", "Dept", "Date", "IsHoliday", "Predicted_Weekly_Sales", "Current_Stock",
    "Remaining_Stock", "Reorder_Flag"
)

# Show stock analysis
final_stock_analysis.show(10)




In [0]:
# Root folder for the Gold layer outputs. write_clean_delta appends the folder
# name directly to this string, so the trailing slash is required.
base_path = "/mnt/walmartgold/"

# Function to overwrite Delta files cleanly
def write_clean_delta(df, folder_name):
    """Replace whatever is stored under ``base_path + folder_name`` with ``df``.

    Every entry directly under the target directory is removed recursively
    before ``df`` is written there in Delta format with overwrite mode. Because
    the whole directory content is deleted first, anything previously stored
    there is discarded rather than versioned.

    Args:
        df: DataFrame to persist.
        folder_name: Name of the folder, appended to the module-level
            ``base_path``, that receives the data.

    Raises:
        Exception: Any listing error other than the target directory not
            existing is re-raised unchanged.
    """
    path = f"{base_path}{folder_name}"
    try:
        existing_entries = dbutils.fs.ls(path)
    except Exception as exc:
        # Listing a directory that does not exist yet (first run) fails with a
        # wrapped Java FileNotFoundException. That simply means there is
        # nothing to clean up; every other failure is a real error.
        if "FileNotFoundException" not in str(exc):
            raise
        existing_entries = []
    # Remove the contents of the directory to ensure a clean overwrite
    for entry in existing_entries:
        dbutils.fs.rm(entry.path, True)
    # Write the DataFrame in Delta format
    df.write.format("delta").mode("overwrite").save(path)

# Persist each result under its own Gold layer folder
for gold_frame, gold_folder in (
    (final_predictions, "Predicted Sales"),
    (final_stock_analysis, "Stock Analysis"),
):
    write_clean_delta(gold_frame, gold_folder)

In [0]:
# Store the stock analysis as the Delta table `final_stock_analysis`,
# overwriting any data the table already holds
stock_table_writer = final_stock_analysis.write.format("delta").mode("overwrite")
stock_table_writer.saveAsTable("final_stock_analysis")
