In [1]:
# !pip install pyspark dash dash-core-components dash-html-components plotly pandas numpy

In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Predictive_Modeling") \
    .master("local[*]") \
    .getOrCreate()

print("✅ Spark Session Initialized!")


✅ Spark Session Initialized!


In [4]:
# test_df = spark.createDataFrame([(1, "test")], ["id", "value"])
# test_df.write.mode("overwrite").parquet("file:///C:/tmp/test_parquet")


In [5]:
# import sys
# print(sys.version)
# import pyspark
# print(pyspark.__version__)


In [6]:
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Define your file path (Ensure this is the correct path)
LOCAL_FILE_PATH = "processed_data_bucket/processed_customer_purchase_behavior.csv"

# Define the schema of your processed data
schema = StructType([
    StructField("Customer ID", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Gender", StringType(), True),
    StructField("Item Purchased", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Purchase Amount (USD)", DoubleType(), True),
    StructField("Location", StringType(), True),
    StructField("Size", StringType(), True),
    StructField("Color", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("Review Rating", DoubleType(), True),
    StructField("Subscription Status", IntegerType(), True),  
    StructField("Payment Method", StringType(), True),
    StructField("Shipping Type", StringType(), True),
    StructField("Discount Applied", IntegerType(), True),
    StructField("Promo Code Used", IntegerType(), True), 
    StructField("Previous Purchases", IntegerType(), True),  
    StructField("Preferred Payment Method", StringType(), True),
    StructField("Frequency of Purchases", StringType(), True),
    StructField("High Value Customer", StringType(), True)
])

# Read the processed data from local CSV file
processed_df = spark.read.option("header", "true").schema(schema).csv(LOCAL_FILE_PATH)

# Display schema and first few rows
print("✅ Processed Data Loaded Successfully!")
processed_df.printSchema()

✅ Processed Data Loaded Successfully!
root
 |-- Customer ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Item Purchased: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Purchase Amount (USD): double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Review Rating: double (nullable = true)
 |-- Subscription Status: integer (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Shipping Type: string (nullable = true)
 |-- Discount Applied: integer (nullable = true)
 |-- Promo Code Used: integer (nullable = true)
 |-- Previous Purchases: integer (nullable = true)
 |-- Preferred Payment Method: string (nullable = true)
 |-- Frequency of Purchases: string (nullable = true)
 |-- High Value Customer: string (nullable = true)



In [7]:
import pandas as pd
print(pd.__version__)


2.2.3


In [8]:
processed_df.limit(10).toPandas()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases,High Value Customer
0,1,55,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,1,Credit Card,Express,1,1,14,Venmo,Fortnightly,No
1,2,19,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,1,Bank Transfer,Express,1,1,2,Cash,Fortnightly,No
2,3,50,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,1,Cash,Free Shipping,1,1,23,Credit Card,Weekly,No
3,4,21,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,1,PayPal,Next Day Air,1,1,49,PayPal,Weekly,Yes
4,5,45,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,1,Cash,Free Shipping,1,1,31,PayPal,Annually,No
5,6,46,Male,Sneakers,Footwear,20.0,Wyoming,M,White,Summer,2.9,1,Venmo,Standard,1,1,14,Venmo,Weekly,No
6,7,63,Male,Shirt,Clothing,85.0,Montana,M,Gray,Fall,3.2,1,Debit Card,Free Shipping,1,1,49,Cash,Quarterly,Yes
7,8,27,Male,Shorts,Clothing,34.0,Louisiana,L,Charcoal,Winter,3.2,1,Debit Card,Free Shipping,1,1,19,Credit Card,Weekly,No
8,9,26,Male,Coat,Outerwear,97.0,West Virginia,L,Silver,Summer,2.6,1,Venmo,Express,1,1,8,Venmo,Annually,Yes
9,10,57,Male,Handbag,Accessories,31.0,Missouri,M,Pink,Spring,4.8,1,PayPal,2-Day Shipping,1,1,4,Cash,Quarterly,No


In [9]:
from pyspark.sql.functions import col, sum

# Count missing values in each column
missing_values = processed_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in processed_df.columns])
print("✅ Missing Values per Column:")
missing_values.limit(1).toPandas()


✅ Missing Values per Column:


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases,High Value Customer
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
from pyspark.sql.functions import countDistinct

# Count unique values for each column
unique_counts = processed_df.select([countDistinct(col(c)).alias(c) for c in processed_df.columns])

print("✅ Unique Value Counts per Column:")
unique_counts.limit(1).toPandas()


✅ Unique Value Counts per Column:


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases,High Value Customer
0,3900,53,2,25,4,81,50,4,25,4,26,2,6,6,2,2,50,6,7,2


In [11]:
categorical_features = ["Subscription Status", "Discount Applied", "Promo Code Used", "High Value Customer"]

for col_name in categorical_features:
    print(f"✅ Distribution for {col_name}:")
    processed_df.groupBy(col_name).count().show()


✅ Distribution for Subscription Status:
+-------------------+-----+
|Subscription Status|count|
+-------------------+-----+
|                  1| 1053|
|                  0| 2847|
+-------------------+-----+

✅ Distribution for Discount Applied:
+----------------+-----+
|Discount Applied|count|
+----------------+-----+
|               1| 1677|
|               0| 2223|
+----------------+-----+

✅ Distribution for Promo Code Used:
+---------------+-----+
|Promo Code Used|count|
+---------------+-----+
|              1| 1677|
|              0| 2223|
+---------------+-----+

✅ Distribution for High Value Customer:
+-------------------+-----+
|High Value Customer|count|
+-------------------+-----+
|                 No| 2974|
|                Yes|  926|
+-------------------+-----+



In [12]:
from pyspark.sql.functions import min, max, avg, percentile_approx

purchase_stats = processed_df.select(
    min("Purchase Amount (USD)").alias("Min_Purchase"),
    max("Purchase Amount (USD)").alias("Max_Purchase"),
    avg("Purchase Amount (USD)").alias("Mean_Purchase"),
    percentile_approx("Purchase Amount (USD)", 0.5).alias("Median_Purchase")
)

print("✅ Purchase Amount Statistics:")
purchase_stats.show()


✅ Purchase Amount Statistics:
+------------+------------+-----------------+---------------+
|Min_Purchase|Max_Purchase|    Mean_Purchase|Median_Purchase|
+------------+------------+-----------------+---------------+
|        20.0|       100.0|59.76435897435898|           60.0|
+------------+------------+-----------------+---------------+



In [13]:
quantiles = processed_df.approxQuantile("Purchase Amount (USD)", [0.25, 0.75], 0.05)
Q1, Q3 = quantiles[0], quantiles[1]
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = processed_df.filter(
    (col("Purchase Amount (USD)") < lower_bound) | (col("Purchase Amount (USD)") > upper_bound)
).count()

print(f"✅ Number of Outliers in 'Purchase Amount (USD)': {outliers}")


✅ Number of Outliers in 'Purchase Amount (USD)': 0


In [14]:
from pyspark.sql.functions import count

# Categorical columns to analyze
categorical_cols = ["Gender", "Item Purchased", "Category", "Location", "Size", "Color", 
                    "Season", "Payment Method", "Shipping Type", "Preferred Payment Method", "Frequency of Purchases"]

for col_name in categorical_cols:
    print(f"✅ Distribution for {col_name}:")
    processed_df.groupBy(col_name).agg(count("*").alias("count")).show(truncate=False)


✅ Distribution for Gender:
+------+-----+
|Gender|count|
+------+-----+
|Female|1248 |
|Male  |2652 |
+------+-----+

✅ Distribution for Item Purchased:
+--------------+-----+
|Item Purchased|count|
+--------------+-----+
|T-shirt       |147  |
|Jacket        |163  |
|Sneakers      |145  |
|Belt          |161  |
|Dress         |166  |
|Sweater       |164  |
|Hat           |154  |
|Coat          |161  |
|Sunglasses    |161  |
|Pants         |171  |
|Hoodie        |151  |
|Handbag       |153  |
|Gloves        |140  |
|Backpack      |143  |
|Shirt         |169  |
|Shoes         |150  |
|Blouse        |171  |
|Jewelry       |171  |
|Boots         |144  |
|Shorts        |157  |
+--------------+-----+
only showing top 20 rows

✅ Distribution for Category:
+-----------+-----+
|Category   |count|
+-----------+-----+
|Outerwear  |324  |
|Clothing   |1737 |
|Footwear   |599  |
|Accessories|1240 |
+-----------+-----+

✅ Distribution for Location:
+-------------+-----+
|Location     |count|
+-----

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

categorical_features = ["Item Purchased", "Category", "Location", "Size", "Color", 
                        "Season", "Payment Method", "Shipping Type", 
                        "Preferred Payment Method", "Frequency of Purchases"]

indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in categorical_features]
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_encoded") for col in categorical_features]

pipeline = Pipeline(stages=indexers + encoders)
processed_df = pipeline.fit(processed_df).transform(processed_df)

# Drop original categorical columns after encoding
processed_df = processed_df.drop(*categorical_features)

print("✅ Categorical Encoding Completed!")
processed_df.printSchema()


✅ Categorical Encoding Completed!
root
 |-- Customer ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Purchase Amount (USD): double (nullable = true)
 |-- Review Rating: double (nullable = true)
 |-- Subscription Status: integer (nullable = true)
 |-- Discount Applied: integer (nullable = true)
 |-- Promo Code Used: integer (nullable = true)
 |-- Previous Purchases: integer (nullable = true)
 |-- High Value Customer: string (nullable = true)
 |-- Item Purchased_index: double (nullable = false)
 |-- Category_index: double (nullable = false)
 |-- Location_index: double (nullable = false)
 |-- Size_index: double (nullable = false)
 |-- Color_index: double (nullable = false)
 |-- Season_index: double (nullable = false)
 |-- Payment Method_index: double (nullable = false)
 |-- Shipping Type_index: double (nullable = false)
 |-- Preferred Payment Method_index: double (nullable = false)
 |-- Frequency of Purchases_index: double (nulla

In [16]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

# Assemble numeric features for scaling
numeric_features = ["Purchase Amount (USD)", "Review Rating"]
assembler = VectorAssembler(inputCols=numeric_features, outputCol="num_features")

# Apply Min-Max Scaling
scaler = MinMaxScaler(inputCol="num_features", outputCol="scaled_features")

pipeline = Pipeline(stages=[assembler, scaler])
processed_df = pipeline.fit(processed_df).transform(processed_df)

# Drop original numeric columns after scaling
processed_df = processed_df.drop(*numeric_features)

print("✅ Numeric Feature Scaling Completed!")
processed_df.printSchema()


✅ Numeric Feature Scaling Completed!
root
 |-- Customer ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Subscription Status: integer (nullable = true)
 |-- Discount Applied: integer (nullable = true)
 |-- Promo Code Used: integer (nullable = true)
 |-- Previous Purchases: integer (nullable = true)
 |-- High Value Customer: string (nullable = true)
 |-- Item Purchased_index: double (nullable = false)
 |-- Category_index: double (nullable = false)
 |-- Location_index: double (nullable = false)
 |-- Size_index: double (nullable = false)
 |-- Color_index: double (nullable = false)
 |-- Season_index: double (nullable = false)
 |-- Payment Method_index: double (nullable = false)
 |-- Shipping Type_index: double (nullable = false)
 |-- Preferred Payment Method_index: double (nullable = false)
 |-- Frequency of Purchases_index: double (nullable = false)
 |-- Item Purchased_encoded: vector (nullable = true)
 |-- Category_encoded: vecto

## Feature Selection

In [17]:
from pyspark.ml.feature import VectorAssembler

# Selecting encoded categorical & scaled numeric features
feature_cols = [
    "Item Purchased_encoded", "Category_encoded", "Location_encoded", "Size_encoded", 
    "Color_encoded", "Season_encoded", "Payment Method_encoded", "Shipping Type_encoded", 
    "Preferred Payment Method_encoded", "Frequency of Purchases_encoded", "scaled_features"
]

# Assemble all features into one vector
feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Transform data
processed_df = feature_assembler.transform(processed_df)

# Drop redundant columns
columns_to_drop = [
    "Customer ID", "Gender", "Item Purchased_index", "Category_index", "Location_index", "Size_index",
    "Color_index", "Season_index", "Payment Method_index", "Shipping Type_index",
    "Preferred Payment Method_index", "Frequency of Purchases_index", 
    "Item Purchased_encoded", "Category_encoded", "Location_encoded", "Size_encoded", "Color_encoded",
    "Season_encoded", "Payment Method_encoded", "Shipping Type_encoded", "Preferred Payment Method_encoded",
    "Frequency of Purchases_encoded", "scaled_features", "num_features"
]

processed_df = processed_df.drop(*columns_to_drop)

print("✅ Features Assembled & Unnecessary Columns Dropped!")
processed_df.printSchema()


✅ Features Assembled & Unnecessary Columns Dropped!
root
 |-- Age: integer (nullable = true)
 |-- Subscription Status: integer (nullable = true)
 |-- Discount Applied: integer (nullable = true)
 |-- Promo Code Used: integer (nullable = true)
 |-- Previous Purchases: integer (nullable = true)
 |-- High Value Customer: string (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml import Pipeline


In [19]:
# ✅ Convert target column to numeric (needed for ML)
indexer = StringIndexer(inputCol="High Value Customer", outputCol="label")
processed_df = indexer.fit(processed_df).transform(processed_df).drop("High Value Customer")

# ✅ Train-Test Split (80-20)
train_data, test_data = processed_df.randomSplit([0.8, 0.2], seed=42)
print(f"✅ Data Split Completed: Train ({train_data.count()}), Test ({test_data.count()})")


✅ Data Split Completed: Train (3177), Test (723)


In [20]:
# Define your classifiers
models = {
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50),
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="label", maxIter=20),
}

# Define evaluation metrics
evaluators = {
    "Accuracy": MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy"),
    "Precision": MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision"),
    "Recall": MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall"),
    "F1 Score": MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
    "AUC": BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC"),
}

In [21]:
# Define a function to train, evaluate, and optionally save predictions.
def evaluate_model(model_name, model, train_data, test_data, save_path=None):
    """
    Trains a model using a pipeline, evaluates performance metrics, and saves predictions if save_path is provided.
    Returns a tuple: (model_name, Accuracy, Precision, Recall, F1 Score, AUC)
    """
    pipeline = Pipeline(stages=[model])
    trained_model = pipeline.fit(train_data)
    predictions = trained_model.transform(test_data)
    
    # Evaluate using each evaluator
    metrics = {metric_name: evaluator.evaluate(predictions) 
               for metric_name, evaluator in evaluators.items()}
    
    print(f"\n🔹 {model_name} Performance Metrics:")
    for metric, value in metrics.items():
        print(f"✅ {metric}: {value:.4f}")
    
    if save_path is not None:
        # Create a filename using the model name (replace spaces with underscores)
        filename = f"{model_name.replace(' ', '_')}_predictions.parquet"
        full_path = f"{save_path}/{filename}"
        predictions.write.mode("overwrite").parquet(full_path)
        print(f"✅ Predictions for {model_name} saved to: {full_path}")
    
    return (model_name, *metrics.values())

# First, evaluate models without saving predictions.
model_results = [evaluate_model(name, model, train_data, test_data) for name, model in models.items()]


🔹 Random Forest Performance Metrics:
✅ Accuracy: 0.9281
✅ Precision: 0.9345
✅ Recall: 0.9281
✅ F1 Score: 0.9240
✅ AUC: 0.9991

🔹 Logistic Regression Performance Metrics:
✅ Accuracy: 0.9723
✅ Precision: 0.9723
✅ Recall: 0.9723
✅ F1 Score: 0.9723
✅ AUC: 0.9974


In [24]:
# Convert the Python list of model results to a Pandas DataFrame for easier comparison
import pandas as pd

results_pd = pd.DataFrame(model_results, columns=["Model", *evaluators.keys()])
print("✅ Model Comparison Results:")
print(results_pd)

# (Optional) Save the comparison results to a CSV file for record-keeping
results_pd.to_csv("model_comparison_results.csv", index=False)
print("✅ Model comparison results saved to model_comparison_results.csv")

# Select the best model based on the highest AUC
best_model_row = results_pd.loc[results_pd["AUC"].idxmax()]
best_model_name = best_model_row["Model"]
print(f"\n🔹 Best model based on AUC is: {best_model_name}")

# Define the path to save predictions (adjust for local or IBM Cloud Watson Studio)
save_predictions_path = "/tmp/predictions"  # For local execution, change if needed

# Re-run evaluation for the best model with predictions saved to the specified path
best_model_result = evaluate_model(best_model_name, models[best_model_name], train_data, test_data, save_path=save_predictions_path)

✅ Model Comparison Results:
                 Model  Accuracy  Precision    Recall  F1 Score       AUC
0        Random Forest  0.928077   0.934460  0.928077  0.924011  0.999069
1  Logistic Regression  0.972337   0.972337  0.972337  0.972337  0.997384
✅ Model comparison results saved to model_comparison_results.csv

🔹 Best model based on AUC is: Random Forest

🔹 Random Forest Performance Metrics:
✅ Accuracy: 0.9281
✅ Precision: 0.9345
✅ Recall: 0.9281
✅ F1 Score: 0.9240
✅ AUC: 0.9991


Py4JJavaError: An error occurred while calling o2676.parquet.
: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:48)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$writeAndCommit$3(FileFormatWriter.scala:275)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:275)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:802)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
