In [None]:
# Minio container IP (from `docker inspect`): 172.18.0.4
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import os

In [None]:
# Connection settings for the Nessie catalog and the Minio object store.
# Each value can be overridden with an environment variable so the notebook
# keeps working when container addresses change; the defaults are the values
# previously hard-coded here.
CATALOG_URI = os.environ.get("NESSIE_CATALOG_URI", "http://nessie:19120/api/v1")  # Nessie Server URI
WAREHOUSE = os.environ.get("ICEBERG_WAREHOUSE", "s3://warehouse/")               # Minio Address to Write to
# The default is a container IP taken from `docker inspect`; it is not stable
# across container re-creation, so set MINIO_ENDPOINT when it changes.
STORAGE_URI = os.environ.get("MINIO_ENDPOINT", "http://172.18.0.4:9000")

In [None]:
# Jars passed to Spark via `spark.jars` (comma-separated absolute paths).
SPARK_JARS_DIR = "/opt/spark/workjars"
SPARK_JARS = ",".join(
    f"{SPARK_JARS_DIR}/{jar_name}"
    for jar_name in (
        "iceberg-spark-runtime-3.5_2.12-1.5.0.jar",
        "nessie-spark-extensions-3.5_2.12-0.77.1.jar",
        "bundle-2.24.8.jar",
        "url-connection-client-2.24.8.jar",
    )
)

# SQL extensions enabling the Iceberg and Nessie statements in Spark SQL.
SQL_EXTENSIONS = ",".join(
    (
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
    )
)

# Spark configuration for the 'billing' app: registers an Iceberg catalog named
# `nessie` (backed by the Nessie server at CATALOG_URI, branch `main`, no auth)
# whose data files are read/written through S3FileIO against STORAGE_URI.
conf = (
    pyspark.SparkConf()
        .setAppName('billing')
        # Avoid truncated plan/schema strings for very wide tables.
        .set("spark.sql.debug.maxToStringFields", "100000")
        .set('spark.jars', SPARK_JARS)
        .set('spark.sql.extensions', SQL_EXTENSIONS)
        # --- `nessie` catalog definition ---
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
        # --- resources ---
        .set("spark.executor.memory", "2g")
        .set("spark.driver.memory", "2g")
        .set("spark.executor.memoryOverhead", "512m")
        .set("spark.sql.shuffle.partitions", "64")
        # `spark.shuffle.spill` and `spark.shuffle.memoryFraction` used to be set
        # here; both are dropped because they have no effect on Spark 3.x
        # (spilling is always on, and the legacy memory manager that read
        # memoryFraction no longer exists).
)


In [None]:
# Build (or reuse) the session: local master with one worker thread per
# available core, configured from the `conf` object assembled earlier.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.config(conf=conf)
spark = session_builder.getOrCreate()
print("Spark Session Started")

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import functions as F

# Load the raw ageing table from the `nessie` Iceberg catalog.
df = spark.read.table("nessie.ageing.ageing_data_raw")

# Index the "AccountContract" column to convert it to a numeric feature.
# NOTE(review): by default StringIndexer numbers labels by descending frequency,
# so distances between indices carry no business meaning — confirm that
# clustering on this single index is really what is wanted.
indexer = StringIndexer(inputCol="AccountContract", outputCol="AccountContractIndex")
indexed_df = indexer.fit(df).transform(df)

# Spark ML estimators require the features in a single vector column; a plain
# numeric column is rejected at fit time. Assemble the selected columns into
# "features" (add more names to FEATURE_COLS to cluster on more columns).
FEATURE_COLS = ["AccountContractIndex"]
assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
features = assembler.transform(indexed_df).select(*FEATURE_COLS, "features")

# Perform KMeans clustering (fixed seed for reproducible centroids).
kmeans = KMeans(k=3, seed=1, featuresCol="features", predictionCol="prediction")
model = kmeans.fit(features)

# Make predictions
predictions = model.transform(features)

# Evaluate clustering by computing the Silhouette score; the evaluator is
# pointed explicitly at the same feature/prediction columns used by the model.
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")

silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared Euclidean distance = {silhouette}")

# Show the cluster centers
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
