# In [None]:


from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Step 1: Start (or reuse) the Spark session used by every later step
spark = (
    SparkSession.builder
    .appName("PimaKMeansClustering")
    .getOrCreate()
)

# Step 2: Read the CSV; the first row supplies column names and the
# column types are inferred from the data
file_path = "pima.csv"  # Adjust if needed
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Step 3: Columns to cluster on (plas: glucose, pres: blood pressure)
features = ["plas", "pres"]

# Step 4: Pack the chosen columns into a single vector column "features"
assembler = VectorAssembler(
    inputCols=features,
    outputCol="features",
)
assembled_data = assembler.transform(df)

# Step 5: Scale each feature to unit standard deviation without centering
# (withMean=False), writing the result to "scaledFeatures"
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

# Step 6: Elbow Method
# Fit one K-Means model per candidate k and record its training cost
# (reported here as WSSSE). An explicit seed is passed so the centroid
# initialisation is pinned by this script and the costs can be compared
# from one run to the next.
ELBOW_SEED = 42
wssse_list = []
k_values = list(range(2, 11))

for k in k_values:
    kmeans = KMeans(featuresCol="scaledFeatures", k=k, seed=ELBOW_SEED)
    model = kmeans.fit(scaled_data)
    wssse = model.summary.trainingCost
    wssse_list.append(wssse)
    print(f"k={k}, WSSSE={wssse}")

# Step 7: Plot the elbow curve
# The costs collected above are only useful for choosing k when drawn; the
# point where the curve stops dropping sharply is the usual candidate.
plt.figure(figsize=(8, 6))
plt.plot(k_values, wssse_list, marker="o")
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of clusters (k)")
plt.ylabel("WSSSE")
plt.xticks(k_values)  # one tick per fitted k
plt.grid(True)
plt.show()


# Step 8: Choose optimal k (based on elbow) — you can update this value manually
optimal_k = 3  # Change if needed based on elbow plot

# Step 9: Fit KMeans with optimal k
# An explicit seed keeps the cluster labels and centroids the same on every
# run; without it the initialisation is not fixed by this script.
kmeans = KMeans(featuresCol="scaledFeatures", k=optimal_k, seed=42)
model = kmeans.fit(scaled_data)
clusters = model.transform(scaled_data)

# Step 10: Show predictions
# Build the three-column projection once and reuse it for both the preview
# and the pandas conversion below.
cluster_view = clusters.select("plas", "pres", "prediction")
cluster_view.show(10)

# Step 11: Convert to Pandas for plotting
clusters_pd = cluster_view.toPandas()

# Step 12: Map the cluster centres back to the original feature units.
# The scaler was built with withMean=False, so scaling was a plain division
# by the per-feature standard deviation; multiplying by it undoes that.
std_values = scaler_model.std.toArray()
scaled_centers = np.asarray(model.clusterCenters())
original_centers = np.multiply(scaled_centers, std_values)

# Step 13: Scatter plot of the points coloured by cluster, with the
# centroids drawn on top as red X markers
_, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=clusters_pd,
    x="plas",
    y="pres",
    hue="prediction",
    palette="viridis",
    ax=ax,
)
centroid_x = original_centers[:, 0]
centroid_y = original_centers[:, 1]
ax.scatter(
    centroid_x,
    centroid_y,
    color="red",
    marker="X",
    s=200,
    label="Centroids",
)
ax.set_title(f"K-Means Clustering on Pima Dataset (k={optimal_k})")
ax.set_xlabel("Plasma Glucose (plas)")
ax.set_ylabel("Blood Pressure (pres)")
ax.legend()
plt.show()

# Step 14: Shut down the Spark session now that all results are on the driver
spark.stop()
