In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
from pyspark.mllib.clustering import KMeans
from numpy import array, random
from math import sqrt
from pyspark import SparkConf, SparkContext

In [None]:


# Seed NumPy's legacy global random generator so that any code drawing from
# np.random.* in this notebook is repeatable from run to run.
RANDOM_SEED = 2023
np.random.seed(RANDOM_SEED)

# Generate cluster data
def generate_cluster_data(num_samples, num_features, num_clusters, cluster_std,
                          random_state=0):
    """Draw a synthetic clustering dataset with ``sklearn.datasets.make_blobs``.

    Parameters
    ----------
    num_samples : int
        Forwarded to ``make_blobs`` as ``n_samples``.
    num_features : int
        Forwarded to ``make_blobs`` as ``n_features``.
    num_clusters : int
        Forwarded to ``make_blobs`` as ``centers``.
    cluster_std : float
        Forwarded to ``make_blobs`` as ``cluster_std``.
    random_state : optional
        Forwarded to ``make_blobs`` as ``random_state``. Defaults to 0, the
        value that used to be hard-coded, so existing four-argument calls
        produce exactly the same data as before.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as returned by ``make_blobs``: the generated
        samples and the label of the blob each sample was drawn from.
    """
    samples, labels = make_blobs(
        n_samples=num_samples,
        n_features=num_features,
        centers=num_clusters,
        cluster_std=cluster_std,
        random_state=random_state,
    )
    return samples, labels

# Settings that describe the synthetic dataset
num_samples = 5000   # how many points are drawn in total
num_features = 4     # dimensionality of every point
num_clusters = 4     # how many blobs the points are drawn from
cluster_std = 1.0    # spread (standard deviation) of each blob

# Draw the points (X) together with the blob label of each point (y)
X, y = generate_cluster_data(
    num_samples=num_samples,
    num_features=num_features,
    num_clusters=num_clusters,
    cluster_std=cluster_std,
)

# Scatter the first two feature columns, coloured by the generating blob.
# Only columns 0 and 1 are drawn; the remaining features are not shown.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Clustered Data')
plt.show()

In [None]:

# Number of clusters to fit.
# NOTE(review): the data above was generated with num_clusters = 4 -- confirm
# that fitting 5 clusters is intentional.
K = 5

# Set up Spark configuration and context.
# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setMaster("local").setAppName("SparkKMeans")
sc = SparkContext.getOrCreate(conf=conf)

# Load the data: iterating the 2-D array X yields one RDD element per row
data = sc.parallelize(X)

# Build the model (cluster the data).
# Random initialization is seeded explicitly so repeated runs give the same
# model; seeding NumPy does not influence Spark's own random generator.
clusters = KMeans.train(data, K, maxIterations=10,
                        initializationMode="random", seed=2023)

# Predict the cluster index of every point. The RDD is cached because it is
# consumed twice below (countByValue and collect).
resultRDD = data.map(lambda point: clusters.predict(point)).cache()

# Number of points assigned to each cluster index, shown in index order
print("Counts by value:")
counts = resultRDD.countByValue()
print(dict(sorted(counts.items())))

# The full assignment list is kept in `results`; only a short preview is
# printed so the cell output stays readable.
print("Cluster assignments:")
results = resultRDD.collect()
print(results[:20])

# Calculate Within Set Sum of Squared Error (WSSSE)
def error(point):
    """Return the squared Euclidean distance from ``point`` to its assigned center.

    The center is looked up in the module-level ``clusters`` model via
    ``clusters.predict(point)``. The value is deliberately left squared (no
    square root): summing it over all points gives the within-set sum of
    *squared* errors, whereas summing plain distances would not.

    Parameters
    ----------
    point : array-like
        A single data point; ``point - center`` must be a valid element-wise
        subtraction.

    Returns
    -------
    float
        Sum of the squared per-coordinate differences between the point and
        its center.
    """
    center = clusters.centers[clusters.predict(point)]
    offsets = point - center
    return float(sum(d * d for d in offsets))

# Evaluate error() on every point, then add the per-point values together
per_point_errors = data.map(error)
WSSSE = per_point_errors.reduce(lambda total, value: total + value)
print("Within Set Sum of Squared Error = " + str(WSSSE))
