In [None]:
# from pyspark.sql import SparkSession
# from pyspark.ml.clustering import GaussianMixture
# from pyspark.ml.linalg import Vectors
# from pyspark.ml.feature import VectorAssembler
# from sklearn.datasets import load_wine

# spark = SparkSession.builder.appName("GaussianMixtureExample").getOrCreate()

# data = load_wine()
# df = spark.createDataFrame(data.data.tolist(), schema=data.feature_names)

# assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
# dataset = assembler.transform(df)

# gmm = GaussianMixture(k=10, tol=0.01, maxIter=100)
# model = gmm.fit(dataset)

# transformed = model.transform(dataset)
# transformed.show()

# # Evaluate the model (using the silhouette score)
# from pyspark.ml.evaluation import ClusteringEvaluator
# evaluator = ClusteringEvaluator()
# silhouette = evaluator.evaluate(transformed)
# print("Silhouette Score: ", silhouette)

# spark.stop()

In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
import time

# Fit a 3-component Gaussian Mixture Model on three columns of the credit-card
# CSV, report the silhouette score, preview the cluster assignments and print
# the elapsed wall-clock time.

# Initialize Spark Session
spark = SparkSession.builder.appName("GaussianMixtureModelCreditCard").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when reading, fitting or evaluating raises; otherwise a failed
# run would leave the session alive in the kernel.
try:
    # Load data
    # NOTE(review): absolute, machine-specific path — consider moving it to a
    # configuration cell.
    data_path = "/home/bigdata/project-folder/Eksplorasi/CC_GENERAL.csv"  # Replace with the actual path to your dataset
    start_time = time.time()  # Start timing (covers load, fit, evaluation and display)
    raw_data = spark.read.csv(data_path, header=True, inferSchema=True)

    # Select features for clustering; the raw frame is kept under its own name
    # so it is not lost when the assembled frame is bound to `data`.
    assembler = VectorAssembler(inputCols=["BALANCE", "PURCHASES", "PAYMENTS"], outputCol="features")
    data = assembler.transform(raw_data)

    # Create Gaussian Mixture Model (fixed seed so repeated runs are comparable)
    gmm = GaussianMixture(k=3, maxIter=100, tol=0.01, seed=10)
    model = gmm.fit(data)

    # Evaluate clustering by computing Silhouette score
    transformed = model.transform(data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(transformed)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Show results
    transformed.select("features", "prediction").show()

    # Print duration
    print("Duration: %s seconds" % (time.time() - start_time))
finally:
    # Stop Spark Session (runs on both success and failure)
    spark.stop()

Silhouette with squared euclidean distance = 0.7259416875600356
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[40.900749,95.4,2...|         0|
|[3202.467416,0.0,...|         2|
|[2495.148862,773....|         0|
|[1666.670542,1499...|         0|
|[817.714335,16.0,...|         0|
|[1809.828751,1333...|         0|
|[627.260806,7091....|         2|
|[1823.652743,436....|         0|
|[1014.926473,861....|         0|
|[152.225975,1281....|         0|
|[1293.124939,920....|         0|
|[630.794744,1492....|         0|
|[1516.92862,3217....|         0|
|[921.693369,2137....|         0|
|[2772.772734,0.0,...|         0|
|[6886.213231,1611...|         2|
|[2072.074354,0.0,...|         0|
|[41.089489,519.0,...|         0|
|[1989.072228,504....|         0|
|[3577.970933,398....|         0|
+--------------------+----------+
only showing top 20 rows

Duration: 23.69611668586731 seconds


In [14]:
# Stop the Spark session referenced by `spark`.
# NOTE(review): the preceding cell already ends by calling spark.stop(), so this
# cell looks redundant — confirm before removing.
spark.stop()