# MLlib - Clustering - KMeans 
- 데이터 출처 : 

In [3]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName("20241213_01_MLlib").getOrCreate()
spark

In [54]:
# 데이터 생성
data = [
    (0, 0, 4.0),  # user 0 rated item 0 with 4.0
    (0, 1, 2.0),
    (1, 1, 3.0),
    (1, 2, 1.0),
    (2, 0, 5.0),
    (2, 2, 4.0),
    (3, 0, 4.0), 
    (3, 3, 2.0),
    (4, 1, 3.0),
    (4, 2, 1.0),
    (5, 2, 5.0),
    (5, 3, 4.0)
]

columns = ["user_id", "item_id","rating"]

In [55]:
rating_df = spark.createDataFrame(data,columns)
rating_df.show()

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|      0|      0|   4.0|
|      0|      1|   2.0|
|      1|      1|   3.0|
|      1|      2|   1.0|
|      2|      0|   5.0|
|      2|      2|   4.0|
|      3|      0|   4.0|
|      3|      3|   2.0|
|      4|      1|   3.0|
|      4|      2|   1.0|
|      5|      2|   5.0|
|      5|      3|   4.0|
+-------+-------+------+



In [56]:
# user, item -rating 정보들 >> 사용자 그룹을 만든다 > 모델 > 예측 결과 - 그룹


In [57]:
# 피처선택, 전처리 > 모델 학습 > 예측 

In [58]:
user_item_matrix = rating_df.groupby("user_id").pivot("item_id").avg("rating").fillna(0)

In [59]:
user_item_matrix.show()



+-------+---+---+---+---+
|user_id|  0|  1|  2|  3|
+-------+---+---+---+---+
|      0|4.0|2.0|0.0|0.0|
|      5|0.0|0.0|5.0|4.0|
|      1|0.0|3.0|1.0|0.0|
|      3|4.0|0.0|0.0|2.0|
|      2|5.0|0.0|4.0|0.0|
|      4|0.0|3.0|1.0|0.0|
+-------+---+---+---+---+



# 피처 백처

In [60]:
from pyspark.ml.feature import VectorAssembler

In [61]:
assembler = VectorAssembler(inputCols=["0","1","2"], outputCol="features")

In [62]:
user_features = assembler.transform(user_item_matrix)
user_features.show()



+-------+---+---+---+---+-------------+
|user_id|  0|  1|  2|  3|     features|
+-------+---+---+---+---+-------------+
|      0|4.0|2.0|0.0|0.0|[4.0,2.0,0.0]|
|      5|0.0|0.0|5.0|4.0|[0.0,0.0,5.0]|
|      1|0.0|3.0|1.0|0.0|[0.0,3.0,1.0]|
|      3|4.0|0.0|0.0|2.0|[4.0,0.0,0.0]|
|      2|5.0|0.0|4.0|0.0|[5.0,0.0,4.0]|
|      4|0.0|3.0|1.0|0.0|[0.0,3.0,1.0]|
+-------+---+---+---+---+-------------+



# 모델 생성 > 학습

In [63]:
from pyspark.ml.clustering import KMeans

# 모델 생성
kmeans = KMeans(k=2, seed=1, featuresCol="features", predictionCol="cluster")
kmeans

KMeans_585f82c8dd70

In [64]:
#모델 학습 
model = kmeans.fit(user_features)
model

                                                                                

KMeansModel: uid=KMeans_585f82c8dd70, k=2, distanceMeasure=euclidean, numFeatures=3

In [65]:
# 모델을 이용한 예측
clusters = model.transform(user_features)
clusters.orderBy("user_id").show()



+-------+---+---+---+---+-------------+-------+
|user_id|  0|  1|  2|  3|     features|cluster|
+-------+---+---+---+---+-------------+-------+
|      0|4.0|2.0|0.0|0.0|[4.0,2.0,0.0]|      0|
|      1|0.0|3.0|1.0|0.0|[0.0,3.0,1.0]|      1|
|      2|5.0|0.0|4.0|0.0|[5.0,0.0,4.0]|      0|
|      3|4.0|0.0|0.0|2.0|[4.0,0.0,0.0]|      0|
|      4|0.0|3.0|1.0|0.0|[0.0,3.0,1.0]|      1|
|      5|0.0|0.0|5.0|4.0|[0.0,0.0,5.0]|      1|
+-------+---+---+---+---+-------------+-------+



                                                                                

In [66]:
## 사용자 그룹화 : 유사한 취향의 사용자끼리 그룹으로 묶어두는 것
## 아이템 그룹화 : 아이템 간의 군집화를 통해 사용자에게 추천해 줄 수 있음

In [67]:
spark.stop()