# 잠재요인기반 추천 시스템 ALS 모델 만들기

1. 필요한 라이브러리 불러오기 
2. 데이터 준비하기 
    - 데이터 파일 불러오기
    - 데이터 전처리
3. 데이터 분할
4. ALS 모델 생성 및 설정
5. 모델 학습
6. 추천 및 평가
7. 영화 추천

In [1]:
from pyspark.sql import SparkSession

MAX_MEMORY = "5g"
spark= SparkSession.builder.appName("20241213_01_MLlib_ALS")\
                    .config("spark.executor.momory", "MAX_MEMORY")\
                    .config("spark.driver.momory", "MAX_MEMORY")\
                    .getOrCreate()
spark

24/12/13 15:33:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/13 15:33:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
rating_df = spark.read.format("csv")\
    .option("header",'true')\
    .option('inferSchema', 'true')\
    .load("data/ratings.csv")

movie_df = spark.read.format("csv")\
    .option("header",'true')\
    .option('inferSchema', 'true')\
    .load("data/movies.csv")

                                                                                

In [3]:
rating_df.printSchema()
movie_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
rating_df.show(5)
movie_df.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
+------+-------+------+----------+
only showing top 5 rows

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



# 전처리 

- timestamp 제거

In [5]:
rating_df = rating_df.select(["userId", "movieId", "rating"])
rating_df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    296|   5.0|
|     1|    306|   3.5|
|     1|    307|   5.0|
|     1|    665|   5.0|
|     1|    899|   3.5|
|     1|   1088|   4.0|
|     1|   1175|   3.5|
|     1|   1217|   3.5|
|     1|   1237|   5.0|
|     1|   1250|   4.0|
|     1|   1260|   3.5|
|     1|   1653|   4.0|
|     1|   2011|   2.5|
|     1|   2012|   2.5|
|     1|   2068|   2.5|
|     1|   2161|   3.5|
|     1|   2351|   4.5|
|     1|   2573|   4.0|
|     1|   2632|   5.0|
|     1|   2692|   5.0|
+------+-------+------+
only showing top 20 rows



# 데이터 세트 분할

In [6]:
train_ratio = 0.8
test_ratio = 0.2

train_df, test_df = rating_df.randomSplit([train_ratio, test_ratio],seed=42)

# ALS 모델 객체 생성

In [7]:
from pyspark.ml.recommendation import ALS

# ALS 모델 생성
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy='drop' # 결측값 방지
)

In [8]:
# 학습
als_model = als.fit(train_df)
als_model

24/12/13 15:34:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/13 15:34:46 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/13 15:34:48 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/13 15:34:48 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

ALSModel: uid=ALS_9d594524b2b4, rank=10

# 예측 확인

In [9]:
#userid와 movieid 를 가지고 rating 예측
predictions = als_model.transform(test_df)
predictions.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 2.0360024|
|151614|    148|   1.0| 2.7519841|
| 28229|    148|   1.0|  2.471971|
|  6491|    148|   4.0| 2.5011795|
| 14831|    148|   3.0|  2.757146|
|145182|    148|   3.0| 2.5851796|
| 69123|    148|   4.5| 2.4459808|
|104825|    148|   4.0| 3.0652335|
| 41703|    148|   2.0| 2.9181533|
| 29213|    148|   5.0| 2.5555177|
| 75209|    148|   2.0| 2.6499124|
|115912|    148|   3.0| 2.5922348|
|118261|    148|   3.0| 3.0580463|
|138552|    148|   4.0| 3.4104571|
| 70733|    148|   1.0| 2.9010627|
|115095|    148|   4.0| 3.4735572|
|  7223|    148|   3.0| 2.5110245|
| 65981|    148|   3.5| 2.9380302|
| 74794|    148|   3.0| 2.5544858|
| 33145|    148|   5.0| 2.9508266|
+------+-------+------+----------+
only showing top 20 rows



In [10]:
predictions.select("rating","prediction").describe().show()

                                                                                

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998109|           4998109|
|   mean|3.5341648211353536| 3.421954467241195|
| stddev|1.0609230261741123|0.6440288205455679|
|    min|               0.5|        -0.9586964|
|    max|               5.0|          6.440156|
+-------+------------------+------------------+



# ALS 모델의 하이퍼파라미터 조정

- coldStartStrategy = "drop" > 예측에 충분한지 않은 컬럼의 데이터의 경우 "drop"

학습한 모델의 속성을 변경 > 다시 예측을 한 경우에도 조정이 안됨 > 학습부터 다시 실행

In [11]:
als.setColdStartStrategy("drop")

ALS_9d594524b2b4

In [12]:
#userid와 movieid 를 가지고 rating 예측
predictions = als_model.transform(test_df)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 2.0360024|
|151614|    148|   1.0| 2.7519841|
| 28229|    148|   1.0|  2.471971|
|  6491|    148|   4.0| 2.5011795|
| 14831|    148|   3.0|  2.757146|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

In [13]:
predictions.select("rating", "prediction").describe().show()



+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998109|           4998109|
|   mean|3.5341648211353536| 3.421954467241195|
| stddev|1.0609230261741123|0.6440288205455679|
|    min|               0.5|        -0.9586964|
|    max|               5.0|          6.440156|
+-------+------------------+------------------+





# 평가

- RMSE 측정

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [16]:
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

0.8111499858106095

# 활용 - 추천 생성

1. 사용자id 를 넣으면 추천목록 추천 
2. 영화id 를 넣으면 사용자목록 추천

In [41]:
recommendForAllUsers = als_model.recommendForAllUsers(3) # item별로 추천할 영화 3명 추출
recommendForAllUsers.show()

KeyboardInterrupt



In [None]:
recommendForAllItems = als_model.recommendForAllItems(3) # item별로 추천할 유저 3명 추출
recommendForAllItems.show()

In [24]:
from pyspark.sql.types import IntegerType

user_list = [65,78,81] # 추천을 생성할 사용자 목록 정의 (사용자 ID 리스트)
user_df = spark.createDataFrame(user_list, IntegerType()).toDF("userId") # 사용자 ID 리스트를 데이터프레임으로 변환
user_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [43]:
# ALS 모델을 이용해 지정된 사용자(user_list)에게 4개의 영화 추천 생성
user_recommend_movies = als_model.recommendForUserSubset(user_df,4)
user_recommend_movies.show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{194434, 6.72186...|
|    78|[{194434, 7.17550...|
|    81|[{174805, 4.71085...|
+------+--------------------+



In [42]:
# movie_list load
movie_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [44]:
# 65번 사용자의 추천 영화 목록 추출
# `recommendations` 컬럼에 추천 영화 ID와 예상 평점이 리스트 형태로 저장됨
movie_list = user_recommend_movies.collect()[0].recommendations
movie_list

                                                                                

[Row(movieId=194434, rating=6.721865177154541),
 Row(movieId=192261, rating=6.6192827224731445),
 Row(movieId=202231, rating=6.507192134857178),
 Row(movieId=205277, rating=6.279325008392334)]

In [45]:
# 추천 영화 목록을 DataFrame으로 변환 (movieId와 rating 정보 포함)
rec_df = spark.createDataFrame(movie_list)
rec_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 194434| 6.721865177154541|
| 192261|6.6192827224731445|
| 202231| 6.507192134857178|
| 205277| 6.279325008392334|
+-------+------------------+



In [46]:
# 영화정보와 추천 영화 목록을 조인 # 추천 영화 목록(rec_df)을 임시 뷰로 등록
rec_df.createOrReplaceTempView('recommend')
movie_df.createOrReplaceTempView('movies')

In [36]:
# SQL 쿼리를 작성하여 영화 정보와 추천 영화 목록을 조인
query = '''
SELECT *
FROM movies JOIN recommend ON movies.movieId = recommend.movieId
ORDER BY rating DESC
'''

In [37]:
recommend_movies = spark.sql(query) # SQL 쿼리를 실행하여 최종 추천 영화 목록 생성
recommend_movies.show() # 추천 영화 목록을 출력

+-------+--------------------+------------------+-------+------------------+
|movieId|               title|            genres|movieId|            rating|
+-------+--------------------+------------------+-------+------------------+
| 194434|   Adrenaline (1990)|(no genres listed)| 194434| 6.721865177154541|
| 192261|Don't Laugh at My...|      Comedy|Drama| 192261|6.6192827224731445|
| 202231|       Foster (2018)|       Documentary| 202231| 6.507192134857178|
+-------+--------------------+------------------+-------+------------------+



In [47]:
spark.stop()