In [1]:
from pyspark.sql import SparkSession

# 메모리 에러가 날 수 있어서 사전에 메모리 지정 
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

23/03/26 22:19:52 WARN Utils: Your hostname, Moon-3.local resolves to a loopback address: 127.0.0.1; using 192.168.0.2 instead (on interface en0)
23/03/26 22:19:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/26 22:19:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
ratings_file = "/Users/sig6774/Desktop/Data_Engineering/data-engineering-main/01-spark/data/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema=True, header=True)

                                                                                

In [3]:
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [4]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])
# 불러온 dataframe에서 특정 컬럼만 추출 
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [5]:
ratings_df.select("rating").describe().show()
# rating라는 컬럼의 통계값 확인 



+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423508|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



                                                                                

In [6]:
# split data 
train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [7]:
from pyspark.ml.recommendation import ALS
# 추천 알고리즘 
als = ALS(
        maxIter=5, 
        regParam=0.1,
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        coldStartStrategy="drop"
        # 학습하지 못한 데이터를 만났을 때 어떻게 대처해야하는지에 대한 param
        )
# 해당 알고리즘을 적용하기 위해 필요한 param 적용 

In [8]:
model = als.fit(train_df)
# 메모리 에러가 날 수 있음 
# spark session을 생성 할 때 메모리 지정해줘야함 

                                                                                

23/03/26 22:20:34 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/03/26 22:20:34 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS




23/03/26 22:20:35 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [10]:
pred = model.transform(test_df)
pred.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    31|   6620|   1.5|  2.516371|
|    76|   1959|   5.0| 3.6269543|
|   243|   1580|   3.0| 2.7584443|
|   243|  44022|   3.0| 2.4128227|
|   321|   1580|   3.0|    3.1143|
|   322|   1645|   4.0| 3.5236678|
|   406|   1088|   5.0| 3.5911279|
|   458|   1580|   3.5| 3.2566917|
|   472|   3918|   3.0| 2.4620767|
|   481|   1580|   4.0| 3.6883478|
|   606|   1645|   3.0|  4.275316|
|   606|  68135|   3.5| 3.9147344|
|   626|  44022|   3.0| 3.1494656|
|   772|   1645|   3.0| 2.9172883|
|   830|   1580|   5.0| 3.4807115|
|   844|   2122|   2.0| 2.3254826|
|   847|   1580|   3.0| 2.8064106|
|   847|   6620|   4.0|  3.725472|
|   847| 119432|   3.5| 3.1498907|
|   876|   1580|   3.5|  3.106748|
+------+-------+------+----------+
only showing top 20 rows



In [11]:
pred.select("rating", "prediction").describe().show()
# 통계값 추출 



+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4996136|           4996136|
|   mean|3.5346031413076027| 3.433834776379486|
| stddev|1.0603640353264783|0.6515956239543522|
|    min|               0.5|        -1.7539992|
|    max|               5.0|         6.7610817|
+-------+------------------+------------------+



                                                                                

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol = "rating", predictionCol="prediction")

In [15]:
rmse = evaluator.evaluate(pred)
print(rmse)
# 모델 성능 확인 



0.8104020615866961


                                                                                

In [16]:
model.recommendForAllUsers(3).show()
# 해당 함수를 통해 유저마다 3개의 recommendation을 보여줌 



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{200930, 5.52924...|
|    27|[{126941, 5.77172...|
|    28|[{194434, 6.94018...|
|    31|[{201821, 3.76500...|
|    34|[{112577, 6.77582...|
|    44|[{126941, 7.35727...|
|    53|[{192089, 6.46796...|
|    65|[{112577, 7.38221...|
|    76|[{200930, 5.82385...|
|    78|[{126941, 6.92072...|
|    81|[{200930, 4.48050...|
|    85|[{164809, 6.06455...|
|   101|[{200930, 5.09683...|
|   103|[{194434, 5.83162...|
|   108|[{194434, 5.43484...|
|   115|[{205277, 5.87427...|
|   126|[{194434, 6.12760...|
|   133|[{200930, 5.35752...|
|   137|[{201821, 5.48452...|
|   148|[{194434, 5.68710...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [17]:
model.recommendForAllItems(3).show()
# 아이템마다 3개의 recommendation을 보여줌 



+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     26|[{87426, 5.257085...|
|     27|[{87426, 6.198086...|
|     28|[{33115, 5.719239...|
|     31|[{87426, 6.032798...|
|     34|[{7629, 5.4229746...|
|     44|[{87426, 5.356340...|
|     53|[{104135, 5.42763...|
|     65|[{87426, 5.253678...|
|     76|[{87426, 5.610274...|
|     78|[{87426, 5.204614...|
|     81|[{87426, 5.233549...|
|     85|[{87426, 5.176894...|
|    101|[{104135, 5.02584...|
|    103|[{87426, 5.867542...|
|    108|[{149507, 5.51742...|
|    115|[{33115, 5.979637...|
|    126|[{87426, 5.332704...|
|    133|[{87426, 5.296861...|
|    137|[{149507, 5.51090...|
|    148|[{149507, 4.17457...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [18]:
from pyspark.sql.types import IntegerType

user_list = [65,78,81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF("userID")

users_df.show()
# data frame으로 변환 

+------+
|userID|
+------+
|    65|
|    78|
|    81|
+------+



In [30]:
user_recs = model.recommendForUserSubset(users_df, 5)
# 각 유저마다 5개의 영화를 추천 
print(user_recs.show())
# 추천되는 movie의 id만 출력되는데 해당 id를 이름으로 변경 



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{112577, 7.38221...|
|    78|[{126941, 6.92072...|
|    81|[{200930, 4.48050...|
+------+--------------------+

None


                                                                                

In [29]:
movie_list = user_recs.collect()[0].recommendations

                                                                                

In [31]:
recs_df = spark.createDataFrame(movie_list)
recs_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 112577|7.382209777832031|
| 144202|6.641467094421387|
|  98221|6.372282028198242|
| 120821|6.170456886291504|
| 200930|6.036976337432861|
+-------+-----------------+



In [32]:
movies_file = "/Users/sig6774/Desktop/Data_Engineering/data-engineering-main/01-spark/data/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movies_file}", inferSchema=True, header=True)

In [33]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [34]:
# 추천 결과가 담긴 dataframe과 movei에 대한 정보가 담긴 dataframe을 sql로 사용하기 위해 tempview 생성 

recs_df.createOrReplaceTempView("recom")
movies_df.createOrReplaceTempView("movie")

In [35]:
# join을 통해서 추천 결과와 실제 movie 데이터를 함께 추출 

query = """
select *
from 
    movie join recom
    on movie.movieId = recom.movieId
order by
    rating desc
"""

recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+--------------------+-------+-----------------+
|movieId|               title|              genres|movieId|           rating|
+-------+--------------------+--------------------+-------+-----------------+
| 112577|Willie & Phil (1980)|Comedy|Drama|Romance| 112577|7.382209777832031|
| 144202|Catch That Girl (...|     Action|Children| 144202|6.641467094421387|
|  98221|Year One, The (L'...|              Comedy|  98221|6.372282028198242|
| 120821|The War at Home (...|     Documentary|War| 120821|6.170456886291504|
| 200930|C'est quoi la vie...|               Drama| 200930|6.036976337432861|
+-------+--------------------+--------------------+-------+-----------------+



In [37]:
# api를 사용해서 끝내도록 하나의 function안에 작성 

def get_recommendations(user_id, num_recs):
    # 유저의 아이디와 추천 개수를 입력 파라미터로 받음 
    user_df = spark.createDataFrame([user_id], IntegerType()).toDF("userId")
    user_recs_df = model.recommendForUserSubset(users_df, num_recs)

    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)

    query = """
        select *
        from 
            movie join recom
            on movie.movieId = recom.movieId
        order by
            rating desc
        """
    recommended_movies = spark.sql(query)
    return recommended_movies

In [38]:
recs = get_recommendations(456,10)

                                                                                

In [39]:
import pandas as pd 
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,112577,Willie & Phil (1980),Comedy|Drama|Romance,112577,7.38221
1,144202,Catch That Girl (2002),Action|Children,144202,6.641467
2,98221,"Year One, The (L'an 01) (1973)",Comedy,98221,6.372282
3,120821,The War at Home (1979),Documentary|War,120821,6.170457
4,200930,C'est quoi la vie? (1999),Drama,200930,6.036976


In [40]:
spark.stop()