In [11]:
from pyspark.sql import SparkSession

# One memory budget, applied to both the executor and the driver settings.
MAX_MEMORY = "5g"

# Build (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("movie-recommendation")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [12]:
# Ratings CSV on the local filesystem, addressed through an explicit file:// URI.
# NOTE(review): ratings_file already begins with "/", so the URI built below
# contains four slashes after "file:" — the read evidently succeeds, but confirm
# this is intended.
ratings_file = "/Users/robertmin/PycharmProjects/study/data_engineering/spark_review/data/ratings.csv"

# The first line supplies the column names; column types are inferred from the data.
ratings_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(f"file:///{ratings_file}")
)

                                                                                

In [13]:
# Preview the raw ratings; the captured output stops at the first 20 rows.
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [14]:
# Keep only the three columns used for training; `timestamp` is dropped.
# The names are spelled userID/movieID here (the CSV header shows
# userId/movieId), and the schema printed afterwards carries this spelling.
ratings_df = ratings_df.select('userID', 'movieID', 'rating')

In [15]:
# Confirm the column names and inferred types of the projected frame.
ratings_df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: double (nullable = true)



In [16]:
# Summary statistics (count, mean, stddev, min, max) for each column.
ratings_df.describe().show()



+-------+-----------------+------------------+------------------+
|summary|           userID|           movieID|            rating|
+-------+-----------------+------------------+------------------+
|  count|         25000095|          25000095|          25000095|
|   mean|81189.28115381162|21387.981943268616| 3.533854451353085|
| stddev|46791.71589745776| 39198.86210105973|1.0607439611423535|
|    min|                1|                 1|               0.5|
|    max|           162541|            209171|               5.0|
+-------+-----------------+------------------+------------------+



                                                                                

In [17]:
# Hold out roughly 20% of the ratings for evaluation.
# The seed is fixed so that re-running the notebook on the same input yields
# the same split; without it every run trains and evaluates on different rows
# and the reported metrics cannot be reproduced.
train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=42)

In [18]:
# ALS recommendation algorithm
from pyspark.ml.recommendation import ALS

# Column names match the projected ratings frame (userID / movieID / rating).
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
    # Users/items unseen during training produce NaN predictions; "drop"
    # removes those rows so they do not turn the evaluation metric into NaN.
    coldStartStrategy="drop",
    # Pin the seed explicitly so the fitted model does not depend on the
    # library's default seed and can be reproduced across kernel restarts.
    seed=42,
)

In [19]:
# Out-of-memory warning seen on this step:
#   WARN MemoryStore: Not enough space to cache rdd_53_0 in memory!
model = als.fit(train_df)

                                                                                

In [20]:
# Score the held-out split; the result gains a `prediction` column.
predictions = model.transform(test_df)

In [21]:
# Compare the actual ratings with the predicted ones side by side.
predictions.show()



+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|     1|    296|   5.0| 4.1006746|
|     1|    307|   5.0| 3.9204323|
|     1|    899|   3.5| 3.8612802|
|     1|   1217|   3.5| 4.0038295|
|     1|   2161|   3.5| 3.2212622|
|     1|   2573|   4.0| 3.2377796|
|     1|   3949|   5.0| 3.8406384|
|     1|   4703|   4.0|  3.531045|
|     1|   5147|   4.0| 3.9694772|
|     1|   5767|   5.0|  4.007679|
|     1|   7234|   4.5| 3.9388943|
|     1|   7820|   2.5| 3.9094255|
|     1|   7939|   2.5| 3.8341415|
|     1|   8154|   5.0| 3.8385732|
|     1|   8327|   5.0| 3.7486026|
|     1|   8685|   1.0| 2.9600396|
|    12|     16|   5.0| 3.5984116|
|    12|     22|   3.0| 2.8334186|
|    12|     27|   2.0| 2.7482634|
|    12|     29|   4.0|  3.772168|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the observed `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName="rmse",
)

In [23]:
# Root-mean-squared error of the predictions made on the held-out split.
rmse = evaluator.evaluate(predictions)
print(rmse)

[Stage 176:>                                                        (0 + 4) / 4]

0.813505148188785


                                                                                

In [25]:
# Top-2 recommendations per user; each entry pairs a movie id with a predicted rating.
model.recommendForAllUsers(2).show()

[Stage 205:>                                                        (0 + 1) / 1]

+------+--------------------+
|userID|     recommendations|
+------+--------------------+
|     1|[{194434, 5.57926...|
|     6|[{136892, 6.77530...|
|    12|[{194434, 5.61251...|
|    13|[{136892, 6.59139...|
|    16|[{194434, 6.50861...|
|    22|[{199187, 6.72055...|
|    26|[{203086, 6.02279...|
|    27|[{203086, 6.10824...|
|    28|[{194434, 7.73286...|
|    31|[{197927, 4.02799...|
|    34|[{194434, 5.74480...|
|    44|[{194434, 7.12445...|
|    47|[{138580, 5.35425...|
|    52|[{203086, 6.17203...|
|    53|[{194334, 6.99273...|
|    65|[{205453, 6.73962...|
|    76|[{194434, 6.21034...|
|    78|[{194434, 6.84309...|
|    81|[{179707, 5.16069...|
|    85|[{98693, 5.929706...|
+------+--------------------+
only showing top 20 rows



                                                                                

## 특정 유저에게 추천

In [26]:
from pyspark.sql.types import IntegerType

# Hand-picked users to generate recommendations for.
user_list = [63, 88, 91]

# Build a one-column integer frame from the ids and name that column "userId".
# NOTE(review): the ALS estimator is configured with userCol="userID"; this
# column is spelled "userId" — presumably resolved case-insensitively, confirm.
user_df = (
    spark
    .createDataFrame(user_list, schema=IntegerType())
    .toDF("userId")
)
user_df.show()

[Stage 206:>                                                        (0 + 1) / 1]

+------+
|userId|
+------+
|    63|
|    88|
|    91|
+------+



                                                                                

In [27]:
# Ask the model for 5 recommended movies for each user listed in user_df.
user_resc = model.recommendForUserSubset(user_df, 5)

In [28]:
# Pull the results to the driver and keep only the first row's recommendations.
# NOTE(review): nothing here pins down which of the requested users ends up in
# row 0, and an empty result would raise IndexError — confirm before reuse.
movie_list = user_resc.collect()[0].recommendations

In [29]:
# Turn the collected (movieID, rating) recommendation rows back into a DataFrame.
resc_df = spark.createDataFrame(movie_list)
resc_df.show()

+-------+-----------------+
|movieID|           rating|
+-------+-----------------+
| 203086|6.018644332885742|
| 203882| 5.52311897277832|
| 107252|5.359184741973877|
| 194434|5.153403282165527|
| 190163|5.151778221130371|
+-------+-----------------+



In [30]:
# Movie titles
# NOTE(review): movies_file already begins with "/", so the URI built below
# contains four slashes after "file:" — confirm this is intended.
movies_file = "/Users/robertmin/PycharmProjects/study/data_engineering/spark_review/data/movies.csv"

# The first line supplies the column names; column types are inferred from the data.
movies_df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(f"file:///{movies_file}")
)

                                                                                

In [None]:
# Preview the movie metadata.
movies_df.show()

In [32]:
# Register both frames as temp views so they can be joined with Spark SQL.
# Any existing view with the same name is replaced.
resc_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [34]:
# Attach title/genres to each recommended movie, highest predicted rating first.
# SELECT * keeps the join key from both sides, so the result carries both a
# `movieId` column (from movies) and a `movieID` column (from recommendations).
query = """
SELECT
    *
FROM
    movies JOIN recommendations
    ON movies.movieId = recommendations.movieId
ORDER BY
    rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

                                                                                

+-------+--------------------+------------------+-------+-----------------+
|movieId|               title|            genres|movieID|           rating|
+-------+--------------------+------------------+-------+-----------------+
| 203086|Truth and Justice...|             Drama| 203086|6.018644332885742|
| 203882|Dead in the Water...|            Horror| 203882| 5.52311897277832|
| 107252|Island at War (2004)|         Drama|War| 107252|5.359184741973877|
| 194434|   Adrenaline (1990)|(no genres listed)| 194434|5.153403282165527|
| 190163|Leaning Towards S...|(no genres listed)| 190163|5.151778221130371|
+-------+--------------------+------------------+-------+-----------------+



In [43]:
def get_recommendations(user_id: int, num_recs: int):
    """Return movie recommendations for a single user, joined with movie metadata.

    Relies on notebook-level state: the Spark session ``spark``, the fitted ALS
    ``model``, the imported ``IntegerType`` and a temp view named ``movies``.
    As a side effect the ``recommendations`` temp view is replaced with this
    user's recommendations.

    Args:
        user_id: Integer id of the user to recommend for.
        num_recs: Number of recommendations to request from the model.

    Returns:
        A Spark DataFrame holding every column of ``movies`` followed by the
        recommendation's ``movieID`` and ``rating`` columns, ordered by
        ``rating`` descending. The frame is empty when the model returns
        nothing for ``user_id``.
    """
    user_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_resc_df = model.recommendForUserSubset(user_df, num_recs)

    # The model may return no row at all for this user; indexing [0] blindly
    # would then raise IndexError.
    collected = user_resc_df.collect()
    recs_list = collected[0].recommendations if collected else []

    if recs_list:
        recs_df = spark.createDataFrame(recs_list)
    else:
        # A schema cannot be inferred from an empty list, so spell it out.
        # The types mirror what inference yields for (int, float) rows.
        recs_df = spark.createDataFrame([], "movieID bigint, rating double")

    recs_df.createOrReplaceTempView("recommendations")

    query = """
    SELECT
        *
    FROM
        movies JOIN recommendations
        ON movies.movieId = recommendations.movieId
    ORDER BY
        rating desc
    """

    recommended_movies = spark.sql(query)
    return recommended_movies



In [44]:
# Request 10 recommendations for user 456.
recs = get_recommendations(456, 10)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

In [45]:
# Convert the joined result to a pandas DataFrame for display.
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieID,rating
0,203086,Truth and Justice (2019),Drama,203086,7.047431
1,199187,Hoaxed (2019),(no genres listed),199187,7.008523
2,203882,Dead in the Water (2006),Horror,203882,6.831636
3,194434,Adrenaline (1990),(no genres listed),194434,6.467854
4,107252,Island at War (2004),Drama|War,107252,6.406742
5,144202,Catch That Girl (2002),Action|Children,144202,6.312358
6,205453,The Good Fight: The Abraham Lincoln Brigade in...,Documentary,205453,6.236917
7,151615,Hello Stranger (2010),Drama,151615,6.218982
8,117352,A Kind of America 2 (2008),Comedy,117352,6.212343
9,190163,Leaning Towards Solace (2012),(no genres listed),190163,6.158483
