<a href="https://colab.research.google.com/github/reiple/ds2_2023_spark/blob/main/07.%20Recommendation_skel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!sudo apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import findspark

# Directory produced by unpacking the Spark tarball in the setup cell.
SPARK_HOME = "/content/spark-3.2.4-bin-hadoop3.2"
findspark.init(SPARK_HOME)

In [3]:
from pyspark.sql import SparkSession

# Obtain the session (created with default settings if none exists yet) and keep a
# handle on its SparkContext for the cells below.
builder = SparkSession.builder
spark = builder.getOrCreate()
sc = spark.sparkContext

In [6]:
from google.colab import drive

# Mount Google Drive, then derive the dataset folder from the mount point.
# gpath keeps its trailing slash because later cells append file names directly to it.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)
gpath = MOUNT_POINT + '/MyDrive/dataset/spark/data07/'

Mounted at /content/drive


In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

colNames = ["movieId", "title", "genres"]

# Explicit schema: movieId is the only integer column, the rest are strings;
# every field is declared nullable.
movies_schema = StructType([
    StructField(col, IntegerType() if col == "movieId" else StringType(), True)
    for col in colNames
])

# Read movies.csv from the dataset folder using the schema above (the file has a header row).
movies_path = gpath + "movies.csv"
movies = spark.read.csv(movies_path, header=True, schema=movies_schema)
movies.take(1)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy')]

In [None]:
# Sanity check: print the column names, types and nullability of the loaded movies DataFrame.
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [8]:
# Convert the movies DataFrame to pandas, indexed by movieId, so that later cells
# can look up a title with movies_pd.loc[<movieId>].title.
movies_pd = movies.toPandas().set_index("movieId")

In [9]:
# List every movie whose title contains "Iron Man".
# regex=False: match the text literally instead of interpreting it as a regular expression.
# na=False: a missing title counts as "no match"; without it a null title puts NaN into
# the mask and the boolean indexing raises.
movies_pd[movies_pd['title'].str.contains("Iron Man", regex=False, na=False)]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
59315,Iron Man (2008),Action|Adventure|Sci-Fi
77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX
102007,"Invincible Iron Man, The (2007)",Animation
102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
142056,Iron Man & Hulk: Heroes United (2013),Action|Adventure|Animation
167296,Iron Man (1931),Drama


In [10]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

colNames = ["userId", "movieId", "rating", "timestamp"]

# Explicit schema: rating is a double, every other column is an integer;
# all fields are declared nullable.
ratings_schema = StructType([
    StructField(col, DoubleType() if col == "rating" else IntegerType(), True)
    for col in colNames
])

# Read ratings.csv from the dataset folder using the schema above (the file has a header row).
ratings_path = gpath + "ratings.csv"
ratings = spark.read.csv(ratings_path, header=True, schema=ratings_schema)

ratings.take(1)

[Row(userId=1, movieId=1, rating=4.0, timestamp=964982703)]

In [11]:
# 80/20 train/test split. The fixed seed keeps the split (and therefore the RMSE reported
# further down) stable across re-runs; without it every run samples a different split.
(trainData, testData) = ratings.randomSplit([0.8, 0.2], seed=42)

In [12]:
from pyspark.ml.recommendation import ALS

# Fit an ALS collaborative-filtering model on the training split.
# coldStartStrategy="drop": rows for which no prediction can be made are dropped from
# transform() output instead of carrying NaN predictions into the evaluation.
# seed: fixes the random initialisation of the factors so the fit is reproducible.
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(trainData)

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out split: compare the model's "prediction" column with the true
# "rating" column using root-mean-square error.
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
predictions = model.transform(testData)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.8840754763887899


In [14]:
# Inspect the learned user factors: one row per user id with its feature vector,
# sorted by id; truncate=False prints each vector in full.
model.userFactors.orderBy("id").show(truncate=False)



+---+--------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                        |
+---+--------------------------------------------------------------------------------------------------------------------------------+
|1  |[-0.29205742, 0.34915766, -0.6036778, -0.06583757, 0.40565205, -1.2432356, -0.24978857, -1.1309153, -1.0844849, -0.539725]      |
|2  |[-0.24197358, 0.62042075, -0.22086847, 0.2201505, 0.9026202, -0.941432, -0.27370158, -0.36725584, -0.98535264, -0.6967393]      |
|3  |[-0.66076666, -0.49755394, -1.0001291, -0.15619631, -1.3842918, 0.023976957, -0.56651264, -0.13730843, -0.7969261, -0.37650508] |
|4  |[-0.46495432, -0.014777346, -0.27764076, 0.7400582, -0.3336163, -0.983748, 0.9495335, -1.1785202, -0.9520781, -0.357894]        |
|5  |[-0.3616469, 0.43126366, -0.60394454, 0.19393268, 

In [16]:
# Inspect the learned item (movie) factors: one row per movie id with its feature vector,
# sorted by id; truncate=False prints each vector in full.
model.itemFactors.orderBy("id").show(truncate=False)

+---+-------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                       |
+---+-------------------------------------------------------------------------------------------------------------------------------+
|1  |[-0.21842325, 0.7451113, -0.5239817, -0.33095688, 0.64533126, -0.79889935, 0.3627885, -0.92462224, -1.4766064, -0.31170443]    |
|2  |[-0.37627935, 0.72043234, -0.26333985, -0.5165242, 0.23764294, -0.76305556, -0.18568738, -0.76264614, -1.2970676, -0.40425706] |
|3  |[-0.46214336, 0.3618713, -0.22174233, -0.061978243, 0.53842604, -0.1355299, -0.4121083, -0.9656642, -1.300933, -0.6196537]     |
|4  |[-0.19115472, 0.16835125, 0.0345286, -0.20277996, 0.01844377, 0.11560086, 0.036021303, -0.6700257, -1.2839785, -0.24854153]    |
|5  |[-0.28780246, 0.784582, -0.28475603, -0.4524013, 0.089340

In [17]:
# Who rated the most movies?
# Count ratings per user inside Spark and collect only the per-user totals, instead of
# pulling every rating row to the driver just to count it in Python.
from collections import Counter
user_counts = ratings.groupBy("userId").count().collect()
# c stays a Counter (userId -> number of ratings) because the next cell calls c.most_common().
c = Counter({int(row["userId"]): row["count"] for row in user_counts})
print(c.most_common(5))

[(414, 2698), (599, 2478), (474, 2108), (448, 1864), (274, 1346)]


In [18]:
# Pick the user with the most ratings and list everything they rated, with titles.
targetUserID = c.most_common(1)[0][0]
userWatched = ratings.filter(ratings["userId"] == targetUserID)
userWatchedPD = userWatched.toPandas()
# Look each movieId up in the movieId-indexed pandas movie table to get its title.
userWatchedPD['title'] = [movies_pd.loc[movie_id].title for movie_id in userWatchedPD['movieId']]
userWatchedPD

Unnamed: 0,userId,movieId,rating,timestamp,title
0,414,1,4.0,961438127,Toy Story (1995)
1,414,2,3.0,961594981,Jumanji (1995)
2,414,3,4.0,961439278,Grumpier Old Men (1995)
3,414,5,2.0,961437647,Father of the Bride Part II (1995)
4,414,6,3.0,961515642,Heat (1995)
...,...,...,...,...,...
2693,414,180045,4.0,1515207301,Molly's Game (2017)
2694,414,180497,4.0,1525548614,The Post (2017)
2695,414,180985,3.5,1527978072,The Greatest Showman (2017)
2696,414,184791,2.5,1519592410,Fred Armisen: Standup for Drummers (2018)


In [19]:
# Preview the target user's ratings as a Spark DataFrame (show() prints the first 20 rows).
userWatched.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|   414|      1|   4.0| 961438127|
|   414|      2|   3.0| 961594981|
|   414|      3|   4.0| 961439278|
|   414|      5|   2.0| 961437647|
|   414|      6|   3.0| 961515642|
|   414|      7|   3.0| 961439170|
|   414|      8|   3.0| 961594849|
|   414|     10|   3.0| 961515863|
|   414|     11|   5.0|1052148205|
|   414|     15|   2.0| 961514611|
|   414|     16|   3.0| 961517557|
|   414|     17|   4.0| 961513829|
|   414|     18|   3.0| 961682128|
|   414|     21|   4.0| 961438199|
|   414|     22|   3.0| 961518227|
|   414|     23|   2.0| 961682276|
|   414|     24|   3.0| 961436964|
|   414|     25|   3.0| 961517140|
|   414|     27|   2.0| 961518812|
|   414|     31|   3.0| 961518520|
+------+-------+------+----------+
only showing top 20 rows




# 새로운 사용자에게 영화 추천

In [20]:
# Top-10 recommendations for the target user.
# Select the userId column before distinct(): distinct() on the full rating rows does not
# collapse the user at all, it just hands every one of the user's rating rows to the model.
user = userWatched.select("userId").distinct()
userSubsetRecs = model.recommendForUserSubset(user, 10).toPandas()
# Each recommendation is a (movieId, predicted rating) pair; map the movieId to its title.
userSubsetRecs['title'] = userSubsetRecs['recommendations'].apply(lambda x: [movies_pd.loc[i[0]].title for i in x])
for item in userSubsetRecs.title:
    print(item)



['Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)', 'On the Beach (1959)', 'Saving Face (2004)', 'Cherish (2002)', 'Rain (2001)', 'Belle époque (1992)', 'Topo, El (1970)', 'Pierrot le fou (1965)', 'Wild Parrots of Telegraph Hill, The (2003)', 'Chungking Express (Chung Hing sam lam) (1994)']


In [21]:
# Take three distinct user ids from the ratings.
# NOTE(review): there is no ordering before limit(3), so which three users come back
# is presumably not guaranteed to be stable -- confirm if reproducibility matters.
distinct_users = ratings.select(als.getUserCol()).distinct()
users = distinct_users.limit(3)
users.show()

+------+
|userId|
+------+
|   148|
|   463|
|   471|
+------+



In [22]:
# Top-5 movie recommendations for a small set of users.
# The user subset is rebuilt here, so this cell does not depend on the previous one.
user_col = als.getUserCol()
users = ratings.select(user_col).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 5)
userSubsetRecs.show(truncate=False)



+------+-----------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                |
+------+-----------------------------------------------------------------------------------------------+
|471   |[{89904, 4.9865155}, {1411, 4.9009767}, {68945, 4.808493}, {3379, 4.808493}, {49932, 4.625464}]|
|463   |[{33649, 5.0806723}, {945, 4.935425}, {69524, 4.735538}, {78836, 4.7312837}, {7071, 4.72177}]  |
|148   |[{720, 4.8241024}, {33649, 4.63185}, {98491, 4.4597573}, {160718, 4.385815}, {928, 4.3759665}] |
+------+-----------------------------------------------------------------------------------------------+



In [23]:
# Generate top 5 user recommendations for a set of movies.
# The subset gets its own name: binding it to `movies` would overwrite the movies
# DataFrame loaded from movies.csv, so re-running the earlier cells that use `movies`
# would silently operate on this 3-row id-only frame instead.
movieSubset = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movieSubset, 5)
movieSubSetRecs.show(truncate=False)

+-------+---------------------------------------------------------------------------------------+
|movieId|recommendations                                                                        |
+-------+---------------------------------------------------------------------------------------+
|1580   |[{53, 4.917813}, {543, 4.8733044}, {276, 4.6668625}, {492, 4.612564}, {243, 4.5983534}]|
|3175   |[{53, 4.89008}, {543, 4.502476}, {276, 4.4760995}, {452, 4.465859}, {93, 4.452167}]    |
|2366   |[{423, 5.036814}, {35, 4.8221164}, {206, 4.7859344}, {43, 4.6917186}, {99, 4.636307}]  |
+-------+---------------------------------------------------------------------------------------+



In [24]:
%%time
# Generate top 10 movie recommendations for each user.
# %%time reports CPU and wall-clock time for the whole cell; show() prints only the
# first 20 rows.
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{68945, 5.763310...|
|     3|[{70946, 5.116101...|
|     5|[{3089, 5.1034756...|
|     6|[{33649, 5.210644...|
|     9|[{6380, 5.2470164...|
|    12|[{77846, 6.226051...|
|    13|[{161582, 5.36127...|
|    15|[{3742, 4.8574586...|
|    16|[{68945, 4.528044...|
|    17|[{33649, 5.221736...|
|    19|[{1949, 4.14611},...|
|    20|[{1262, 5.42218},...|
|    22|[{3223, 5.2736874...|
|    26|[{68945, 4.57121}...|
|    27|[{33649, 4.963053...|
|    28|[{7842, 4.522045}...|
|    31|[{3404, 5.308219}...|
|    34|[{7842, 5.5971084...|
|    35|[{3404, 5.5153317...|
|    37|[{86345, 5.617925...|
+------+--------------------+
only showing top 20 rows

CPU times: user 21.1 ms, sys: 979 µs, total: 22.1 ms
Wall time: 3.87 s


In [25]:
%%time
# Generate top 10 user recommendations for each movie.
# %%time reports CPU and wall-clock time for the whole cell; show() prints only the
# first 20 rows.
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|      1|[{53, 5.399724}, ...|
|      3|[{53, 4.6850886},...|
|      5|[{53, 4.3066616},...|
|      6|[{53, 5.468882}, ...|
|      9|[{543, 4.694057},...|
|     12|[{543, 4.3111777}...|
|     13|[{43, 4.038104}, ...|
|     15|[{543, 5.1243367}...|
|     16|[{53, 5.26572}, {...|
|     17|[{206, 4.8083835}...|
|     19|[{53, 4.247048}, ...|
|     20|[{12, 4.047288}, ...|
|     22|[{53, 4.677027}, ...|
|     26|[{53, 4.638265}, ...|
|     27|[{12, 4.287132}, ...|
|     28|[{12, 5.417323}, ...|
|     31|[{43, 4.913698}, ...|
|     34|[{99, 4.884393}, ...|
|     40|[{544, 4.926397},...|
|     41|[{191, 4.6762753}...|
+-------+--------------------+
only showing top 20 rows

CPU times: user 32.7 ms, sys: 9.5 ms, total: 42.2 ms
Wall time: 9.07 s


### **도전**

In [30]:
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS

# Hand-written ratings for a made-up user, to be merged into the real ratings.
newUserId = 99999999
newUserTimestamp = 123456789
# (movieId, rating) pairs. 59315 / 77561 / 102125 are Iron Man, Iron Man 2 and Iron Man 3
# according to the title lookup earlier in the notebook.
# Ratings are written as floats because the rating column of `ratings` is a double.
newUserRatings = [
    (59315, 5.0),
    (77561, 4.0),
    (102125, 4.0),
    (190297, 1.0),
    (155384, 2.0),
    (170763, 1.0),
]

# Build the frame with the exact schema of `ratings` rather than letting Spark infer
# the column types, so the union below combines identically typed columns instead of
# relying on implicit type widening.
customUserData = spark.createDataFrame(
    [Row(newUserId, movieId, rating, newUserTimestamp) for movieId, rating in newUserRatings],
    schema=ratings.schema,
)

ratings_with_userData = ratings.union(customUserData)

# Retrain on ALL ratings plus the new user. This rebinds `als` and `model`: from here on
# `model` is no longer the model fitted on the training split.
# seed: fixes the random initialisation so the recommendations are reproducible.
als = ALS(rank=50, regParam=0.05, maxIter=10, seed=42,
          userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings_with_userData)

In [32]:
import pandas as pd

# Show full cell contents so the recommendation and title lists are not cut off.
pd.set_option('display.max_colwidth', None)

# Top-10 recommendations for the hand-made user, with movie ids resolved to titles.
# Note: the list can contain movies the user already rated (Iron Man is ranked first
# in the output below).
newUserIds = customUserData.select("userId").distinct()
userSubsetRecs = model.recommendForUserSubset(newUserIds, 10).toPandas()
userSubsetRecs['title'] = userSubsetRecs['recommendations'].apply(
    lambda recs: [movies_pd.loc[rec[0]].title for rec in recs]
)
userSubsetRecs



Unnamed: 0,userId,recommendations,title
0,99999999,"[(59315, 4.878103256225586), (2571, 4.55980920791626), (58559, 4.5289716720581055), (34405, 4.464439392089844), (89745, 4.455581188201904), (1198, 4.381580352783203), (1210, 4.371745586395264), (1196, 4.364892959594727), (1291, 4.363119125366211), (57669, 4.33685827255249)]","[Iron Man (2008), Matrix, The (1999), Dark Knight, The (2008), Serenity (2005), Avengers, The (2012), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Star Wars: Episode VI - Return of the Jedi (1983), Star Wars: Episode V - The Empire Strikes Back (1980), Indiana Jones and the Last Crusade (1989), In Bruges (2008)]"
