In [None]:
#!sudo apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import findspark

# Point findspark at the Spark distribution unpacked by the setup cell so that
# the pyspark package becomes importable.
findspark.init(spark_home="/content/spark-3.2.4-bin-hadoop3.2")

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the session; `sc` is the underlying SparkContext used
# later for RDD-level calls.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
from google.colab import drive

# Directory prefix for the CSV inputs; it lives under the Drive mount point
# used in the mount call below.
gpath = '/gdrive/MyDrive/data/'
drive.mount(mountpoint='/gdrive')

Mounted at /content/drive


In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

colNames = ["movieId", "title", "genres"]

# Explicit schema: movieId is an integer, every other column is a string;
# all fields are nullable.
movies_schema = StructType([
    StructField(
        column,
        IntegerType() if column == "movieId" else StringType(),
        True,
    )
    for column in colNames
])

# Read the movies CSV (first line is a header) using the schema above.
movies = spark.read.csv(gpath + "movies.csv", schema=movies_schema, header=True)
movies.take(1)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy')]

In [None]:
# Print the column names, types and nullability of the movies DataFrame.
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# Pull the movies table into pandas, indexed by movieId, so that titles can be
# looked up by id in later cells.
movies_pd = (
    movies
    .toPandas()
    .set_index(keys="movieId")
)

In [None]:
# Rows whose title contains "Iron Man".
movies_pd.loc[movies_pd['title'].str.contains("Iron Man")]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
59315,Iron Man (2008),Action|Adventure|Sci-Fi
77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX
102007,"Invincible Iron Man, The (2007)",Animation
102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
142056,Iron Man & Hulk: Heroes United (2013),Action|Adventure|Animation
167296,Iron Man (1931),Drama


In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

colNames = ["userId", "movieId", "rating", "timestamp"]

# Explicit schema: rating is a double, every other column is an integer;
# all fields are nullable.
ratings_schema = StructType([
    StructField(
        column,
        DoubleType() if column == "rating" else IntegerType(),
        True,
    )
    for column in colNames
])

# Read the ratings CSV (first line is a header) using the schema above.
ratings = spark.read.csv(gpath + "ratings.csv", schema=ratings_schema, header=True)

ratings.take(1)

[Row(userId=1, movieId=1, rating=4.0, timestamp=964982703)]

In [None]:
# 80/20 train/test split. A fixed seed makes the split repeatable across
# Restart & Run All, so the evaluation metric can be compared between runs.
(trainData, testData) = ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.recommendation import ALS

# Fit an ALS model on the training ratings.
# seed is fixed so that training is repeatable between runs; all other
# hyper-parameters are left at the library defaults.
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(trainData)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out ratings and compare the predictions against the true
# ratings using root-mean-square error.
predictions = model.transform(testData)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.8915410004900678


In [None]:
# Learned user factor vectors, ordered by user id, printed without truncation.
model.userFactors.sort("id").show(truncate=False)

+---+------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                      |
+---+------------------------------------------------------------------------------------------------------------------------------+
|1  |[0.026658675, 0.050094772, 0.34203038, -1.0277818, -1.1790694, 0.019153621, -0.55622655, -0.18103118, -0.15816529, -1.5311037]|
|2  |[-0.13180351, -0.21119423, 1.0054077, -1.0463094, -0.583893, 0.2150482, -0.59344554, -0.49889863, -0.47876757, -0.95016825]   |
|3  |[0.39621675, -1.4059039, 0.35690656, 0.11502634, -0.14439072, 0.16035815, -0.54317135, 0.999428, -0.9490114, -0.8396593]      |
|4  |[-0.5947036, 0.9239907, 0.1736886, -1.3425721, 0.2711473, 0.09131307, -0.9941009, -0.18231696, 0.08523861, -0.9528297]        |
|5  |[-0.6887873, 0.11472649, 0.3334479, -1.3033222, -0.76325756, 0.3

In [None]:
# Learned item factor vectors, ordered by item id, printed without truncation.
model.itemFactors.sort("id").show(truncate=False)

+---+-------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                       |
+---+-------------------------------------------------------------------------------------------------------------------------------+
|1  |[-0.6834658, 0.24385622, 0.24378018, -1.199012, -0.8763653, 0.035134513, -0.5130618, -0.6641814, -0.4561035, -1.1935326]       |
|2  |[-0.17787144, 0.06916605, 0.37821952, -1.0957385, -0.97333443, -0.4602979, -0.2643417, -0.37301308, -0.5525413, -0.9706406]    |
|3  |[0.6463989, 0.27572334, 0.32561952, -1.1257715, -0.76067644, -0.5474669, -0.5654235, -0.2924987, -0.30782652, -0.95507586]     |
|4  |[-0.24829096, -0.120367736, 0.18319902, -0.7773587, -0.4407086, -0.43577603, -0.14062029, -0.43735602, -0.72162634, -0.5143414]|
|5  |[-0.10184942, 0.21559429, 0.63349867, -0.98673403, -0.690

In [None]:
# Who has rated the most movies?
# The per-user counts are aggregated in Spark, so only one (userId, count) row
# per user is brought to the driver instead of every individual rating row.
from collections import Counter
rating_counts = ratings.groupBy("userId").count().collect()
# `c` maps userId -> number of ratings; a later cell reads it via most_common().
c = Counter({int(row["userId"]): row["count"] for row in rating_counts})
print(c.most_common(5))

[(414, 2698), (599, 2478), (474, 2108), (448, 1864), (274, 1346)]


In [None]:
# Recommend for one user: take the user with the most ratings and list every
# rating they gave, with the movie title attached.
targetUserID = c.most_common(1)[0][0]
userWatched = ratings.filter(ratings.userId == targetUserID)
userWatchedPD = userWatched.toPandas()
# Look each movieId up in the pandas movies table (indexed by movieId).
userWatchedPD['title'] = userWatchedPD['movieId'].map(
    lambda movie_id: movies_pd.loc[movie_id, 'title']
)
userWatchedPD

Unnamed: 0,userId,movieId,rating,timestamp,title
0,414,1,4.0,961438127,Toy Story (1995)
1,414,2,3.0,961594981,Jumanji (1995)
2,414,3,4.0,961439278,Grumpier Old Men (1995)
3,414,5,2.0,961437647,Father of the Bride Part II (1995)
4,414,6,3.0,961515642,Heat (1995)
...,...,...,...,...,...
2693,414,180045,4.0,1515207301,Molly's Game (2017)
2694,414,180497,4.0,1525548614,The Post (2017)
2695,414,180985,3.5,1527978072,The Greatest Showman (2017)
2696,414,184791,2.5,1519592410,Fred Armisen: Standup for Drummers (2018)


In [None]:
# Display the first rows of the target user's ratings (the recorded output
# shows the top 20 rows).
userWatched.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|   414|      1|   4.0| 961438127|
|   414|      2|   3.0| 961594981|
|   414|      3|   4.0| 961439278|
|   414|      5|   2.0| 961437647|
|   414|      6|   3.0| 961515642|
|   414|      7|   3.0| 961439170|
|   414|      8|   3.0| 961594849|
|   414|     10|   3.0| 961515863|
|   414|     11|   5.0|1052148205|
|   414|     15|   2.0| 961514611|
|   414|     16|   3.0| 961517557|
|   414|     17|   4.0| 961513829|
|   414|     18|   3.0| 961682128|
|   414|     21|   4.0| 961438199|
|   414|     22|   3.0| 961518227|
|   414|     23|   2.0| 961682276|
|   414|     24|   3.0| 961436964|
|   414|     25|   3.0| 961517140|
|   414|     27|   2.0| 961518812|
|   414|     31|   3.0| 961518520|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
# Top-10 movie recommendations for the target user.
# Only the user id column is needed to identify the user, so select it before
# de-duplicating; distinct() on the full rating rows would de-duplicate
# (userId, movieId, rating, timestamp) tuples rather than users.
user = userWatched.select("userId").distinct()
userSubsetRecs = model.recommendForUserSubset(user, 10).toPandas()
# Each recommendation is a (movieId, predicted rating) pair; map ids to titles.
userSubsetRecs['title'] = userSubsetRecs['recommendations'].apply(lambda x: [movies_pd.loc[i[0]].title for i in x])
for item in userSubsetRecs.title:
    print(item)

['Jetée, La (1962)', 'Dragon Ball Z: The History of Trunks (Doragon bôru Z: Zetsubô e no hankô!! Nokosareta chô senshi - Gohan to Torankusu) (1993)', 'On the Beach (1959)', 'Man Who Shot Liberty Valance, The (1962)', 'Belle époque (1992)', 'Cherish (2002)', 'Two Family House (2000)', 'Rain (2001)', 'Man for All Seasons, A (1966)', 'Wallace & Gromit: The Best of Aardman Animation (1996)']


In [None]:
# Take three distinct user ids from the ratings table.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
users.show()

+------+
|userId|
+------+
|   148|
|   463|
|   471|
+------+



In [None]:
# Top-5 movie recommendations for a small set of users.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, 5)
userSubsetRecs.show(truncate=False)

+------+------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                 |
+------+------------------------------------------------------------------------------------------------+
|471   |[{8477, 4.952992}, {89904, 4.6396623}, {148881, 4.617899}, {176371, 4.599914}, {6818, 4.59152}] |
|463   |[{96004, 5.025163}, {3379, 5.025163}, {33649, 4.889099}, {86377, 4.8694377}, {1232, 4.759979}]  |
|148   |[{51931, 4.7387958}, {8477, 4.5932293}, {112804, 4.5931144}, {33649, 4.522895}, {7121, 4.52123}]|
+------+------------------------------------------------------------------------------------------------+



In [None]:
# Generate top 5 user recommendations for a set of movies.
# The id subset gets its own name: binding it to `movies` would overwrite the
# movies DataFrame loaded earlier, and any cell re-run afterwards that expects
# the (movieId, title, genres) table would silently use this 3-row id list.
movieSubset = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movieSubset, 5)
movieSubSetRecs.show(truncate=False)

+-------+---------------------------------------------------------------------------------------+
|movieId|recommendations                                                                        |
+-------+---------------------------------------------------------------------------------------+
|1580   |[{53, 5.033257}, {543, 4.612631}, {452, 4.548948}, {584, 4.514587}, {93, 4.4775667}]   |
|3175   |[{53, 4.9551845}, {154, 4.587667}, {93, 4.5635114}, {171, 4.5138845}, {276, 4.5113344}]|
|2366   |[{548, 4.837672}, {375, 4.640376}, {244, 4.480765}, {25, 4.4659805}, {201, 4.465281}]  |
+-------+---------------------------------------------------------------------------------------+



In [None]:
%%time
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{8477, 4.952992}...|
|   463|[{96004, 5.025163...|
|   496|[{26171, 4.316339...|
|   148|[{51931, 4.738795...|
|   540|[{96004, 5.517078...|
|   392|[{1218, 5.1996703...|
|   243|[{32892, 5.971028...|
|    31|[{92643, 5.35793}...|
|   516|[{132333, 4.58233...|
|   580|[{86377, 5.076369...|
|   251|[{96004, 5.513946...|
|   451|[{1411, 5.395527}...|
|    85|[{5034, 5.717403}...|
|   137|[{8477, 5.06932},...|
|    65|[{51931, 4.868758...|
|   458|[{33649, 5.39336}...|
|   481|[{3067, 4.15787},...|
|    53|[{33649, 7.044965...|
|   255|[{4102, 4.6839924...|
|   588|[{96004, 4.822596...|
+------+--------------------+
only showing top 20 rows

CPU times: user 30.9 ms, sys: 2.6 ms, total: 33.5 ms
Wall time: 4.69 s


In [None]:
%%time
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[{53, 5.033257}, ...|
|   4900|[{236, 5.091975},...|
|   5300|[{597, 4.0000997}...|
|   6620|[{518, 5.17587}, ...|
|   7340|[{43, 4.1122465},...|
|  32460|[{452, 5.053531},...|
|  54190|[{53, 5.3884277},...|
|    471|[{536, 4.8125086}...|
|   1591|[{112, 4.5880437}...|
| 140541|[{12, 4.048854}, ...|
|   1342|[{108, 3.6414864}...|
|   2122|[{537, 4.2193813}...|
|   2142|[{538, 4.0562096}...|
|   7982|[{53, 5.517194}, ...|
|  44022|[{53, 4.639922}, ...|
| 141422|[{548, 2.9230604}...|
| 144522|[{53, 2.9531963},...|
|    833|[{43, 4.605943}, ...|
|   5803|[{388, 3.6647263}...|
|   7833|[{154, 4.014318},...|
+-------+--------------------+
only showing top 20 rows

CPU times: user 56.7 ms, sys: 6.86 ms, total: 63.5 ms
Wall time: 10.3 s


### **도전**

In [None]:
from pyspark.sql import Row

# Hand-written ratings for a synthetic user (id 99999999), as
# (userId, movieId, rating, timestamp) tuples. Movie ids 59315, 77561 and
# 102125 are the Iron Man titles found by the title search earlier in this
# notebook; they are the ones rated highly here.
customUserData = sc.parallelize(
    [Row(99999999, 59315, 5, 123456789),
     Row(99999999, 77561, 4, 123456789),
     Row(99999999, 102125, 4, 123456789),
     Row(99999999, 190297, 1, 123456789),
     Row(99999999, 155384, 2, 123456789),
     Row(99999999, 170763, 1, 123456789),
]
).toDF(["userId", "movieId", "rating", "timestamp"])

# Append the synthetic user's rows to the full ratings table. union() matches
# columns by position, so the column order above must follow `ratings`.
ratings_with_userData = ratings.union(customUserData)

from pyspark.ml.recommendation import ALS

# Retrain on the combined data so the model learns factors for the new user.
# NOTE: this rebinds `als` and `model` from the earlier training cells.
als = ALS(rank=50, regParam=0.05, maxIter=10,
          userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings_with_userData)

In [None]:
import pandas as pd

# Show full cell contents so the recommendation and title lists are not cut off.
pd.set_option('display.max_colwidth', None)
# Top-10 recommendations for the synthetic user added in the previous cell.
userSubsetRecs = model.recommendForUserSubset(customUserData.select("userId").distinct(), 10).toPandas()
# Each recommendation is a (movieId, predicted rating) pair; map ids to titles.
userSubsetRecs['title'] = userSubsetRecs['recommendations'].apply(lambda x: [movies_pd.loc[i[0]].title for i in x])
userSubsetRecs

Unnamed: 0,userId,recommendations,title
0,99999999,"[(59315, 4.871374130249023), (58559, 4.549170970916748), (2571, 4.507725715637207), (89745, 4.506622791290283), (1196, 4.376098155975342), (57669, 4.36331844329834), (78499, 4.363014221191406), (112852, 4.3263840675354), (60069, 4.314029693603516), (1266, 4.308404445648193)]","[Iron Man (2008), Dark Knight, The (2008), Matrix, The (1999), Avengers, The (2012), Star Wars: Episode V - The Empire Strikes Back (1980), In Bruges (2008), Toy Story 3 (2010), Guardians of the Galaxy (2014), WALL·E (2008), Unforgiven (1992)]"
