# "[Spark] PySpark 추천시스템 모델"
> pyspark 추천시스템 모델

- toc: true 
- badges: true
- comments: true
- categories: [Spark]
- tags: [spark, pyspark, recommendation]

In [1]:
import os
MINIO_ACCESS_KEY = os.environ['MINIO_ACCESS_KEY']
MINIO_SECRET_KEY = os.environ['MINIO_SECRET_KEY']

spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.access.key", MINIO_ACCESS_KEY)
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.secret.key", MINIO_SECRET_KEY)
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.endpoint", "http://lab101:10170")
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.connection.ssl.enabled", "false")
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.path.style.access", "true")
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("com.amazonaws.services.s3.enableV2", "true")
spark.sparkContext._jsc.hadoopConfiguration()\
    .set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

In [9]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

ratings = spark.read.text("s3a://data/sample_movielens_ratings.txt")\
    .rdd.toDF()\
    .selectExpr("split(value, '::') as col")\
    .selectExpr(
        "cast(col[0] as int) as userId",
        "cast(col[1] as int) as movieId",
        "cast(col[2] as int) as rating",
        "cast(col[3] as long) as timestamp"
    )

In [10]:
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
|     0|     23|     1|1424380312|
|     0|     26|     3|1424380312|
|     0|     27|     1|1424380312|
|     0|     28|     1|1424380312|
|     0|     29|     1|1424380312|
|     0|     30|     1|1424380312|
|     0|     31|     1|1424380312|
|     0|     34|     1|1424380312|
|     0|     37|     1|1424380312|
|     0|     41|     2|1424380312|
+------+-------+------+----------+
only showing top 20 rows



In [11]:
training, test = ratings.randomSplit([0.8, 0.2])
als = ALS()\
    .setMaxIter(5)\
    .setRegParam(0.01)\
    .setUserCol("userId")\
    .setItemCol("movieId")\
    .setRatingCol("rating")
print(als.explainParams())

alpha: alpha for implicit preference (default: 1.0)
blockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 4096)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
coldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan)
finalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)
implicitPrefs: whether to use implicit preference (default: False)
intermediateStorageLevel: StorageLevel for interme

In [12]:
alsModel = als.fit(training)
predictions = alsModel.transform(test)

In [16]:
alsModel.recommendForAllUsers(10)\
    .selectExpr("userId", "explode(recommendations)").show()

+------+---------------+
|userId|            col|
+------+---------------+
|    20| {22, 4.846066}|
|    20| {18, 4.623545}|
|    20|{75, 4.0272512}|
|    20|{27, 4.0242887}|
|    20| {77, 3.781201}|
|    20| {62, 3.671545}|
|    20|{36, 3.2965648}|
|    20|{74, 3.2775805}|
|    20| {80, 3.175223}|
|    20|{94, 3.1339927}|
|    10| {74, 4.163344}|
|    10|{87, 4.0253325}|
|    10|  {2, 3.832425}|
|    10|{53, 3.7631018}|
|    10| {40, 3.762528}|
|    10|{70, 3.1891506}|
|    10| {59, 3.128613}|
|    10|{42, 3.0709763}|
|    10|{32, 3.0483675}|
|    10|{49, 3.0007033}|
+------+---------------+
only showing top 20 rows



In [18]:
alsModel.recommendForAllItems(10)\
    .selectExpr("movieId", "explode(recommendations)").show()

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     20| {17, 4.662134}|
|     20| {22, 4.212014}|
|     20| {9, 3.8890233}|
|     20|{23, 3.8233962}|
|     20|{12, 3.7794924}|
|     20|{29, 3.4634056}|
|     20| {5, 3.2149444}|
|     20|{10, 2.4653668}|
|     20|{24, 2.3592012}|
|     20| {2, 2.0555623}|
|     40|{16, 4.2417116}|
|     40|{19, 3.9465256}|
|     40| {8, 3.9215052}|
|     40| {2, 3.9037194}|
|     40| {10, 3.762528}|
|     40|{21, 3.5958273}|
|     40|  {9, 3.067248}|
|     40| {5, 3.0099266}|
|     40| {4, 2.7588284}|
|     40| {23, 2.628215}|
+-------+---------------+
only showing top 20 rows



# 평가기

In [21]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("rating")\
    .setPredictionCol("prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = %f" % rmse)

Root-mean-square error = 1.899920


# 성과 평가지표

### 회귀 평가지표

In [37]:
from pyspark.mllib.evaluation import RegressionMetrics

regComparison = predictions.select("rating", "prediction")\
    .rdd.map(lambda x: (x(0), x(1)))
metrics = RegressionMetrics(regComparison)

## 순위 평가지표

In [39]:
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
from pyspark.sql.functions import col, expr

perUserActual = predictions\
    .where("rating > 2.5")\
    .groupby("userId")\
    .agg(expr("collect_set(movieId) as movies"))

In [40]:
perUserPredictions = predictions\
    .orderBy(col("userId"), expr("prediction DESC"))\
    .groupby("userId")\
    .agg(expr("collect_list(movieId) as movies"))

In [41]:
perUserActualvPred = perUserActual.join(perUserPredictions, ["userId"])\
    .rdd.map(lambda row: (row[1], row[2][:15]))
ranks = RankingMetrics(perUserActualvPred)

In [42]:
ranks.meanAveragePrecision

0.29497233535695083

In [43]:
ranks.precisionAt(5)

0.5846153846153845