In [1]:
import findspark
# Initialise findspark before any pyspark import in later cells
# (presumably this makes the local pyspark installation importable from
# this kernel -- confirm against the findspark documentation).
findspark.init()
# Bare last expression: the value it returns is rendered as the cell output.
findspark.find()

'C:\\ProgramData\\anaconda3\\Lib\\site-packages\\pyspark'

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

if __name__ == "__main__":
    # Fixed random seed shared by the train/test split and by ALS, so that
    # re-running the cell on a fresh kernel yields the same split, the same
    # model and therefore the same RMSE and recommendations.
    SEED = 42

    spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()

    # Everything that uses the session runs inside try/finally so the session
    # is stopped even when loading, fitting or evaluation raises.
    try:
        # Each input line is split on "::" into four fields, used in order as
        # userId, movieId, rating and timestamp.
        lines = spark.read.text("sample_movielens_ratings.txt").rdd
        parts = lines.map(lambda row: row.value.split("::"))
        ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                             rating=float(p[2]), timestamp=int(p[3])))
        ratings = spark.createDataFrame(ratingsRDD)
        # 80% of the ratings for training, 20% held out for evaluation.
        (training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

        # Build the recommendation model using ALS on the training data
        # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
        als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop", seed=SEED)
        model = als.fit(training)

        # Evaluate the model by computing the RMSE on the test data
        predictions = model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        print(f"Root-mean-square error = {rmse}")

        # Generate top 10 movie recommendations for each user
        userRecs = model.recommendForAllUsers(10)
        # Generate top 10 user recommendations for each movie
        movieRecs = model.recommendForAllItems(10)

        # Generate top 10 movie recommendations for a specified set of users
        users = ratings.select(als.getUserCol()).distinct().limit(3)
        userSubsetRecs = model.recommendForUserSubset(users, 10)
        # Generate top 10 user recommendations for a specified set of movies
        movies = ratings.select(als.getItemCol()).distinct().limit(3)
        movieSubSetRecs = model.recommendForItemSubset(movies, 10)

        # show() prints the first 20 rows of each recommendation table.
        userRecs.show()
        movieRecs.show()
        userSubsetRecs.show()
        movieSubSetRecs.show()
    finally:
        # Always release the Spark session, including on failure.
        spark.stop()

Root-mean-square error = 1.983048757799533
+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 4.541135}, ...|
|    10|[{7, 5.2663555}, ...|
|     0|[{39, 4.815112}, ...|
|     1|[{80, 4.130293}, ...|
|    21|[{29, 5.1546144},...|
|    11|[{23, 5.441157}, ...|
|    12|[{27, 5.166915}, ...|
|    22|[{85, 5.9985123},...|
|     2|[{93, 5.137864}, ...|
|    13|[{93, 3.269409}, ...|
|     3|[{75, 5.4319606},...|
|    23|[{49, 5.2146263},...|
|     4|[{53, 4.8758783},...|
|    24|[{90, 5.3957944},...|
|    14|[{41, 5.009279}, ...|
|     5|[{55, 4.159607}, ...|
|    15|[{27, 4.877583}, ...|
|    25|[{33, 3.9315107},...|
|    26|[{75, 6.1195173},...|
|     6|[{25, 5.1269073},...|
+------+--------------------+
only showing top 20 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.5583105},...|
|     40|[{2, 3.5505679}, ...|
|     10|[{17, 3.7931275},...|
|     50|[{23, 4.083418},