In [ ]:
// Scratch value; none of the cells visible below refer to it.
val a: Int = 1

### Recommending Interesting Movies To Your Users

In [ ]:
:sh cat /opt/SparkDatasets/recommender/ratings.csv

In [ ]:
:sh cat /opt/SparkDatasets/recommender/ratings_header.csv

In [ ]:
/**
 * One record of the ratings file: a user id, a movie id and a rating.
 *
 * The schema used to read the ratings CSV is derived from these fields via
 * `Encoders.product[RatingRecord]`.
 * NOTE(review): the CSV reader presumably maps file columns to these fields
 * by position, so the field order should follow the file's column order —
 * confirm.
 */
case class RatingRecord(
  user_id: Int,
  movie_id: Int,
  rating: Double
)

In [ ]:
import org.apache.spark.sql.{DataFrame, Dataset, Encoders}
import org.apache.spark.sql.types.StructType

// Schema for the ratings file, derived from the fields of RatingRecord.
val ratingRecordSchema: StructType = Encoders.product[RatingRecord].schema

// Path of the ratings CSV.
val fileContainingRatings = "/opt/SparkDatasets/recommender/ratings.csv"

// Read the CSV with the explicit schema, repartition it to the context's
// default parallelism and expose the rows as typed RatingRecord values.
val rawRatingData: Dataset[RatingRecord] = {
  val partitionCount = sparkSession.sparkContext.defaultParallelism
  val ratingRows = sparkSession.read.schema(ratingRecordSchema).csv(fileContainingRatings)
  ratingRows.repartition(partitionCount).as[RatingRecord]
}

// Cell result: the ratings ordered by user, then by movie.
rawRatingData.orderBy($"user_id", $"movie_id")

In [ ]:
// Shift every rating down by 3.0. withColumn with an existing column name
// overwrites that column in place, so no temporary column, drop or rename is
// needed and the column order (user_id, movie_id, rating) is unchanged.
// NOTE(review): 3.0 is presumably the neutral point of the rating scale, so
// that positive values mean "liked" — confirm against the dataset.
val ratingData: DataFrame =
  rawRatingData.withColumn("rating", $"rating" - 3.0)

// Side-effecting calls are written with explicit parentheses.
// NOTE(review): count() is presumably here to materialise the cache before
// the data is reused — confirm.
ratingData.cache()

ratingData.count()

// Cell result: the shifted ratings ordered by user, then by movie.
ratingData.sort($"user_id", $"movie_id")

In [ ]:
import org.apache.spark.ml.recommendation.{ALS, ALSModel}

// ALS estimator: reads users, movies and ratings from the "user_id",
// "movie_id" and "rating" columns; 5 iterations, regularisation 0.01.
val als: ALS = new ALS()
  .setUserCol("user_id")
  .setItemCol("movie_id")
  .setRatingCol("rating")
  .setMaxIter(5)
  .setRegParam(0.01)

In [ ]:
// Fit the ALS estimator `als` on `ratingData` (the ratings shifted by 3.0).
val model: ALSModel = als.fit(ratingData)

In [ ]:
// Ask the model for 10 recommendations per user.
// NOTE(review): only one user's rows are kept below; if the Spark version in
// use offers ALSModel.recommendForUserSubset it may be cheaper — verify.
val allRecsDF: DataFrame = model.recommendForAllUsers(10)

allRecsDF.cache()
allRecsDF.count()

// Keep only the recommendations whose user_id is 9.
val recsForNewUserDF: DataFrame = allRecsDF.where($"user_id" === 9)

// Cell result.
recsForNewUserDF

In [ ]:
import org.apache.spark.sql.{functions => f}

// One row per user, with the ids of every movie that user has a rating for
// collected into the list column "movies_seen".
val moviesSeenDF: DataFrame = {
  val seenMovieIds = f.collect_list($"movie_id").as("movies_seen")
  ratingData.groupBy($"user_id").agg(seenMovieIds)
}

moviesSeenDF.cache()
moviesSeenDF.count()

// Cell result.
moviesSeenDF

In [ ]:
// Attach each selected user's list of rated movies to their recommendations,
// joining on the shared "user_id" column.
val joinDF: DataFrame = recsForNewUserDF.join(moviesSeenDF, Seq("user_id"))

joinDF.cache()
joinDF.count()

// Cell result.
joinDF

In [ ]:
/**
 * A recommended movie together with the score it was recommended with.
 *
 * @param rec_movie_id id of the recommended movie
 * @param rank         score attached to the recommendation
 */
case class MovieRank(
  rec_movie_id: Int,
  rank: Float
)

In [ ]:
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.Row

/**
 * Filters a list of recommendation rows and converts the survivors to
 * MovieRank values.
 *
 * A row is left out when its movie id occurs in `b` or when its rating is
 * less than or equal to zero. The remaining rows keep their original order.
 *
 * @param recs rows expected to hold exactly (movie_id: Int, rating: Float);
 *             any other shape raises a MatchError
 * @param b    movie ids to exclude
 * @return one MovieRank per row that was not left out
 */
def abc(recs: Seq[Row], b: Seq[Int]): Seq[MovieRank] = {
  // Set membership keeps the per-row exclusion check cheap.
  val excludedIds = b.toSet
  recs.flatMap {
    case Row(movie_id: Int, rating: Float) =>
      val rejected = excludedIds.contains(movie_id) || rating <= 0
      if (rejected) Nil else List(MovieRank(movie_id, rating))
  }
}

// Wraps abc as a Spark UDF so it can be applied to DataFrame columns.
val abcUDF: UserDefinedFunction =
  f.udf((recs: Seq[Row], b: Seq[Int]) => abc(recs, b))

In [ ]:
// Preview: joinDF plus a "proper_recs" column computed by abcUDF from the
// "recommendations" and "movies_seen" columns. The result is not assigned.
joinDF.withColumn("proper_recs", abcUDF($"recommendations", $"movies_seen"))

In [ ]:
// Final recommendations: one row per (user_id, rec_movie_id, rank), ordered
// by user and then by descending rank.
val properRecsForNewUserDF: DataFrame = {
  // Filter the raw recommendations through abcUDF.
  val withProperRecs =
    joinDF.withColumn("proper_recs", abcUDF($"recommendations", $"movies_seen"))

  // Turn the list column into one row per remaining recommendation.
  val oneRowPerRec =
    withProperRecs.withColumn("proper_rec", f.explode($"proper_recs"))

  // Pull the struct fields out into top-level columns.
  val flattened = oneRowPerRec.select(
    $"user_id",
    $"proper_rec.rec_movie_id".as("rec_movie_id"),
    $"proper_rec.rank".as("rank")
  )

  flattened.orderBy($"user_id", $"rank".desc)
}

// Cell result.
properRecsForNewUserDF