# Chapter 3: Recommending Music and the Audioscrobbler Data Set

## Preparing the Data

In [1]:
// Read the user/artist play data as a Dataset[String], one element per line of the file.
// Each line is later split on spaces into user, artist and count fields.
val rawUserArtistData = spark.read.textFile("../data/recommender/user_artist_data_small.txt")

rawUserArtistData = [value: string]


[value: string]

In [2]:
// Keep only the first two space-separated fields of every line, parsed as
// integer IDs; any further fields on the line are ignored.
val userArtistDF = rawUserArtistData.
    map { line =>
        line.split(' ') match {
            case Array(userField, artistField, _*) =>
                (userField.toInt, artistField.toInt)
        }
    }.
    toDF("user", "artist")

userArtistDF = [user: int, artist: int]


[user: int, artist: int]

In [3]:
userArtistDF.show(10)

+-------+-------+
|   user| artist|
+-------+-------+
|1059637|1000010|
|1059637|1000049|
|1059637|1000056|
|1059637|1000062|
|1059637|1000094|
|1059637|1000112|
|1059637|1000113|
|1059637|1000114|
|1059637|1000123|
|1059637|1000130|
+-------+-------+
only showing top 10 rows



In [4]:
// Build the (id, name) lookup table from the tab-separated artist file.
// A line is skipped when it contains no tab at all, or when the text before
// the first tab is not a valid integer.
val artistByID = spark.read.textFile("../data/recommender/artist_data_small.txt").
    flatMap { line =>
        val tabAt = line.indexOf('\t')
        if (tabAt < 0) {
            // No separator, so there is no name part to keep.
            None
        } else {
            try {
                // Everything from the first tab onwards is the name; trim drops the tab itself.
                Some((line.substring(0, tabAt).toInt, line.substring(tabAt).trim))
            } catch {
                case _: NumberFormatException => None
            }
        }
    }.
    toDF("id", "name")

artistByID = [id: int, name: string]


[id: int, name: string]

In [5]:
artistByID.show(10)

+-------+--------------------+
|     id|                name|
+-------+--------------------+
|1240105|        André Visior|
|1240113|           riow arai|
|1240132|Outkast & Rage Ag...|
|6776115|                小松正夫|
|1030848|      Raver's Nature|
|6671601|      Erguner, Kudsi|
|1106617|              Bloque|
|1240185|      Lexy & K. Paul|
|6671631|    Rev. W.M. Mosley|
|6671632|      Labelle, Patti|
+-------+--------------------+
only showing top 10 rows



In [6]:
// Collect the alias file to the driver as a Map from the first-column artist ID
// to the second-column artist ID (getCounts uses it to rewrite artist IDs).
//
// Malformed lines are skipped instead of failing the whole job, mirroring how
// artistByID treats bad input. A line is skipped when:
//   - it does not split into exactly two tab-separated fields
//     (e.g. a trailing tab makes split('\t') return a single element),
//   - its first field is empty, or
//   - either field is not a valid integer.
val artistAlias = spark.read.textFile("../data/recommender/artist_alias_small.txt").
    flatMap { line =>
        val parsed: Option[(Int, Int)] = line.split('\t') match {
            case Array(artist, alias) if artist.nonEmpty =>
                try {
                    Some((artist.toInt, alias.toInt))
                } catch {
                    case _: NumberFormatException => None
                }
            case _ => None
        }
        parsed
    }.collect().toMap

artistAlias = Map(1039896 -> 1277013, 1199139 -> 166, 1047491 -> 1003342, 9929763 -> 1003778, 2025676 -> 1001141, 9929753 -> 1007347, 2103190 -> 1002909, 1005489 -> 2003588, 2009180 -> 6751847, 1261152 -> 1007206, 6801236 -> 1013362, 6843530 -> 1260159, 1038051 -> 6684730, 10107676 -> 118, 1008455 -> 1020, 1351048 -> 71, 6606757 -> 1003888, 2061602 -> 6748393, 1289246 -> 1023527, 2036732 -> 71, 6614668 -> 7006467, 1014175 -> 1014175, 1197558 -> 1001943, 1012315 -> 1238836, 9928967 -> 15, 1055562 -> 1276662, 1037848 -> 1007201, 6923988 -> 2140107, 6634844 -> 1018408, 1244994 -> 1028445, 1042508 -> 1008824, 2126687 -> 1023928, 6806131 -> 1002061, 1017671 -> 1015311, 1275359 -> 1287322, 1134651 -> 659, 6662497 -> 1327588, 1036747 -> 1239516, 1314530...


Map(1039896 -> 1277013, 1199139 -> 166, 1047491 -> 1003342, 9929763 -> 1003778, 2025676 -> 1001141, 9929753 -> 1007347, 2103190 -> 1002909, 1005489 -> 2003588, 2009180 -> 6751847, 1261152 -> 1007206, 6801236 -> 1013362, 6843530 -> 1260159, 1038051 -> 6684730, 10107676 -> 118, 1008455 -> 1020, 1351048 -> 71, 6606757 -> 1003888, 2061602 -> 6748393, 1289246 -> 1023527, 2036732 -> 71, 6614668 -> 7006467, 1014175 -> 1014175, 1197558 -> 1001943, 1012315 -> 1238836, 9928967 -> 15, 1055562 -> 1276662, 1037848 -> 1007201, 6923988 -> 2140107, 6634844 -> 1018408, 1244994 -> 1028445, 1042508 -> 1008824, 2126687 -> 1023928, 6806131 -> 1002061, 1017671 -> 1015311, 1275359 -> 1287322, 1134651 -> 659, 6662497 -> 1327588, 1036747 -> 1239516, 1314530 -> 1237371, 6812143 -> 1017092, 1059884 -> 1288727, 2067429 -> 1034635, 6708740 -> 1089337, 6718488 -> 1059264, 1059007 -> 2653, 1139033 -> 1011219, 1024502 -> 1009571, 1071257 -> 1236897, 1139806 -> 1039249, 6745885 -> 1268522, 1234344 -> 1012125, 6777696 

In [7]:
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{DataFrame, Dataset}

In [8]:
/**
 * Turns raw play-count lines into a DataFrame with columns "user", "artist" and "count".
 *
 * Every line must consist of exactly three space-separated integers; the artist ID is
 * replaced by its entry in the broadcast alias map when one exists and is kept as-is otherwise.
 *
 * @param rawUserArtistData lines of the form "user artist count"
 * @param bArtistAlias      broadcast map consulted to rewrite artist IDs
 * @return one row per input line
 */
def getCounts(rawUserArtistData: Dataset[String],
              bArtistAlias: Broadcast[Map[Int, Int]]): DataFrame = {

    val parsed = rawUserArtistData.map { line =>
        line.split(" ").map(_.toInt) match {
            case Array(user, artist, count) =>
                (user, bArtistAlias.value.getOrElse(artist, artist), count)
        }
    }

    parsed.toDF("user", "artist", "count")
}

getCounts: (rawUserArtistData: org.apache.spark.sql.Dataset[String], bArtistAlias: org.apache.spark.broadcast.Broadcast[Map[Int,Int]])org.apache.spark.sql.DataFrame


In [9]:
// Wrap the driver-side alias map in a broadcast variable; getCounts reads it via bArtistAlias.value.
val bArtistAlias = sc.broadcast(artistAlias)

bArtistAlias = Broadcast(6)


Broadcast(6)

In [10]:
// Parse the raw lines with aliases applied and mark the result for caching,
// then run count() over it (its value is the cell's result).
val trainData = getCounts(rawUserArtistData, bArtistAlias).cache()
trainData.count()

trainData = [user: int, artist: int ... 1 more field]


49481

In [11]:
trainData.show()

+-------+-------+-----+
|   user| artist|count|
+-------+-------+-----+
|1059637|1000010|  238|
|1059637|1000049|    1|
|1059637|1000056|    1|
|1059637|1000062|   11|
|1059637|1000094|    1|
|1059637|1000112|  423|
|1059637|1000113|    5|
|1059637|1000114|    2|
|1059637|1000123|    2|
|1059637|1000130|19129|
|1059637|1000139|    4|
|1059637|1000241|  188|
|1059637|1000263|  180|
|1059637|1000289|    2|
|1059637|1000305|    1|
|1059637|1000320|   21|
|1059637|1000340|    1|
|1059637|1000427|   20|
|1059637|1000428|   12|
|1059637|1000433|   10|
+-------+-------+-----+
only showing top 20 rows



# Building a First Model

In [12]:
import org.apache.spark.ml.recommendation.{ALS, ALSModel}

In [13]:
// Fit ALS on trainData in implicit-preference mode with a fixed seed of 42.
val model = new ALS().
    // Column wiring: which trainData columns play which role.
    setUserCol("user").
    setItemCol("artist").
    setRatingCol("count").
    setPredictionCol("prediction").
    // Hyperparameters.
    setImplicitPrefs(true).
    setRank(10).
    setRegParam(0.01).
    setAlpha(1.0).
    setMaxIter(5).
    setSeed(42).
    fit(trainData)

model = als_36ce9b4bf90e


als_36ce9b4bf90e

# Spot Checking Recommendations

In [14]:
import org.apache.spark.sql.{functions => F}

import org.apache.spark.sql.{functions=>F}


In [15]:
// Pick one user and pull every artist ID that appears with that user in trainData to the driver.
val userID = 1059637
val existingArtistIDs = trainData.
    where(F.col("user") === userID).
    select("artist").
    as[Int].
    collect()

userID = 1059637
existingArtistIDs = Array(1000010, 1000049, 1000056, 1000062, 1000094, 1000112, 1000113, 1000114, 1000123, 1000130, 1000139, 1000241, 1000263, 1000289, 1000305, 1000320, 1000340, 1000427, 1000428, 1000433, 1000445, 1000527, 1000617, 1000632, 1000676, 1000790, 1000877, 1000890, 1000926, 1000999, 1001007, 1001027, 1001066, 1001068, 1001107, 1001117, 1001130, 1001198, 1001233, 1001249, 1001412, 1001439, 1001482, 1001487, 1001523, 1001530, 1001779, 1001809, 1001828, 1001894, 1002095, 1002128, 1002204, 1002216, 1002223, 1002225, 1002269, 1002289, 1002326, 1002560, 1002584, 1034635, 1002723, 1002734, 1002742, 1002850, 1002912, 1003159, 1003176, 1003241, 1003250, 1003568, 1003673, 1003681, 1003689, 1003727, 1003794, 1003853, 1003928, 1004201, 1004226, 1004274,...


[1000010, 1000049, 1000056, 1000062, 1000094, 1000112, 1000113, 1000114, 1000123, 1000130, 1000139, 1000241, 1000263, 1000289, 1000305, 1000320, 1000340, 1000427, 1000428, 1000433, 1000445, 1000527, 1000617, 1000632, 1000676, 1000790, 1000877, 1000890, 1000926, 1000999, 1001007, 1001027, 1001066, 1001068, 1001107, 1001117, 1001130, 1001198, 1001233, 1001249, 1001412, 1001439, 1001482, 1001487, 1001523, 1001530, 1001779, 1001809, 1001828, 1001894, 1002095, 1002128, 1002204, 1002216, 1002223, 1002225, 1002269, 1002289, 1002326, 1002560, 1002584, 1034635, 1002723, 1002734, 1002742, 1002850, 1002912, 1003159, 1003176, 1003241, 1003250, 1003568, 1003673, 1003681, 1003689, 1003727, 1003794, 1003853, 1003928, 1004201, 1004226, 1004274, 1004278, 1004294, 1004296, 1004301, 1004342, 1004392, 1004484, 1004574, 1005222, 1005363, 1005990, 1006029, 1006113, 1006123, 1006185, 1006229, 1006230, 1006234, 1006245, 1006287, 1006354, 1006411, 1006594, 1006597, 1006607, 1006628, 1006631, 1006633, 1006657, 

In [16]:
artistByID.filter(F.col("id").isin(existingArtistIDs:_*)).show()

+-------+--------------------+
|     id|                name|
+-------+--------------------+
|1002584|                Nena|
|1247913|       JamisonParker|
|1257062|    The Spill Canvas|
|1257410|tomandandy (ft. K...|
|1260489|        The Exciters|
|1260572|    Nightmare Of You|
|1261496|              J-Kwon|
|6992072|               angle|
|   5496| Echo & the Bunnymen|
|1150039|        Letter Kills|
|1283493|            An Angle|
|1006354|      Pedro the Lion|
|1085052|             Cordero|
|     78|             Sublime|
|1233389|The American Anal...|
|1234850|         The Hollies|
|1009156|                 Mae|
|1003853|        Les Savy Fav|
|1044920|   Matchbook Romance|
|   5659|             Midtown|
+-------+--------------------+
only showing top 20 rows



In [17]:
/**
 * Scores every artist in the model's item factors for a single user and keeps the
 * highest-scoring ones.
 *
 * Artists the user already appears with in the training data are not filtered out.
 *
 * @param model  fitted ALS model
 * @param userID user to score artists for
 * @param limit  maximum number of rows to return
 * @return DataFrame with columns "artist" and "prediction", sorted by descending prediction
 */
def makeRecommendations(
    model: ALSModel,
    userID: Int,
    limit: Int): DataFrame = {

    // One candidate row per artist known to the model, each paired with the requested user.
    val artistColumn = F.col("id").alias("artist")
    val candidates = model.itemFactors.
        select(artistColumn).
        withColumn("user", F.lit(userID))

    val scored = model.transform(candidates).select("artist", "prediction")

    scored.orderBy(F.col("prediction").desc).limit(limit)
}

makeRecommendations: (model: org.apache.spark.ml.recommendation.ALSModel, userID: Int, limit: Int)org.apache.spark.sql.DataFrame


In [18]:
// Turn on the spark.sql.crossJoin.enabled session setting before asking for recommendations.
// NOTE(review): presumably needed by the query that makeRecommendations builds — confirm for the Spark version in use.
spark.conf.set("spark.sql.crossJoin.enabled", "true")
// Top 5 scored artists for the user selected above (lazy DataFrame; nothing is computed until an action runs).
val topRecomendations = makeRecommendations(model, userID, 5)

topRecomendations = [artist: int, prediction: float]


[artist: int, prediction: float]

In [19]:
topRecomendations.show()

+-------+----------+
| artist|prediction|
+-------+----------+
|   2823| 1.7642742|
|   2884| 1.7638825|
|1003673| 1.5066037|
|1000597| 1.4267952|
|1008419| 1.2881987|
+-------+----------+



In [20]:
// Bring the recommended artist IDs to the driver, then look up and display their names.
val recommendedArtistIDs = topRecomendations.
    select("artist").
    as[Int].
    collect()
artistByID.
    where(F.col("id").isin(recommendedArtistIDs: _*)).
    show()

+-------+-----------------+
|     id|             name|
+-------+-----------------+
|1008419|        Nightwish|
|1000597|         Bon Jovi|
|   2823|Alanis Morissette|
|1003673|           Prince|
|   2884|      Chumbawamba|
+-------+-----------------+



recommendedArtistIDs = Array(2823, 2884, 1003673, 1000597, 1008419)


[2823, 2884, 1003673, 1000597, 1008419]

# Computing AUC

In [21]:
// Re-parse the full data set and hold out roughly 10% of the rows for validation.
val allData = getCounts(rawUserArtistData, bArtistAlias)
// A fixed seed makes the 90/10 split, and every metric computed from it, reproducible
// across runs, in line with the fixed seed passed to ALS via setSeed(42).
val Array(trainData, cvData) = allData.randomSplit(Array(0.9, 0.1), seed = 42L)
// Both halves are reused by several model fits and evaluations, so mark them for caching.
trainData.cache()
cvData.cache()

allData = [user: int, artist: int ... 1 more field]
trainData = [user: int, artist: int ... 1 more field]
cvData = [user: int, artist: int ... 1 more field]


[user: int, artist: int ... 1 more field]

In [22]:
// Collect the distinct artist IDs of the full data set to the driver and broadcast them.
// NOTE(review): bAllArtistIDs is not referenced by any other cell visible in this notebook.
val allArtistsIDs = allData.
    select("artist").
    as[Int].
    distinct().
    collect()
val bAllArtistIDs = spark.sparkContext.
    broadcast(allArtistsIDs)

allArtistsIDs = Array(1048726, 463, 1281854, 1008081, 1014690, 1087384, 1091250, 1233083, 1346305, 6623644, 833, 1316951, 1245054, 6642786, 1001129, 10130219, 1245208, 1004552, 1010281, 1036659, 1062730, 1291109, 6663903, 6668762, 6814190, 1279698, 6723762, 1004021, 1007972, 1009031, 1012617, 1013212, 1014191, 1023660, 1028228, 1040057, 1041189, 1053084, 10729995, 1189991, 1203598, 1230694, 1239654, 1259455, 1261703, 1262404, 1266726, 1276692, 1277913, 2281411, 3175, 4935, 6642933, 6911438, 6696725, 1038390, 2099635, 6649067, 1029443, 2022896, 1012261, 1034510, 1019303, 1015250, 1016546, 1023841, 1084951, 1260023, 1829, 496, 1007334, 10402275, 1059283, 1123104, 1126726, 1160165, 1171406, 2025147, 2061170, 2079446, 2139904, 2143912, 2146392, 6604291, 6615149, 6630663, 6636337...


[1048726, 463, 1281854, 1008081, 1014690, 1087384, 1091250, 1233083, 1346305, 6623644, 833, 1316951, 1245054, 6642786, 1001129, 10130219, 1245208, 1004552, 1010281, 1036659, 1062730, 1291109, 6663903, 6668762, 6814190, 1279698, 6723762, 1004021, 1007972, 1009031, 1012617, 1013212, 1014191, 1023660, 1028228, 1040057, 1041189, 1053084, 10729995, 1189991, 1203598, 1230694, 1239654, 1259455, 1261703, 1262404, 1266726, 1276692, 1277913, 2281411, 3175, 4935, 6642933, 6911438, 6696725, 1038390, 2099635, 6649067, 1029443, 2022896, 1012261, 1034510, 1019303, 1015250, 1016546, 1023841, 1084951, 1260023, 1829, 496, 1007334, 10402275, 1059283, 1123104, 1126726, 1160165, 1171406, 2025147, 2061170, 2079446, 2139904, 2143912, 2146392, 6604291, 6615149, 6630663, 6636337, 6660529, 6703216, 6711334, 6745427, 6798543, 6838649, 6857355, 6863494, 6867283, 10522437, 10045976, 1012885, 10336587, 1099965, 1110591, 1150965, 1154409, 1239554, 1313764, 2068160, 2088333, 2095814, 2366, 3918, 6663492, 6663504, 666

In [23]:
// Refit ALS with the same settings as the first model, this time on the 90% training split only.
val model = new ALS().
    setUserCol("user").
    setItemCol("artist").
    setRatingCol("count").
    setPredictionCol("prediction").
    setImplicitPrefs(true).
    setRank(10).
    setRegParam(0.01).
    setAlpha(1.0).
    setMaxIter(5).
    setSeed(42).
    fit(trainData)

model = als_7b1860bf4e2a


als_7b1860bf4e2a

In [24]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

In [25]:
/**
 * Computes the "rmse" metric of the model's predictions against the "count" column.
 *
 * @param model  fitted ALS model used to score the rows
 * @param cvData held-out rows containing "user", "artist" and "count"
 * @return value reported by RegressionEvaluator for metric "rmse"
 */
def getRsme(model: ALSModel, cvData: DataFrame):Double = {

    // na.fill(0) substitutes 0 for missing numeric values before evaluation.
    val scored = model.transform(cvData).na.fill(0)

    new RegressionEvaluator().
        setLabelCol("count").
        setPredictionCol("prediction").
        setMetricName("rmse").
        evaluate(scored)
}

getRsme: (model: org.apache.spark.ml.recommendation.ALSModel, cvData: org.apache.spark.sql.DataFrame)Double


In [26]:
getRsme(model, cvData)

664.1521671946169

# Hyperparameter Selection

In [27]:
// Grid search over rank x regParam x alpha (rank varies slowest, alpha fastest).
// Each entry of the result is (rmse, (rank, regParam, alpha)).
val evaluations = Seq(5, 30).flatMap { rank =>
    Seq(4.0, 0.0001).flatMap { regParam =>
        Seq(1.0, 40.0).map { alpha =>

            val candidate = new ALS().
                setSeed(42).
                setImplicitPrefs(true).
                setRank(rank).
                setRegParam(regParam).
                setAlpha(alpha).
                setMaxIter(5).
                setUserCol("user").
                setItemCol("artist").
                setRatingCol("count").
                setPredictionCol("prediction").
                fit(trainData)

            val score = getRsme(candidate, cvData)

            // Release the factor tables of this candidate before fitting the next one.
            candidate.userFactors.unpersist()
            candidate.itemFactors.unpersist()

            (score, (rank, regParam, alpha))
        }
    }
}

evaluations = List((664.1564475250708,(5,4.0,1.0)), (664.1697522870807,(5,4.0,40.0)), (664.1559307602741,(5,1.0E-4,1.0)), (664.1796397072688,(5,1.0E-4,40.0)), (664.1623153107811,(30,4.0,1.0)), (664.2079558123211,(30,4.0,40.0)), (664.1955296121143,(30,1.0E-4,1.0)), (664.2011800097232,(30,1.0E-4,40.0)))


List((664.1564475250708,(5,4.0,1.0)), (664.1697522870807,(5,4.0,40.0)), (664.1559307602741,(5,1.0E-4,1.0)), (664.1796397072688,(5,1.0E-4,40.0)), (664.1623153107811,(30,4.0,1.0)), (664.2079558123211,(30,4.0,40.0)), (664.1955296121143,(30,1.0E-4,1.0)), (664.2011800097232,(30,1.0E-4,40.0)))

# Making Recommendations

In [28]:
// Take 10 distinct user IDs and pair each with its (lazy) top-5 recommendation DataFrame.
val someUsers = allData.
    select("user").
    as[Int].
    distinct().
    take(10)
val someRecommendations = someUsers.map { id =>
    val recs = makeRecommendations(model, id, 5)
    (id, recs)
}

someUsers = Array(2007381, 1059637, 1073421, 2288164, 1001440, 1024631, 2030069, 1021940, 1052054, 2064012)
someRecommendations = Array((2007381,[artist: int, prediction: float]), (1059637,[artist: int, prediction: float]), (1073421,[artist: int, prediction: float]), (2288164,[artist: int, prediction: float]), (1001440,[artist: int, prediction: float]), (1024631,[artist: int, prediction: float]), (2030069,[artist: int, prediction: float]), (1021940,[artist: int, prediction: float]), (1052054,[artist: int, prediction: float]), (2064012,[artist: int, prediction: float]))


[(2007381,[artist: int, prediction: float]), (1059637,[artist: int, prediction: float]), (1073421,[artist: int, prediction: float]), (2288164,[artist: int, prediction: float]), (1001440,[artist: int, prediction: float]), (1024631,[artist: int, prediction: float]), (2030069,[artist: int, prediction: float]), (1021940,[artist: int, prediction: float]), (1052054,[artist: int, prediction: float]), (2064012,[artist: int, prediction: float])]

In [29]:
// Print one line per user: the user ID followed by the recommended artist IDs.
for ((userID, recsDF) <- someRecommendations) {
    // collect() runs the recommendation query for this user.
    val recommendedArtists = recsDF.
        select("artist").
        as[Int].
        collect()
    println(s"$userID -> ${recommendedArtists.mkString(", ")}")
}

2007381 -> 15, 1001066, 1002840, 1270717, 1274
1059637 -> 2823, 2884, 1003673, 1000597, 1008419
1073421 -> 1233343, 1002647, 2823, 1270, 1080742
2288164 -> 1784, 1003352, 1006834, 1003447, 1004046
1001440 -> 1008337, 1000130, 1006834, 1006029, 1270639
1024631 -> 4192, 1001819, 1238230, 4371, 1784
2030069 -> 1195, 1002095, 1001588, 4221, 223
1021940 -> 1238230, 1001819, 15, 4192, 1784
1052054 -> 1195, 1002095, 1000263, 1002840, 1034635
2064012 -> 1000263, 1000985, 2231, 1002095, 1274
