# Chapter 3: Recommending Music and the Audioscrobbler Data Set

## Preparing the Data

In [7]:
// Load the user/artist play data as a two-column DataFrame ("user", "artist").
// Each input line is split on a single space; the first two fields are parsed
// as Ints and any further fields on the line are ignored.
// A line with fewer than two fields raises scala.MatchError, and a non-numeric
// field raises NumberFormatException (both surface when the job runs).
val userArtistDF = spark.read.textFile("../data/recommender/user_artist_data_small.txt").
    map { record =>
        record.split(' ') match {
            case Array(userId, artistId, _*) => (userId.toInt, artistId.toInt)
        }
    }.toDF("user", "artist")

userArtistDF = [user: int, artist: int]


[user: int, artist: int]

In [9]:
// Print the first 10 rows of userArtistDF as a quick check of the parsed columns.
userArtistDF.show(10)

+-------+-------+
|   user| artist|
+-------+-------+
|1059637|1000010|
|1059637|1000049|
|1059637|1000056|
|1059637|1000062|
|1059637|1000094|
|1059637|1000112|
|1059637|1000113|
|1059637|1000114|
|1059637|1000123|
|1059637|1000130|
+-------+-------+
only showing top 10 rows



In [23]:
// Load the artist id -> name table as a DataFrame with columns ("id", "name").
// NOTE(review): despite the "user" in its name, this value is read from
// artist_data_small.txt; the name is kept because later cells refer to it.
//
// Each line is cut at the first tab: the text before it is the id, the rest
// (tab included, removed by trim) is the name. Lines with no tab, and lines
// whose id is not a valid Int, are dropped instead of failing the job.
val userByID = spark.read.textFile("../data/recommender/artist_data_small.txt").
    flatMap { row =>
        row.span(_ != '\t') match {
            // No tab found: span put the whole line in the first half.
            case (_, rest) if rest.isEmpty => None
            case (rawId, rest) =>
                try {
                    Some((rawId.toInt, rest.trim))
                } catch {
                    case _: NumberFormatException => None
                }
        }
    }.toDF("id", "name")

userByID = [id: int, name: string]


[id: int, name: string]

In [24]:
// Print the first 10 (id, name) rows to check that the names were parsed.
userByID.show(10)

+-------+--------------------+
|     id|                name|
+-------+--------------------+
|1240105|        André Visior|
|1240113|           riow arai|
|1240132|Outkast & Rage Ag...|
|6776115|                小松正夫|
|1030848|      Raver's Nature|
|6671601|      Erguner, Kudsi|
|1106617|              Bloque|
|1240185|      Lexy & K. Paul|
|6671631|    Rev. W.M. Mosley|
|6671632|      Labelle, Patti|
+-------+--------------------+
only showing top 10 rows



In [28]:
// Build a local Map from an artist id to its alias id.
// The alias file is read line by line and each line is split on tab; the
// parsed pairs are collected to the driver and turned into an immutable Map.
//
// Only lines that split into exactly two fields with a non-empty first field
// are kept. Anything else (blank line, missing alias such as "123\t" where the
// trailing empty field is dropped by split, extra fields, non-numeric ids) is
// skipped rather than raising MatchError / NumberFormatException and aborting
// the whole job -- the same best-effort policy the artist-name cell uses.
val artistAlias = spark.read.textFile("../data/recommender/artist_alias_small.txt").
    flatMap { line =>
        line.split('\t') match {
            case Array(artist, alias) if artist.nonEmpty =>
                try {
                    Some((artist.toInt, alias.toInt))
                } catch {
                    // Non-numeric id on either side: drop the line.
                    case _: NumberFormatException => None
                }
            // Wrong number of fields, or empty artist id.
            case _ => None
        }
    }.collect().toMap

artistAlias = Map(1039896 -> 1277013, 1199139 -> 166, 1047491 -> 1003342, 9929763 -> 1003778, 2025676 -> 1001141, 9929753 -> 1007347, 2103190 -> 1002909, 1005489 -> 2003588, 2009180 -> 6751847, 1261152 -> 1007206, 6801236 -> 1013362, 6843530 -> 1260159, 1038051 -> 6684730, 10107676 -> 118, 1008455 -> 1020, 1351048 -> 71, 6606757 -> 1003888, 2061602 -> 6748393, 1289246 -> 1023527, 2036732 -> 71, 6614668 -> 7006467, 1014175 -> 1014175, 1197558 -> 1001943, 1012315 -> 1238836, 9928967 -> 15, 1055562 -> 1276662, 1037848 -> 1007201, 6923988 -> 2140107, 6634844 -> 1018408, 1244994 -> 1028445, 1042508 -> 1008824, 2126687 -> 1023928, 6806131 -> 1002061, 1017671 -> 1015311, 1275359 -> 1287322, 1134651 -> 659, 6662497 -> 1327588, 1036747 -> 1239516, 1314530...


Map(1039896 -> 1277013, 1199139 -> 166, 1047491 -> 1003342, 9929763 -> 1003778, 2025676 -> 1001141, 9929753 -> 1007347, 2103190 -> 1002909, 1005489 -> 2003588, 2009180 -> 6751847, 1261152 -> 1007206, 6801236 -> 1013362, 6843530 -> 1260159, 1038051 -> 6684730, 10107676 -> 118, 1008455 -> 1020, 1351048 -> 71, 6606757 -> 1003888, 2061602 -> 6748393, 1289246 -> 1023527, 2036732 -> 71, 6614668 -> 7006467, 1014175 -> 1014175, 1197558 -> 1001943, 1012315 -> 1238836, 9928967 -> 15, 1055562 -> 1276662, 1037848 -> 1007201, 6923988 -> 2140107, 6634844 -> 1018408, 1244994 -> 1028445, 1042508 -> 1008824, 2126687 -> 1023928, 6806131 -> 1002061, 1017671 -> 1015311, 1275359 -> 1287322, 1134651 -> 659, 6662497 -> 1327588, 1036747 -> 1239516, 1314530 -> 1237371, 6812143 -> 1017092, 1059884 -> 1288727, 2067429 -> 1034635, 6708740 -> 1089337, 6718488 -> 1059264, 1059007 -> 2653, 1139033 -> 1011219, 1024502 -> 1009571, 1071257 -> 1236897, 1139806 -> 1039249, 6745885 -> 1268522, 1234344 -> 1012125, 6777696 