In [1]:
import os
import sys
spark_home = 'D:/spark231hdp27'
os.environ['SPARK_HOME']= spark_home
os.environ['PYLIB']=os.environ['SPARK_HOME']+'/python/lib'
sys.path.insert(0,os.environ['PYLIB']+'/py4j-0.10.6-src.zip')
sys.path.insert(1,os.environ['PYLIB']+'/pyspark.zip')

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
spark = SparkSession.builder.appName('RecommenderModel') \
.config('spark.warehouse.dir','/apps/hive/warehouse') \
.config('spark.driver.memory', '6G') \
.config('spark.sql.shuffle.partitions', 4) \
.enableHiveSupport().getOrCreate()

In [4]:
# Having set the driver and driver options we should have spark representing spark session 
# available straight away
spark.version

'2.3.1'

In [5]:
sc = spark.sparkContext

In [6]:
sc.setLogLevel('ERROR')

In [7]:
# read the user and artist played data into a rdd
userArtistFileLoc = "D:/ufdata/audioscrobbler/user_artist_data.txt"
rawUserArtistData = sc.textFile(userArtistFileLoc)
print("Take a look at the loaded raw user and artist data")
rawUserArtistData.take(5)

Take a look at the loaded raw user and artist data


['1000002 1 55',
 '1000002 1000006 33',
 '1000002 1000007 8',
 '1000002 1000009 144',
 '1000002 1000010 314']

In [8]:
# create a dataframe of user and artist integer ids
from pyspark.ml.recommendation import *
userArtistDF = rawUserArtistData.map(lambda line: line.split(' ')) \
.map(lambda x: (int(x[0]), int(x[1]))).toDF(["user", "artist"]).cache()

In [9]:
# lets look at the min and max user and artist ids to verify that we are within the range for integer values
from pyspark.sql.functions import *
userArtistDF.agg(min(col("user")), max(col("user")), min(col("artist")), max(col("artist"))).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [10]:
# we have the artist data which is ids and artist name
artistFileLoc = "D:/ufdata/audioscrobbler/artist_data.txt"
print("take a look at the artist data which we will use to take care of aliases")
rawArtistData = sc.textFile(artistFileLoc)
rawArtistData.take(5)

take a look at the artist data which we will use to take care of aliases


['1134999\t06Crazy Life',
 '6821360\tPang Nakarin',
 '10113088\tTerfel, Bartoli- Mozart: Don',
 '10151459\tThe Flaming Sidebur',
 '6826647\tBodenstandig 3000']

In [11]:
# need to weed out instances where the name is missing
# use the line below to see such instances  
# grep --color -P  '^\d+\s+$' artist_data.txt
# grep --color -P -v '^\d+' artist_data.txt | wc -l
# grep --color -P -v '\d+\s+\S+' artist_data.txt
rawArtistData.map(lambda x: x.split('\t')).filter(lambda x: len(x) < 2).count()

409

### Clean the artist data
### Should have a  tuple and the first part should be numeric

In [12]:
rawArtistData.map(lambda x: x.split('\t')).filter(lambda x: len(x) > 2).take(5)

[['10589651', 'q¬‹', ' [Tokyo Jihen]'],
 ['\x07ú',
  'F\x05Ù\x08\x90\x01â\x05\x12ýM\x01qøöû)ô\x13÷Bñ÷óÕð\x81ópò;ôêóÍô®õ\x19÷7ú\x97û[ÿ>ÿ\x8d\x014\x01à\x02I\x03\x8f\x05£\x05\x12\x07\x80\x06\xa0\x05y\x04Æ\x02',
  '\x01\x87'],
 ['10484061', 'Toshihiko Seki ', '5 (Sanzou)'],
 ['1153538', 'Donæt Stop', ' (Ballistic Bass remix)'],
 ['1161565',
  'ýùürüÇú@\x01ÍÿÝ\x05^\x06\x8a\x05Z\x08W\x02Q\x05çûúüandù)øßù:ø;þ>ýÙ',
  '\x9a\x08ì\x0e\x15\x0c\x85\x05É\x02u÷\x9b÷.ñ,ô\x86÷\x83úÉ\x02ý\x02¾\x05\x9c\x03³\x06%\x05E\x0b;\x0b\xad\x06º\x06i÷,÷ÁîCïeø0ú¥\x02\x94\x04<\x01|\x01U\x03Z\x02ï\x0cÝ\x0c±\x0e\x8d\x0f\x06\x024\x02#ô\x89ómñ|']]

In [13]:
# filter to tuples of length greater than one
# then 
artistByID =  rawArtistData.map(lambda x: x.split('\t')).filter(lambda x: len(x) > 1 and x[0].isdigit()) \
.map(lambda x: (int(x[0]), x[1])) \
.toDF(['id', 'name'])

In [14]:
artistByID.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [15]:
artistByID.count()

1848281

In [16]:
rawArtistFileLoc = "D:/ufdata/audioscrobbler/artist_alias.txt"
rawArtistAlias = sc.textFile(rawArtistFileLoc)
# grep --color -P  '^\s+' artist_alias.txt  | wc -l
# There are 2135 instances where we have an alias but no artist

In [17]:
artistAlias = dict(rawArtistAlias.map(lambda x: x.split('\t')) \
.filter(lambda x: len(x) > 1).collect())

In [18]:
# check the number of artist aliases
len(artistAlias)

190893

In [19]:
# broadcast the artis aliases
bArtistAlias = sc.broadcast(artistAlias)

In [20]:
# use the artist aliases for creating the data for the model
allData = rawUserArtistData.map (lambda line: line.split()) \
.map(lambda x: (int(x[0]), int(x[1]), int(x[2]))).map(lambda x: (x[0], 
    bArtistAlias.value.get(x[1], x[1]), x[2]) ) \
.toDF(["user", "artist", "count"]).cache()

print("The number of records in the training data " , allData.count())

The number of records in the training data  24296858


In [21]:
# split into training, cross validation
trainData, cvData = allData.randomSplit((0.9, 0.1))
trainData.cache()

DataFrame[user: bigint, artist: bigint, count: bigint]

In [22]:
trainData.show(2)

+----+------+-----+
|user|artist|count|
+----+------+-----+
|1014|    26|    5|
|1014|    28|    1|
+----+------+-----+
only showing top 2 rows



In [23]:
# intitalize and fit the model
from pyspark.ml.recommendation import ALS, ALSModel
model = ALS(). \
setSeed(100). \
setImplicitPrefs(True). \
setRank(10). \
setRegParam(0.01). \
setAlpha(1.0). \
setMaxIter(5). \
setUserCol("user"). \
setItemCol("artist"). \
setRatingCol("count"). \
setPredictionCol("prediction") \
.fit(trainData)

In [24]:
# see predictions for one user
userID = 2093760
existingArtistIDs = allData. \
filter(col('user') == userID).collect()
print(existingArtistIDs)

[Row(user=2093760, artist=1180, count=1), Row(user=2093760, artist=1255340, count=3), Row(user=2093760, artist=378, count=1), Row(user=2093760, artist=813, count=2), Row(user=2093760, artist=942, count=7)]


In [25]:
# create a method to find recommnedations for a specific user
spark.conf.set("spark.sql.crossJoin.enabled", True)
def makeRecommendations(model, userID, howMany): 
    toRecommend = model.itemFactors. \
      selectExpr("id as artist"). \
      withColumn("user", lit(userID))
    return model.transform(toRecommend). \
      select("artist", "prediction"). \
      orderBy(desc("prediction")). \
      limit(howMany)

In [26]:
# lets pick a user and check the predictions from the model for the user
usersToRecommend = model.userFactors.selectExpr("id as user").withColumn("artist", lit(2814))
usersToRecommend.show()
model.transform(usersToRecommend). select("user", "prediction"). \
orderBy(desc("prediction")). \
limit(50).show()

+----+------+
|user|artist|
+----+------+
|  90|  2814|
| 120|  2814|
| 340|  2814|
| 350|  2814|
| 770|  2814|
|3290|  2814|
|4370|  2814|
|4620|  2814|
|6060|  2814|
|6390|  2814|
|6760|  2814|
|6850|  2814|
|7010|  2814|
|7130|  2814|
|7290|  2814|
|7340|  2814|
|7400|  2814|
|7510|  2814|
|8500|  2814|
|9660|  2814|
+----+------+
only showing top 20 rows

+-------+----------+
|   user|prediction|
+-------+----------+
|1070932| 1.5861955|
|1071884| 1.3878194|
|2023977| 1.3705223|
|2157954| 1.3619362|
|2062355|  1.354971|
|1053071| 1.3533974|
|2074380| 1.3504577|
|2096140| 1.3492036|
|2050441|   1.34675|
|1064024| 1.3313961|
|2232946| 1.3237069|
|2071724| 1.3202682|
|2208970| 1.3144891|
|2281770| 1.3143957|
|1066703| 1.3120208|
|1066320|  1.310826|
|1038950| 1.3050749|
|1002794| 1.3005005|
|1073469| 1.2960229|
|2159806| 1.2947026|
+-------+----------+
only showing top 20 rows



In [27]:
topRecommendations = makeRecommendations(model, userID, 10)
print("Take a look at the top ten recommendations we get from the model")
topRecommendations.show()

Take a look at the top ten recommendations we get from the model
+-------+-----------+
| artist| prediction|
+-------+-----------+
|1001819|0.028906701|
|   2814|0.028640402|
|1300642| 0.02856603|
|   1811| 0.02854091|
|   4605|0.028469205|
|1004028|0.027916994|
|    829|0.027856477|
|1007614|  0.0278085|
|1003249|0.027603466|
|1037970|0.027352829|
+-------+-----------+



In [28]:
# take a look at the two low rank matrices which are created
# verify that the length of the item and user factors matches with distinct items(artists) and users respectively
print(model.itemFactors.count())
print(model.userFactors.count())
print(trainData.select('user').distinct().count())
print(trainData.select('artist').distinct().count())
# the number of features for each matrix is given by the low rank k - 10 which we chose to create the model
print(model.itemFactors.rdd.take(1))
print(model.userFactors.rdd.take(1))


1524572
147768
147768
1524572
[Row(id=30, features=[-0.017250942066311836, 0.0034171261359006166, 0.0668657124042511, 0.09413359314203262, 0.07495716214179993, 0.08585923165082932, 0.04622870683670044, 0.0946611613035202, 0.021862678229808807, 0.1051500141620636])]
[Row(id=90, features=[-0.4471117854118347, 0.43872979283332825, 1.248292326927185, 0.10280954092741013, -0.6020233631134033, 0.1628144085407257, -0.3863792419433594, 0.30432894825935364, 0.23583754897117615, 0.28717243671417236])]


In [29]:
model.userFactors

DataFrame[id: int, features: array<float>]

In [30]:
print("Look at the artist names that have been recommended")
recommendedArtistIDs = topRecommendations.select("artist").rdd.map(lambda x: x[0]).collect()
print(recommendedArtistIDs)
recommendedArtists = artistByID.filter(col("id").isin(recommendedArtistIDs))
recommendedArtists.show()

Look at the artist names that have been recommended
[1001819, 2814, 1300642, 1811, 4605, 1004028, 829, 1007614, 1003249, 1037970]
+-------+----------------+
|     id|            name|
+-------+----------------+
|1004028|Notorious B.I.G.|
|   2814|         50 Cent|
|   4605|      Snoop Dogg|
|    829|             Nas|
|1007614|           Jay-Z|
|1037970|      Kanye West|
|   1811|         Dr. Dre|
|1003249|        Ludacris|
|1001819|            2Pac|
|1300642|        The Game|
+-------+----------------+



In [31]:
# the recommendation produced is a dot product of the user features and item features
# first get the user features for the user id
userFeatures = model.userFactors.filter(col("id") == userID).select("features").rdd.map(lambda x: x[0]).collect()[0]
print(userFeatures)

bUserFeatures = sc.broadcast(userFeatures)

def calc_dotp(item_feats):
    import builtins
    return builtins.sum([x[0] * x[1] for x in zip(item_feats, bUserFeatures.value)])

model.itemFactors.rdd.map(lambda x: (x[0], x[1])).map(lambda x: (x[0], calc_dotp(x[1]))).sortBy(lambda x: -x[1]).take(10)
# we can see this is exactly the same as the top recommendations

[-0.024357499554753304, 0.06565383076667786, 0.02214849554002285, -0.03441890701651573, -0.021594589576125145, 0.03341227397322655, 8.417641947744414e-05, 0.07064484804868698, 0.020837431773543358, 0.001333817606791854]


[(1001819, 0.028906702835987032),
 (2814, 0.028640404053358007),
 (1300642, 0.028566032990835166),
 (1811, 0.028540910459667335),
 (4605, 0.02846920442881667),
 (1004028, 0.027916993969010817),
 (829, 0.027856476078662328),
 (1007614, 0.027808499263035684),
 (1003249, 0.02760346476073714),
 (1037970, 0.02735282710719393)]

###  BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
###  small AUC problems, and it would be inefficient, when a direct computation is available

### For carrying out the direct computations:
### Use the model to get the positive predictions for the user

### Create a set of "negative" products for each user. These are randomly chosen
### from among all of the other artists, excluding those that are "positive" for the user.

### Find the predicted scores for the negative dataset from the model

### Join positive and negative predictions and compute individual area under the curve
### as proportion of instances where positive prediction is greater than negative prediction

### Find the mean of the indvidual auc scores to get the mean auc

###  ==================================================================================
###  Indvidual computations first to be canned into a method that we use for model tuning
### ===================================================================================

In [35]:
# use model transform to get predictions
positivePredictions = model.transform(cvData.select('user', 'artist')).withColumnRenamed('prediction', 'positivePrediction')

In [78]:
# negative data - to get as many random artists not listened to as number of artists listened to
# get the distinct artists listened to by each user and the number
negativeData = cvData.select('user', 'artist').rdd.map(lambda x: (x[0],x[1])).groupByKey() \
.map(lambda x: (x[0], set(x[1]))).map(lambda x: (x[0], x[1], len(x[1])))
print(negativeData.take(2))

[(1014, {1016355, 1052005, 2885, 1020070, 1001607, 1011017, 1216424, 1004431, 1015697, 1097203, 1198804, 1004437, 1005877, 1005015, 9991320, 153, 10161277}, 17), (1000025, {1, 1000329, 1000333, 1236878, 1000340, 1001380, 813, 1001910, 1018807, 1001530, 1002427, 1003069, 1000639, 1004226, 1000656, 1235281, 3292, 1000674, 1000930, 1004515, 5477, 1004521, 1259, 4468, 1000056, 1000317}, 26)]


In [79]:
# create a broadcast variable for all artist ids to help in the flatmap that follow
allArtistIds = allData.select('artist').distinct().rdd.map(lambda x: x[0]).collect()
bAllArtistIds = sc.broadcast(allArtistIds)

In [40]:
# check the length of artist ids
len(bAllArtistIds.value)

1631028

In [80]:
# create a method to use in the flatmap to get user, artist pairs which for a user
# are different from the artists he/she has listened to
def negativePairs(user_artist_set):
    from random import randint
    artists_id_len = len(bAllArtistIds.value)
#     print(artists_id_len)
    rand_artist_ids = [bAllArtistIds.value[x] for x in [randint(0, artists_id_len - 1) for x in range(user_artist_set[2])]]
#     print(rand_artist_ids)
#     print(len(rand_artist_ids), user_artist_set[2])
    return [(user_artist_set[0], x) for x in rand_artist_ids if x not in user_artist_set[1]]

In [42]:
# verify negativePairs
print(negativePairs((1000025, {4609, 1000161, 1001412, 1000263, 1001735, 1001623, 1000024, 1262, 176, 1200, 688, 1001201, 4468, 1001363, 1023028, 1018807, 1000952, 1000123, 3292, 1000863}, 20)))

[(1000025, 10239908), (1000025, 1110511), (1000025, 10276798), (1000025, 2280205), (1000025, 10350940), (1000025, 10480773), (1000025, 10540439), (1000025, 1323147), (1000025, 2147128), (1000025, 10021921), (1000025, 1049907), (1000025, 6660806), (1000025, 2138468), (1000025, 1344335), (1000025, 10193716), (1000025, 10628593), (1000025, 6969532), (1000025, 6703063), (1000025, 1251888), (1000025, 10352638)]


In [43]:
# check the flatmap
negativeData.flatMap(lambda x: negativePairs(x)).take(2)

[(1014, 10752705), (1014, 10467083)]

In [44]:
# negativeDF here will create user, artist pairs for artists that users have not listened to
# the number will be as many as the number a user would have listened to 
negativeDF =negativeData.flatMap(lambda x: negativePairs(x)).toDF(['user', 'artist'])
negativeDF.show()

+-------+--------+
|   user|  artist|
+-------+--------+
|   1014|10373293|
|   1014|10027246|
|   1014|10567685|
|   1014| 6927271|
|   1014|10276537|
|   1014|10588881|
|   1014| 1331866|
|   1014| 2165753|
|   1014| 6630858|
|   1014| 2074775|
|   1014|10647418|
|   1014|10553401|
|   1014| 1254390|
|   1014| 9901969|
|   1014| 7002263|
|   1014|10156020|
|   1014| 6964306|
|1000025|10158319|
|1000025| 2053131|
|1000025|10391592|
+-------+--------+
only showing top 20 rows



In [45]:
# check the size 
negativeDF.count()

2430828

In [46]:
# use the model to find predictions for users for artists that they have not listened to
negativePredictions =  model.transform(negativeDF).selectExpr('user', 'artist', 'prediction as negativePrediction')
negativePredictions.show(2)

+-------+------+------------------+
|   user|artist|negativePrediction|
+-------+------+------------------+
|2105184|    13|        0.08011725|
|2131841|    13|        0.38101718|
+-------+------+------------------+
only showing top 2 rows



In [47]:
# join positive and negative predictions
joinedPredictions = positivePredictions.join(negativePredictions, "user"). \
select("user", "positivePrediction", "negativePrediction").cache()

In [48]:
# find total count for each pair
allCounts = joinedPredictions. \
groupBy("user").agg(count(lit("1")).alias("total")). \
select("user", "total")

In [50]:
# consider those instances where positive prediction is greater than negative prediction
# to be correct and find their count
correctCounts = joinedPredictions. \
filter("positivePrediction > negativePrediction"). \
groupBy("user").agg(count("user").alias("correct")). \
select("user", "correct")

In [51]:
# take a look at correct counts
correctCounts.show(5)

+----+-------+
|user|correct|
+----+-------+
| 384|    132|
| 727|   1026|
| 801|      1|
|1197|    608|
|1298|    399|
+----+-------+
only showing top 5 rows



In [52]:
# take correct / total as auc and the mean as the mean auc
meanAUC = allCounts.join(correctCounts, "user").\
selectExpr("user", "correct / total as auc"). \
agg(mean("auc"))
meanAUC.show()

+------------------+
|          avg(auc)|
+------------------+
|0.9128160130138483|
+------------------+



In [73]:
# combine the steps executed above into a method that can use the model and the dataframe
# to give us the mean area under the curve
def areaUnderCurve(positiveData, bAllArtistIds, predictFunction):
    
    positivePredictions = predictFunction(positiveData).selectExpr('user', 'artist', 'prediction as positivePrediction')
    
    negativeData = positiveData.select('user', 'artist').rdd.map(lambda x: (x[0],x[1])).groupByKey() \
    .map(lambda x: (x[0], set(x[1]))).map(lambda x: (x[0], x[1], len(x[1])))
    
    def negativePairs(user_artist_set):
        from random import randint
        artists_id_len = len(bAllArtistIds.value)
    #     print(artists_id_len)
        rand_artist_ids = [bAllArtistIds.value[x] for x in [randint(0, artists_id_len - 1) for x in range(user_artist_set[2])]]
    #     print(rand_artist_ids)
    #     print(len(rand_artist_ids), user_artist_set[2])
        return [(user_artist_set[0], x) for x in rand_artist_ids if x not in user_artist_set[1]]
    
    negativeDF =negativeData.flatMap(lambda x: negativePairs(x)).toDF(['user', 'artist'])

    negativePredictions =  predictFunction(negativeDF).selectExpr('user', 'artist', 'prediction as negativePrediction')
    
    joinedPredictions = positivePredictions.join(negativePredictions, "user"). \
    select("user", "positivePrediction", "negativePrediction").cache()
    
    allCounts = joinedPredictions. \
    groupBy("user").agg(count(lit("1")).alias("total")). \
    select("user", "total")

    correctCounts = joinedPredictions. \
    filter("positivePrediction > negativePrediction"). \
    groupBy("user").agg(count("user").alias("correct")). \
    select("user", "correct")
    
    meanAUC = allCounts.join(correctCounts, "user").\
    selectExpr("user", "correct / total as auc"). \
    agg(mean("auc"))
    
    return meanAUC

In [57]:
areaUnderCurve(cvData, bAllArtistIds, model.transform).rdd.map(lambda x: x[0]).collect()

[0.9128160130138483]

In [77]:
# check for different parameters
%%time
evaluations = []
for rank in [5, 30]:
    for regParam in [1.0, 0.0001]:
        for alpha in [1.0, 40.0]:
            model = ALS(). \
            setSeed(100). \
            setImplicitPrefs(True). \
            setRank(rank). \
            setRegParam(regParam). \
            setAlpha(alpha). \
            setMaxIter(20). \
            setUserCol('user'). \
            setItemCol('artist'). \
            setRatingCol('count'). \
            setPredictionCol('prediction'). \
            fit(trainData)
            
            auc = areaUnderCurve(cvData, bAllArtistIds, model.transform).rdd.map(lambda x: x[0]).collect()
            evaluations.append((auc, (rank, regParam, alpha)))

Wall time: 33min 36s


In [83]:
print(evaluations)

[([0.9147383340039935], (5, 1.0, 1.0)), ([0.9193732991073478], (5, 1.0, 40.0)), ([0.9118592887481435], (5, 0.0001, 1.0)), ([0.9184728075556955], (5, 0.0001, 40.0)), ([0.9138315379278659], (30, 1.0, 1.0)), ([0.9215144700377967], (30, 1.0, 40.0)), ([0.9065773223271238], (30, 0.0001, 1.0)), ([0.9210712227337515], (30, 0.0001, 40.0))]


In [84]:
sorted(evaluations, key = lambda x: -x[0][0])

[([0.9215144700377967], (30, 1.0, 40.0)),
 ([0.9210712227337515], (30, 0.0001, 40.0)),
 ([0.9193732991073478], (5, 1.0, 40.0)),
 ([0.9184728075556955], (5, 0.0001, 40.0)),
 ([0.9147383340039935], (5, 1.0, 1.0)),
 ([0.9138315379278659], (30, 1.0, 1.0)),
 ([0.9118592887481435], (5, 0.0001, 1.0)),
 ([0.9065773223271238], (30, 0.0001, 1.0))]