In [1]:
import numpy as np
import sklearn
import csv
from collections import defaultdict

# Setup

In [2]:
class artist:
    """Record for one artist: identity fields plus listen and tag aggregates."""

    def __init__(self, aid=-1, name="", url=""):
        # Identity, stored exactly as passed in.
        self.aid, self.name, self.url = aid, name, url
        # Aggregates start empty; callers fill them in.
        self.weighted_tags = defaultdict(int)   # tag id -> accumulated weight
        self.tags = defaultdict(int)            # tag id -> count
        self.user_listens = set()               # ids of users who listened
        self.listens = 0                        # total listen weight

class tag:
    """A tag value, optionally bound to the user who applied it and the artist it was applied to.

    tid -- tag identifier
    tag -- human-readable tag text
    uid -- id of the tagging user (-1 when not bound to a user)
    aid -- id of the tagged artist (-1 when not bound to an artist)
    """
    def __init__(self, tid = 0, tag = "", uid = -1, aid = -1):
        # tid used to be accepted and then silently dropped; keep it on the instance.
        self.tid = tid
        self.tag = tag
        self.uid = uid
        self.aid = aid

class user:
    """Record for one user: id, friends, listening history and tagging history."""

    def __init__(self, uid=-1):
        self.uid = uid
        # Listening history.
        self.listens = 0             # total listen weight
        self.artist_listens = {}     # artist id -> listen weight
        # Social graph.
        self.friends = set()         # ids of befriended users
        # Tagging history.
        self.artist_tags = defaultdict(set)      # artist id -> set of tag objects
        self.tags = defaultdict(int)             # tag id -> count
        self.weighted_tags = defaultdict(int)    # tag id -> accumulated weight

In [3]:
def _read_tsv(path):
    """Read a tab-separated file with a header row into a list of per-row dicts."""
    with open(path) as handle:
        return list(csv.DictReader(handle, delimiter='\t'))

# Load each table, then print its column names as a quick schema check.
data_artists = _read_tsv('data/artists.dat')
print("data_artists: " + str(data_artists[0].keys()))

data_tags = _read_tsv('data/tags.dat')
print("data_tags: " + str(data_tags[0].keys()))

data_user_artists = _read_tsv('data/user_artists.dat')
print("data_user_artists: " + str(data_user_artists[0].keys()))

data_user_friends = _read_tsv('data/user_friends.dat')
print("data_user_friends: " + str(data_user_friends[0].keys()))

data_user_taggedartists = _read_tsv('data/user_taggedartists.dat')
print("data_user_taggedartists: " + str(data_user_taggedartists[0].keys()))

data_artists: ['url', 'pictureURL', 'id', 'name']
data_tags: ['tagID', 'tagValue']
data_user_artists: ['artistID', 'userID', 'weight']
data_user_friends: ['userID', 'friendID']
data_user_taggedartists: ['tagID', 'userID', 'month', 'artistID', 'year', 'day']


In [4]:
# Seed once, then shuffle every table in place, always in this fixed order,
# so the random draws are reproducible from run to run.
np.random.seed(1)
for dataset in (data_artists, data_tags, data_user_artists,
                data_user_friends, data_user_taggedartists):
    np.random.shuffle(dataset)

In [5]:
# Registry of every user seen in any split; it starts empty and is filled lazily
# as listen rows are processed.
users = {}
# Every artist, keyed by artist id.
artists = dict((d['id'], artist(d['id'], d['name'], d['url'])) for d in data_artists)
# Every tag, keyed by tag id.
tags = dict((d['tagID'], tag(d['tagID'], d['tagValue'])) for d in data_tags)

# Cut the (already shuffled) listen rows 80% / 8% / 12% into train / test / validation.
split_80 = int(len(data_user_artists) * .8)
split_88 = int(len(data_user_artists) * .88)

train = {'set': data_user_artists[:split_80], 'users': {}, 'artists': {}}
test = {'set': data_user_artists[split_80:split_88], 'users': {}, 'artists': {}}
valid = {'set': data_user_artists[split_88:], 'users': {}, 'artists': {}}

# Walk the listen rows of each split, creating per-split user/artist records on
# first sight and accumulating listen weights.
for split in (train, test, valid):
    split_users = split['users']
    split_artists = split['artists']
    for row in split['set']:
        uid = row['userID']
        aid = row['artistID']
        # global registry of every user seen in any split
        if uid not in users:
            users[uid] = user(uid)
        # per-split copies, so each split only knows its own listens
        if uid not in split_users:
            split_users[uid] = user(uid)
        if aid not in split_artists:
            split_artists[aid] = artist(aid, artists[aid].name, artists[aid].url)
        weight = int(row['weight'])
        listener = split_users[uid]
        performer = split_artists[aid]
        # plain assignment: a repeated (user, artist) row overwrites the stored weight
        listener.artist_listens[aid] = weight
        listener.listens += weight
        performer.listens += weight
        performer.user_listens.add(uid)
        # the global artist table accumulates listens across all splits
        artists[aid].listens += weight

# Now that each split knows its users and artists, attach friendships and tag
# assignments, restricted to what each split contains.
for edge in data_user_friends:
    user_id = edge['userID']
    friend_id = edge['friendID']
    for split in (train, test, valid):
        members = split['users']
        # a friendship is recorded (symmetrically) only when both ends are in the split
        if user_id in members and friend_id in members:
            members[user_id].friends.add(friend_id)
            members[friend_id].friends.add(user_id)

for tagging in data_user_taggedartists:
    user_id = tagging['userID']
    artist_id = tagging['artistID']
    tag_id = tagging['tagID']
    for split in (train, test, valid):
        if user_id not in split['users'] or artist_id not in split['artists']:
            continue
        tagger = split['users'][user_id]
        tagged = split['artists'][artist_id]
        assignment = tag(tag_id, tags[tag_id].tag, user_id, artist_id)
        tagger.artist_tags[artist_id].add(assignment)
        # unweighted counts: one per tag assignment
        tagger.tags[tag_id] += 1
        tagged.tags[tag_id] += 1
        # weighted counts: only when this user listened to this artist in this split
        if artist_id in tagger.artist_listens:
            listen_weight = tagger.artist_listens[artist_id]
            tagger.weighted_tags[tag_id] += listen_weight
            tagged.weighted_tags[tag_id] += listen_weight

In [6]:
# Report how many distinct users and artists ended up in each split.
for split, label in ((train, 'training'), (test, 'testing'), (valid, 'validation')):
    print("# %s users = %d" % (label, len(split['users'])))
    print("# %s artists = %d" % (label, len(split['artists'])))

# training users = 1892
# training artists = 15382
# testing users = 1840
# testing artists = 3475
# validation users = 1872
# validation artists = 4566


In [7]:
# grow the testing set with negative samples
# Positives: every (user, artist) pair listened to in the test split.
testing_set = [(uid, aid, True)
               for uid, u in test['users'].items()
               for aid in u.artist_listens]
lim = len(testing_set)
print("# of positive samples in test set: %d" % lim)

# Build the candidate id lists once: the original rebuilt both key lists on every
# iteration, and a dict key view is not accepted by np.random.choice on Python 3.
all_user_ids = list(users)
all_artist_ids = list(artists)

# Negatives: draw random (user, artist) pairs until there are as many as positives.
# NOTE(review): a pair is rejected only if it is a listen in *this* split, so a pair
# the user listened to in another split (e.g. training) can be labelled negative.
# NOTE(review): the same negative pair can be drawn more than once.
i = 0
while i < lim:
    uid = np.random.choice(all_user_ids)
    aid = np.random.choice(all_artist_ids)
    if uid in test['users'] and aid not in test['users'][uid].artist_listens:
        testing_set.append((uid, aid, False))
        i += 1
print("# of samples in test set: %d" % len(testing_set))

# of positive samples in test set: 7426
# of samples in test set: 14852


In [8]:
# grow the validation set with negative samples
# Positives: every (user, artist) pair listened to in the validation split.
validation_set = [(uid, aid, True)
                  for uid, u in valid['users'].items()
                  for aid in u.artist_listens]
lim = len(validation_set)
print("# of positive samples in validation set: %d" % lim)

# Build the candidate id lists once: the original rebuilt both key lists on every
# iteration, and a dict key view is not accepted by np.random.choice on Python 3.
all_user_ids = list(users)
all_artist_ids = list(artists)

# Negatives: draw random (user, artist) pairs until there are as many as positives.
# NOTE(review): a pair is rejected only if it is a listen in *this* split, so a pair
# the user listened to in another split (e.g. training) can be labelled negative.
# NOTE(review): the same negative pair can be drawn more than once.
i = 0
while i < lim:
    uid = np.random.choice(all_user_ids)
    aid = np.random.choice(all_artist_ids)
    if uid in valid['users'] and aid not in valid['users'][uid].artist_listens:
        validation_set.append((uid, aid, False))
        i += 1
print("# of samples in validation set: %d" % len(validation_set))

# of positive samples in validation set: 11141
# of samples in validation set: 22282


In [9]:
# sort all the artists by most listened
# Result: list of (listens, artist) pairs, most-listened first.
# Sort on an explicit key. Sorting the bare tuples falls through to comparing the
# artist objects whenever two artists have equal listens, which gives an arbitrary
# order on Python 2 and raises TypeError on Python 3. Ties are broken by artist id.
top_artists = sorted(
    ((a.listens, a) for a in train['artists'].values()),
    key=lambda pair: (pair[0], pair[1].aid),
    reverse=True)

In [10]:
# Mean number of distinct artists per training user.
train_avg_artists_per_user = np.mean([len(u.artist_listens) for u in train['users'].values()])

In [16]:
def measure(pred_y, prefix, acc_only = False):
    """Print classification metrics for a list of (prediction, label) pairs.

    pred_y   -- sized iterable of (predicted, actual) pairs, interpreted by truthiness
    prefix   -- text placed at the start of every printed line
    acc_only -- when true, print only the accuracy line

    Prints accuracy and, unless acc_only is set, the confusion counts, precision
    and recall. A metric whose denominator is zero (empty input, no predicted
    positives, no actual positives) is printed as nan instead of raising
    ZeroDivisionError. Returns None.
    """
    tp = fp = tn = fn = 0
    for p, y in pred_y:
        if p and y:
            tp += 1
        elif p and not y:
            fp += 1
        elif not p and y:
            fn += 1
        else:
            tn += 1

    def ratio(num, den):
        # float division that tolerates an empty denominator
        return num * 1. / den if den else float('nan')

    # print accuracy
    print("%s accuracy = %f" % (prefix, ratio(tp + tn, len(pred_y))))

    if acc_only:
        return

    # print true positive/true negative/false positive/false negative
    print("%s TP | FP | TN | FN = %d | %d | %d | %d" % (prefix, tp, fp, tn, fn))

    # measure precision
    print("%s precision = %f" % (prefix, ratio(tp, tp + fp)))

    # measure recall
    print("%s recall = %f" % (prefix, ratio(tp, tp + fn)))

## baseline

In [27]:
# Baseline: predict True if the artist is among the 50 most-listened training
# artists, and False otherwise.
baseline_top_artists = set(entry[1].aid for entry in top_artists[:50])
def baseline_predict(uid, aid):
    """Return whether aid is one of the top-50 artists; uid is ignored."""
    return aid in baseline_top_artists

baseline_v_pred = []
for uid, aid, y in validation_set:
    baseline_v_pred.append((baseline_predict(uid, aid), y))
measure(baseline_v_pred, "baseline validation")
print("")
baseline_t_pred = []
for uid, aid, y in testing_set:
    baseline_t_pred.append((baseline_predict(uid, aid), y))
measure(baseline_t_pred, "baseline test")

baseline validation accuracy = 0.570191
baseline validation TP | FP | TN | FN = 1600 | 36 | 11105 | 9541
baseline validation precision = 0.977995
baseline validation recall = 0.143614

baseline test accuracy = 0.570159
baseline test TP | FP | TN | FN = 1064 | 22 | 7404 | 6362
baseline test precision = 0.979742
baseline test recall = 0.143280


## model 1

In [28]:
# model #1
m1_top_artists = set(entry[1].aid for entry in top_artists[:50])
def m1_predict(uid, aid):
    """Predict whether user uid listens to artist aid, using training data only.

    Decision cascade:
      1. unknown user      -> is the artist in the top 50?
      2. unknown artist    -> does the user listen to more artists than average?
      3. known listen      -> True
      4. otherwise         -> has any of the user's friends listened to the artist?
    """
    known_users = train['users']
    # 1. never seen this user: fall back to artist popularity
    if uid not in known_users:
        return aid in m1_top_artists

    listener = known_users[uid]
    # 2. never seen this artist: fall back to how varied the user's listening is
    if aid not in train['artists']:
        return len(listener.artist_listens) > train_avg_artists_per_user

    # 3. the pair itself is in the training data
    if aid in listener.artist_listens:
        return True

    # 4. social signal: at least one friend among the artist's training listeners
    return bool(listener.friends & train['artists'][aid].user_listens)

In [29]:
# model 1: validation set performance
m1_v_pred = []
for uid, aid, y in validation_set:
    m1_v_pred.append((m1_predict(uid, aid), y))
measure(m1_v_pred, "model 1 validation")

model 1 validation accuracy = 0.684274
model 1 validation TP | FP | TN | FN = 5118 | 1012 | 10129 | 6023
model 1 validation precision = 0.834910
model 1 validation recall = 0.459384


In [30]:
# model 1: testing set performance
m1_t_pred = []
for uid, aid, y in testing_set:
    m1_t_pred.append((m1_predict(uid, aid), y))
measure(m1_t_pred, "model 1 test")

model 1 test accuracy = 0.682669
model 1 test TP | FP | TN | FN = 3416 | 703 | 6723 | 4010
model 1 test precision = 0.829328
model 1 test recall = 0.460005


## model 2

In [31]:
# model #2
m2_top_artists = set(a.aid for _, a in top_artists[:50])
def m2_predict(uid, aid, jac_thresh):
    """Predict whether user uid listens to artist aid via listener similarity.

    jac_thresh -- Jaccard similarity (over training artist sets) that another
                  listener of the artist must exceed for a True prediction.

    Reads the globals train and m2_top_artists. Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m2_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of
    # artists; this model uses a fixed cut-off of 30 artists rather than the training average
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > 30

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if this user is
    # similar to any of the users that listened to this artist
    else:
        # the user's own artist set does not change between candidates, so build it once
        own_artists = set(train['users'][uid].artist_listens)
        for uid2 in train['artists'][aid].user_listens:
            other_artists = set(train['users'][uid2].artist_listens)
            jac = len(own_artists & other_artists) * 1. / len(own_artists | other_artists)
            if jac > jac_thresh:
                return True
        return False

In [32]:
# model 2: coarse sweep of the Jaccard threshold on the validation set
for jac_thresh in np.arange(0, 0.105, 0.005):
    m2_v_pred = []
    for uid, aid, y in validation_set:
        m2_v_pred.append((m2_predict(uid, aid, jac_thresh), y))
    measure(m2_v_pred, "model 2 validation (jac_thresh = %f)" % jac_thresh, True)

model 2 validation (jac_thresh = 0.000000) accuracy = 0.707656
model 2 validation (jac_thresh = 0.005000) accuracy = 0.707656
model 2 validation (jac_thresh = 0.010000) accuracy = 0.707656
model 2 validation (jac_thresh = 0.015000) accuracy = 0.779104
model 2 validation (jac_thresh = 0.020000) accuracy = 0.779867
model 2 validation (jac_thresh = 0.025000) accuracy = 0.794632
model 2 validation (jac_thresh = 0.030000) accuracy = 0.807603
model 2 validation (jac_thresh = 0.035000) accuracy = 0.808231
model 2 validation (jac_thresh = 0.040000) accuracy = 0.814379
model 2 validation (jac_thresh = 0.045000) accuracy = 0.811956
model 2 validation (jac_thresh = 0.050000) accuracy = 0.812853
model 2 validation (jac_thresh = 0.055000) accuracy = 0.808769
model 2 validation (jac_thresh = 0.060000) accuracy = 0.805269
model 2 validation (jac_thresh = 0.065000) accuracy = 0.801409
model 2 validation (jac_thresh = 0.070000) accuracy = 0.792299
model 2 validation (jac_thresh = 0.075000) accuracy = 0

In [33]:
# model 2: finer sweep of the Jaccard threshold on the validation set
for jac_thresh in np.arange(0.035, 0.046, 0.001):
    m2_v_pred = []
    for uid, aid, y in validation_set:
        m2_v_pred.append((m2_predict(uid, aid, jac_thresh), y))
    measure(m2_v_pred, "model 2 validation (jac_thresh = %f)" % jac_thresh, True)

model 2 validation (jac_thresh = 0.035000) accuracy = 0.808231
model 2 validation (jac_thresh = 0.036000) accuracy = 0.808904
model 2 validation (jac_thresh = 0.037000) accuracy = 0.810385
model 2 validation (jac_thresh = 0.038000) accuracy = 0.813527
model 2 validation (jac_thresh = 0.039000) accuracy = 0.813975
model 2 validation (jac_thresh = 0.040000) accuracy = 0.814379
model 2 validation (jac_thresh = 0.041000) accuracy = 0.814245
model 2 validation (jac_thresh = 0.042000) accuracy = 0.813751
model 2 validation (jac_thresh = 0.043000) accuracy = 0.813033
model 2 validation (jac_thresh = 0.044000) accuracy = 0.812584
model 2 validation (jac_thresh = 0.045000) accuracy = 0.811956


A `jac_thresh` of **0.04** gives the best accuracy on the validation set.

In [34]:
# model 2: testing set performance at jac_thresh = 0.04
m2_t_pred = []
for uid, aid, y in testing_set:
    m2_t_pred.append((m2_predict(uid, aid, 0.04), y))
measure(m2_t_pred, "model 2 test")

model 2 test accuracy = 0.813022
model 2 test TP | FP | TN | FN = 6590 | 1941 | 5485 | 836
model 2 test precision = 0.772477
model 2 test recall = 0.887423


## model 3

In [35]:
# model #3
m3_top_artists = set(a.aid for _, a in top_artists[:50])
def m3_predict(uid, aid, top_lim):
    """Predict whether user uid listens to artist aid via tag overlap.

    top_lim -- how many of the user's most frequently used tags to consider.

    Reads the globals train, train_avg_artists_per_user and m3_top_artists.
    Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m3_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > train_avg_artists_per_user

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if any of
    # the user's top_lim most frequent tags has also been applied to this artist
    else:
        # rank the user's tags by (count, tag id), highest first, and keep the first top_lim
        ranked = sorted(((f, t) for t, f in train['users'][uid].tags.items()), reverse = True)
        utags = set(t for f, t in ranked[:top_lim])
        # every artist tag is eligible, so no ranking is needed on the artist side
        atags = train['artists'][aid].tags
        return any(t in atags for t in utags)

In [39]:
# model 3: sweep of top_lim on the validation set
for top_lim in range(0, 20):
    m3_v_pred = []
    for uid, aid, y in validation_set:
        m3_v_pred.append((m3_predict(uid, aid, top_lim), y))
    measure(m3_v_pred, "model 3 validation (top_lim = %d)" % top_lim, True)

model 3 validation (top_lim = 0) accuracy = 0.493448
model 3 validation (top_lim = 1) accuracy = 0.638453
model 3 validation (top_lim = 2) accuracy = 0.683287
model 3 validation (top_lim = 3) accuracy = 0.708734
model 3 validation (top_lim = 4) accuracy = 0.718966
model 3 validation (top_lim = 5) accuracy = 0.725833
model 3 validation (top_lim = 6) accuracy = 0.729872
model 3 validation (top_lim = 7) accuracy = 0.732340
model 3 validation (top_lim = 8) accuracy = 0.734494
model 3 validation (top_lim = 9) accuracy = 0.736469
model 3 validation (top_lim = 10) accuracy = 0.737905
model 3 validation (top_lim = 11) accuracy = 0.738040
model 3 validation (top_lim = 12) accuracy = 0.737995
model 3 validation (top_lim = 13) accuracy = 0.738264
model 3 validation (top_lim = 14) accuracy = 0.738848
model 3 validation (top_lim = 15) accuracy = 0.738937
model 3 validation (top_lim = 16) accuracy = 0.739117
model 3 validation (top_lim = 17) accuracy = 0.738758
model 3 validation (top_lim = 18) accu

In [40]:
# model 3: testing set performance at top_lim = 16
m3_t_pred = []
for uid, aid, y in testing_set:
    m3_t_pred.append((m3_predict(uid, aid, 16), y))
measure(m3_t_pred, "model 3 test")

model 3 test accuracy = 0.738621
model 3 test TP | FP | TN | FN = 5110 | 1566 | 5860 | 2316
model 3 test precision = 0.765428
model 3 test recall = 0.688123


## model 4

In [41]:
# model #4
m4_top_artists = set(a.aid for _, a in top_artists[:50])
def m4_predict(uid, aid, top_lim):
    """Predict whether user uid listens to artist aid via listen-weighted tag overlap.

    top_lim -- how many of the user's highest-weighted tags to consider.

    Reads the globals train, train_avg_artists_per_user and m4_top_artists.
    Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m4_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > train_avg_artists_per_user

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if any of
    # the user's top_lim highest-weighted tags also appears among this artist's weighted tags
    else:
        # rank the user's tags by (weight, tag id), highest first, and keep the first top_lim
        ranked = sorted(((f, t) for t, f in train['users'][uid].weighted_tags.items()), reverse = True)
        utags = set(t for f, t in ranked[:top_lim])
        # every weighted artist tag is eligible, so no ranking is needed on the artist side
        atags = train['artists'][aid].weighted_tags
        return any(t in atags for t in utags)

In [42]:
# model 4: sweep of top_lim on the validation set
for top_lim in range(15, 31):
    m4_v_pred = []
    for uid, aid, y in validation_set:
        m4_v_pred.append((m4_predict(uid, aid, top_lim), y))
    measure(m4_v_pred, "model 4 validation (top_lim = %d)" % top_lim, True)

model 4 validation (top_lim = 15) accuracy = 0.680549
model 4 validation (top_lim = 16) accuracy = 0.681492
model 4 validation (top_lim = 17) accuracy = 0.682300
model 4 validation (top_lim = 18) accuracy = 0.682479
model 4 validation (top_lim = 19) accuracy = 0.682793
model 4 validation (top_lim = 20) accuracy = 0.682973
model 4 validation (top_lim = 21) accuracy = 0.683287
model 4 validation (top_lim = 22) accuracy = 0.683107
model 4 validation (top_lim = 23) accuracy = 0.683332
model 4 validation (top_lim = 24) accuracy = 0.683242
model 4 validation (top_lim = 25) accuracy = 0.683422
model 4 validation (top_lim = 26) accuracy = 0.683466
model 4 validation (top_lim = 27) accuracy = 0.683422
model 4 validation (top_lim = 28) accuracy = 0.683691
model 4 validation (top_lim = 29) accuracy = 0.683781
model 4 validation (top_lim = 30) accuracy = 0.683781


In [43]:
# model 4: testing set performance at top_lim = 30
m4_t_pred = []
for uid, aid, y in testing_set:
    m4_t_pred.append((m4_predict(uid, aid, 30), y))
measure(m4_t_pred, "model 4 test")

model 4 test accuracy = 0.681996
model 4 test TP | FP | TN | FN = 3652 | 949 | 6477 | 3774
model 4 test precision = 0.793740
model 4 test recall = 0.491786


## model 5

In [44]:
# model #5
m5_top_artists = set(a.aid for _, a in top_artists[:50])
def m5_predict(uid, aid, jac_thresh):
    """Predict whether user uid listens to artist aid via tag-set similarity to its listeners.

    jac_thresh -- Jaccard similarity (over the sets of tags each user has applied)
                  that another listener of the artist must exceed.

    Reads the globals train, train_avg_artists_per_user and m5_top_artists.
    Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m5_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > train_avg_artists_per_user

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if this user's
    # tag set is similar to that of any user that listened to this artist
    else:
        # the user's own tag set does not change between candidates, so build it once
        own_tags = set(train['users'][uid].tags)
        for uid2 in train['artists'][aid].user_listens:
            other_tags = set(train['users'][uid2].tags)
            union = own_tags | other_tags
            # two users with no tags at all have similarity 0
            jac = len(own_tags & other_tags) * 1. / len(union) if union else 0
            if jac > jac_thresh:
                return True
        return False

In [45]:
# model 5: coarse sweep of the Jaccard threshold on the validation set
for jac_thresh in np.arange(0, 0.55, 0.05):
    m5_v_pred = []
    for uid, aid, y in validation_set:
        m5_v_pred.append((m5_predict(uid, aid, jac_thresh), y))
    measure(m5_v_pred, "model 5 validation (jac_thresh = %f)" % jac_thresh, True)

model 5 validation (jac_thresh = 0.000000) accuracy = 0.661521
model 5 validation (jac_thresh = 0.050000) accuracy = 0.715780
model 5 validation (jac_thresh = 0.100000) accuracy = 0.705771
model 5 validation (jac_thresh = 0.150000) accuracy = 0.666996
model 5 validation (jac_thresh = 0.200000) accuracy = 0.610223
model 5 validation (jac_thresh = 0.250000) accuracy = 0.566780
model 5 validation (jac_thresh = 0.300000) accuracy = 0.537160
model 5 validation (jac_thresh = 0.350000) accuracy = 0.516605
model 5 validation (jac_thresh = 0.400000) accuracy = 0.505386
model 5 validation (jac_thresh = 0.450000) accuracy = 0.500135
model 5 validation (jac_thresh = 0.500000) accuracy = 0.495736


In [47]:
# model 5: finer sweep of the Jaccard threshold on the validation set
for jac_thresh in np.arange(0, 0.1, 0.01):
    m5_v_pred = []
    for uid, aid, y in validation_set:
        m5_v_pred.append((m5_predict(uid, aid, jac_thresh), y))
    measure(m5_v_pred, "model 5 validation (jac_thresh = %f)" % jac_thresh, True)

model 5 validation (jac_thresh = 0.000000) accuracy = 0.661521
model 5 validation (jac_thresh = 0.010000) accuracy = 0.661521
model 5 validation (jac_thresh = 0.020000) accuracy = 0.680056
model 5 validation (jac_thresh = 0.030000) accuracy = 0.693654
model 5 validation (jac_thresh = 0.040000) accuracy = 0.708554
model 5 validation (jac_thresh = 0.050000) accuracy = 0.715780
model 5 validation (jac_thresh = 0.060000) accuracy = 0.719864
model 5 validation (jac_thresh = 0.070000) accuracy = 0.717934
model 5 validation (jac_thresh = 0.080000) accuracy = 0.715690
model 5 validation (jac_thresh = 0.090000) accuracy = 0.712548


In [48]:
# model 5: testing set performance at jac_thresh = 0.06
m5_t_pred = []
for uid, aid, y in testing_set:
    m5_t_pred.append((m5_predict(uid, aid, 0.06), y))
measure(m5_t_pred, "model 5 test")

model 5 test accuracy = 0.717277
model 5 test TP | FP | TN | FN = 5142 | 1915 | 5511 | 2284
model 5 test precision = 0.728638
model 5 test recall = 0.692432


## model 6

In [49]:
# model #6
m6_top_artists = set(a.aid for _, a in top_artists[:50])
def m6_predict(uid, aid):
    """Predict whether user uid listens to artist aid via friends and friends-of-friends.

    Reads the globals train, train_avg_artists_per_user and m6_top_artists.
    Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m6_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > train_avg_artists_per_user

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if a friend
    # of any of this user's friends has listened to this artist, or failing that, if any direct friend has
    else:
        listeners = train['artists'][aid].user_listens
        friends = train['users'][uid].friends
        # second-degree check: does any friend have a friend among the artist's listeners?
        for uid2 in friends:
            if train['users'][uid2].friends & listeners:
                return True
        # first-degree check: is any direct friend among the artist's listeners?
        return len(friends & listeners) > 0

In [50]:
# model 6: validation set performance
m6_v_pred = []
for uid, aid, y in validation_set:
    m6_v_pred.append((m6_predict(uid, aid), y))
measure(m6_v_pred, "model 6 validation")

model 6 validation accuracy = 0.757697
model 6 validation TP | FP | TN | FN = 8234 | 2492 | 8649 | 2907
model 6 validation precision = 0.767667
model 6 validation recall = 0.739072


In [51]:
# model 6: testing set performance
m6_t_pred = []
for uid, aid, y in testing_set:
    m6_t_pred.append((m6_predict(uid, aid), y))
measure(m6_t_pred, "model 6 test")

model 6 test accuracy = 0.753838
model 6 test TP | FP | TN | FN = 5442 | 1672 | 5754 | 1984
model 6 test precision = 0.764970
model 6 test recall = 0.732831


## model 7

In [53]:
# model #7
m7_top_artists = set(a.aid for _, a in top_artists[:50])
def m7_predict(uid, aid, jac_thresh):
    """Predict whether user uid listens to artist aid via similar friends who listened.

    jac_thresh -- Jaccard similarity (over training artist sets) that a friend
                  who listened to the artist must exceed.

    Reads the globals train, train_avg_artists_per_user and m7_top_artists.
    Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m7_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > train_avg_artists_per_user

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if any of this
    # user's friends has listened to this artist AND is similar enough to this user, and False otherwise
    else:
        # the user's own artist set does not change between candidates, so build it once
        own_artists = set(train['users'][uid].artist_listens)
        # candidates: friends who are also training listeners of the artist
        for uid2 in (train['artists'][aid].user_listens & train['users'][uid].friends):
            other_artists = set(train['users'][uid2].artist_listens)
            jac = len(own_artists & other_artists) * 1. / len(own_artists | other_artists)
            if jac > jac_thresh:
                return True
        return False

In [54]:
# model 7: sweep of the Jaccard threshold on the validation set
for jac_thresh in np.arange(0, 1, 0.05):
    m7_v_pred = []
    for uid, aid, y in validation_set:
        m7_v_pred.append((m7_predict(uid, aid, jac_thresh), y))
    measure(m7_v_pred, "model 7 validation (jac_thresh = %f)" % jac_thresh, True)

model 7 validation (jac_thresh = 0.000000) accuracy = 0.681806
model 7 validation (jac_thresh = 0.050000) accuracy = 0.655462
model 7 validation (jac_thresh = 0.100000) accuracy = 0.605691
model 7 validation (jac_thresh = 0.150000) accuracy = 0.564536
model 7 validation (jac_thresh = 0.200000) accuracy = 0.534871
model 7 validation (jac_thresh = 0.250000) accuracy = 0.512521
model 7 validation (jac_thresh = 0.300000) accuracy = 0.500135
model 7 validation (jac_thresh = 0.350000) accuracy = 0.494211
model 7 validation (jac_thresh = 0.400000) accuracy = 0.493762
model 7 validation (jac_thresh = 0.450000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.500000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.550000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.600000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.650000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.700000) accuracy = 0.493448
model 7 validation (jac_thresh = 0.750000) accuracy = 0

## model 8

In [56]:
# model #8
m8_top_artists = set(a.aid for _, a in top_artists[:50])
def m8_predict(uid, aid, top_lim):
    """Predict whether user uid listens to artist aid via friends' and own weighted tags.

    top_lim -- how many of each user's highest-weighted tags to consider.

    Reads the globals train and m8_top_artists. Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m8_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of
    # artists; this model uses a fixed cut-off of 30 artists rather than the training average
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > 30

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if any of this
    # user's friend's tags match the artist's tags, and if not, if any of the user's tags match the artist's tags
    else:
        # the artist's tags are the same for every friend, so look them up once
        atags = train['artists'][aid].weighted_tags

        def top_tags(some_uid):
            # the top_lim tags of a user, ranked by (weight, tag id), highest first
            ranked = sorted(((f, t) for t, f in train['users'][some_uid].weighted_tags.items()), reverse = True)
            return set(t for f, t in ranked[:top_lim])

        for uid2 in train['users'][uid].friends:
            if any(t in atags for t in top_tags(uid2)):
                return True
        return any(t in atags for t in top_tags(uid))

In [57]:
# model 8: sweep of top_lim on the validation set
for top_lim in range(0, 20):
    m8_v_pred = []
    for uid, aid, y in validation_set:
        m8_v_pred.append((m8_predict(uid, aid, top_lim), y))
    measure(m8_v_pred, "model 8 validation (top_lim = %d)" % top_lim, True)

model 8 validation (top_lim = 0) accuracy = 0.501840
model 8 validation (top_lim = 1) accuracy = 0.721883
model 8 validation (top_lim = 2) accuracy = 0.741630
model 8 validation (top_lim = 3) accuracy = 0.750381
model 8 validation (top_lim = 4) accuracy = 0.754780
model 8 validation (top_lim = 5) accuracy = 0.756979
model 8 validation (top_lim = 6) accuracy = 0.758505
model 8 validation (top_lim = 7) accuracy = 0.759088
model 8 validation (top_lim = 8) accuracy = 0.759851
model 8 validation (top_lim = 9) accuracy = 0.760075
model 8 validation (top_lim = 10) accuracy = 0.759627
model 8 validation (top_lim = 11) accuracy = 0.759088
model 8 validation (top_lim = 12) accuracy = 0.759223
model 8 validation (top_lim = 13) accuracy = 0.758639
model 8 validation (top_lim = 14) accuracy = 0.758550
model 8 validation (top_lim = 15) accuracy = 0.758101
model 8 validation (top_lim = 16) accuracy = 0.757921
model 8 validation (top_lim = 17) accuracy = 0.757831
model 8 validation (top_lim = 18) accu

In [58]:
# model 8: testing set performance at top_lim = 9
m8_t_pred = []
for uid, aid, y in testing_set:
    m8_t_pred.append((m8_predict(uid, aid, 9), y))
measure(m8_t_pred, "model 8 test")

model 8 test accuracy = 0.759561
model 8 test TP | FP | TN | FN = 5778 | 1923 | 5503 | 1648
model 8 test precision = 0.750292
model 8 test recall = 0.778077


## model 9

In [60]:
# model #9
m9_top_artists = set(a.aid for _, a in top_artists[:50])
def m9_predict(uid, aid, jac_thresh):
    """Predict whether user uid listens to artist aid via combined artist/tag similarity.

    jac_thresh -- threshold on the mean of two Jaccard similarities (training
                  artist sets and applied-tag sets) that another listener of
                  the artist must exceed.

    Reads the globals train and m9_top_artists. Returns a truthy/falsy value.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (this model's own set is used; the original reached into m1_top_artists)
    if uid not in train['users']:
        return aid in m9_top_artists

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of
    # artists; this model uses a fixed cut-off of 30 artists rather than the training average
    elif aid not in train['artists']:
        return len(train['users'][uid].artist_listens) > 30

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    elif aid in train['users'][uid].artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if this user is
    # similar to any of the users that listened to this artist
    else:
        def jaccard(u1s, u2s):
            union = u1s | u2s
            if len(union) == 0: return 0
            return len(u1s & u2s) * 1. / len(union)
        # the user's own sets do not change between candidates, so build them once
        u1s = set(train['users'][uid].artist_listens)
        u1s2 = set(train['users'][uid].tags)
        for uid2 in train['artists'][aid].user_listens:
            u2s = set(train['users'][uid2].artist_listens)
            u2s2 = set(train['users'][uid2].tags)
            # average the artist-set similarity and the tag-set similarity
            jac = (jaccard(u1s, u2s) + jaccard(u1s2, u2s2)) / 2.
            if jac > jac_thresh:
                return True
        return False

In [61]:
# model 9: coarse sweep of the Jaccard threshold (np.arange(0, 0.55, 0.05)),
# reporting validation accuracy for each value.
for jac_thresh in np.arange(0, 0.55, 0.05):
    m9_v_pred = [
        (m9_predict(uid, aid, jac_thresh), y)
        for uid, aid, y in validation_set
    ]
    measure(m9_v_pred, "model 9 validation (jac_thresh = %f)" % jac_thresh, True)

model 9 validation (jac_thresh = 0.000000) accuracy = 0.640472
model 9 validation (jac_thresh = 0.050000) accuracy = 0.783592
model 9 validation (jac_thresh = 0.100000) accuracy = 0.725698
model 9 validation (jac_thresh = 0.150000) accuracy = 0.632080
model 9 validation (jac_thresh = 0.200000) accuracy = 0.558478
model 9 validation (jac_thresh = 0.250000) accuracy = 0.521228
model 9 validation (jac_thresh = 0.300000) accuracy = 0.508527
model 9 validation (jac_thresh = 0.350000) accuracy = 0.504712
model 9 validation (jac_thresh = 0.400000) accuracy = 0.503411
model 9 validation (jac_thresh = 0.450000) accuracy = 0.502872
model 9 validation (jac_thresh = 0.500000) accuracy = 0.502513


In [62]:
# model 9: finer sweep of the Jaccard threshold (np.arange(0, 0.11, 0.01)),
# reporting validation accuracy for each value.
for jac_thresh in np.arange(0, 0.11, 0.01):
    m9_v_pred = [
        (m9_predict(uid, aid, jac_thresh), y)
        for uid, aid, y in validation_set
    ]
    measure(m9_v_pred, "model 9 validation (jac_thresh = %f)" % jac_thresh, True)

model 9 validation (jac_thresh = 0.000000) accuracy = 0.640472
model 9 validation (jac_thresh = 0.010000) accuracy = 0.692487
model 9 validation (jac_thresh = 0.020000) accuracy = 0.744188
model 9 validation (jac_thresh = 0.030000) accuracy = 0.770712
model 9 validation (jac_thresh = 0.040000) accuracy = 0.782336
model 9 validation (jac_thresh = 0.050000) accuracy = 0.783592
model 9 validation (jac_thresh = 0.060000) accuracy = 0.780585
model 9 validation (jac_thresh = 0.070000) accuracy = 0.772013
model 9 validation (jac_thresh = 0.080000) accuracy = 0.757652
model 9 validation (jac_thresh = 0.090000) accuracy = 0.742707
model 9 validation (jac_thresh = 0.100000) accuracy = 0.725698


## model 10

In [63]:
# model #10
m10_top_artists = set(a.aid for _, a in top_artists[:50])
def m10_predict(uid, aid, top_lim, jac_thresh = 0.04):
    """Predict whether user `uid` listens to artist `aid` (model 10).

    Same cascade as model 9, except that the tag half of the similarity only
    compares each user's `top_lim` highest-valued tags:

    1. Unknown user: True iff `aid` is among the first 50 entries of
       `top_artists` (`m10_top_artists`).
    2. Known user, unknown artist: True iff the user has more than 30 distinct
       artists in `artist_listens`.
    3. Known user and artist with a recorded listen: True.
    4. Otherwise: True iff some user who listened to the artist has a
       similarity to `uid` strictly greater than `jac_thresh`, where
       similarity is the mean of the Jaccard index over listened-artist sets
       and the Jaccard index over top-`top_lim` tag sets.

    Parameters:
        uid: user id, looked up in train['users'].
        aid: artist id, looked up in train['artists'].
        top_lim: how many of each user's tags to keep, taken in descending
            order of (value in `tags`, tag key).
        jac_thresh: similarity threshold; defaults to 0.04, the value that was
            previously hard-coded.

    Returns:
        bool: the prediction.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (uses this model's own set; the cell previously read m1_top_artists and left m10_top_artists unused)
    if uid not in train['users']:
        return aid in m10_top_artists

    user = train['users'][uid]

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    if aid not in train['artists']:
        return len(user.artist_listens) > 30

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    if aid in user.artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if this user is
    # similar to any of the users that listened to this artist
    def jaccard(s1, s2):
        union = s1 | s2
        if len(union) == 0: return 0
        return len(s1 & s2) * 1. / len(union)

    def top_tags(u):
        # sort (value, tag) pairs descending, keep the tags of the first top_lim pairs
        ranked = sorted([(f, t) for t, f in u.tags.items()], reverse = True)
        return set([t for f, t in ranked[:top_lim]])

    # The target user's sets do not change across candidates, so build (and sort) them once.
    user_artists = set(user.artist_listens)
    user_tags = top_tags(user)
    for uid2 in train['artists'][aid].user_listens:
        other = train['users'][uid2]
        jac = (jaccard(user_artists, set(other.artist_listens)) + jaccard(user_tags, top_tags(other))) / 2.
        if jac > jac_thresh:
            return True
    return False

In [None]:
# model 10: validation accuracy for every top_lim in 17..29.
# NOTE: the recorded output below stops at top_lim = 25 and the cell has no
# execution count, so this sweep may not have run to completion.
for top_lim in range(17, 30):
    m10_v_pred = [
        (m10_predict(uid, aid, top_lim), y)
        for uid, aid, y in validation_set
    ]
    measure(m10_v_pred, "model 10 validation (top_lim = %d)" % top_lim, True)

model 10 validation (top_lim = 17) accuracy = 0.778566
model 10 validation (top_lim = 18) accuracy = 0.777264
model 10 validation (top_lim = 19) accuracy = 0.776142
model 10 validation (top_lim = 20) accuracy = 0.775065
model 10 validation (top_lim = 21) accuracy = 0.778252
model 10 validation (top_lim = 22) accuracy = 0.778835
model 10 validation (top_lim = 23) accuracy = 0.778386
model 10 validation (top_lim = 24) accuracy = 0.779777
model 10 validation (top_lim = 25) accuracy = 0.779373


## model 11

In [519]:
# model #11
m11_top_artists = set(a.aid for _, a in top_artists[:50])
def m11_predict(uid, aid, top_lim, jac_thresh = 0.04):
    """Predict whether user `uid` listens to artist `aid` (model 11).

    Decision cascade, evaluated against the global `train` structure:

    1. Unknown user: True iff `aid` is among the first 50 entries of
       `top_artists` (`m11_top_artists`).
    2. Known user, unknown artist: True iff the user has more than 30 distinct
       artists in `artist_listens`.
    3. Known user and artist with a recorded listen: True.
    4. Otherwise: True iff some user who listened to the artist has a Jaccard
       index with `uid` strictly greater than `jac_thresh`, computed over each
       user's top-`top_lim` artists.

    Parameters:
        uid: user id, looked up in train['users'].
        aid: artist id, looked up in train['artists'].
        top_lim: how many of each user's artists to keep, taken in descending
            order of (value in `artist_listens`, artist key). The values are
            presumably listen counts -- confirm against where `train` is built.
        jac_thresh: similarity threshold; defaults to 0.04, the value that was
            previously hard-coded.

    Returns:
        bool: the prediction.
    """
    # if we haven't seen the user before, just return whether the artist is in the top 50 artists
    # (uses this model's own set; the cell previously read m1_top_artists and left m11_top_artists unused)
    if uid not in train['users']:
        return aid in m11_top_artists

    user = train['users'][uid]

    # if we have seen the user before, but not the artist, return whether this user listens to a variety of artists
    # i.e. more than the average number of artists a user has listened to
    if aid not in train['artists']:
        return len(user.artist_listens) > 30

    # if we've seen the user and the artist before, and this user has listened to this artist, return True
    if aid in user.artist_listens:
        return True

    # if we've seen both the user and artist before, but have no prior listens recorded, return True if this user is
    # similar to any of the users that listened to this artist
    def top_listened(u):
        # sort (value, artist) pairs descending, keep the artists of the first top_lim pairs
        ranked = sorted([(f, a) for a, f in u.artist_listens.items()], reverse = True)
        return set([a for f, a in ranked[:top_lim]])

    def jaccard(s1, s2):
        # Takes the two sets explicitly; the earlier helper accepted uid1 but
        # silently read the enclosing uid instead.
        union = s1 | s2
        if len(union) == 0: return 0
        return len(s1 & s2) * 1. / len(union)

    # The target user's top artists do not change across candidates, so rank them once.
    user_top = top_listened(user)
    return any(jaccard(user_top, top_listened(train['users'][uid2])) > jac_thresh
               for uid2 in train['artists'][aid].user_listens)

In [522]:
# model 11: validation accuracy for every top_lim in 15..25.
for top_lim in range(15, 26):
    m11_v_pred = [
        (m11_predict(uid, aid, top_lim), y)
        for uid, aid, y in validation_set
    ]
    # accuracy = fraction of rows whose prediction equals the label
    m11_v_acc = float(sum([p == y for p, y in m11_v_pred])) / len(m11_v_pred)
    print("model 11 validation accuracy (top_lim = %d) = %f" % (top_lim, m11_v_acc))

model 11 validation accuracy (top_lim = 15) = 0.800557
model 11 validation accuracy (top_lim = 16) = 0.802800
model 11 validation accuracy (top_lim = 17) = 0.804551
model 11 validation accuracy (top_lim = 18) = 0.806481
model 11 validation accuracy (top_lim = 19) = 0.808365
model 11 validation accuracy (top_lim = 20) = 0.809667
model 11 validation accuracy (top_lim = 21) = 0.810924
model 11 validation accuracy (top_lim = 22) = 0.810071
model 11 validation accuracy (top_lim = 23) = 0.810250
model 11 validation accuracy (top_lim = 24) = 0.809577
model 11 validation accuracy (top_lim = 25) = 0.808410
