In [2]:
import pandas as pd, numpy as np
# import matplotlib.pyplot as plt
from IPython.display import display
# %matplotlib inline
import pickle
# from time import sleep
from tqdm import tqdm
# import seaborn as sns
# import sqlite3
# con = sqlite3.connect("../lastfm_1k_sql")
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
# Load the cached 2007 (train) and 2008 (test) user-by-genre matrices.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("X_train_2007", "rb") as train_file:
    X_train_2007 = pickle.load(train_file)
with open("X_test_2008", "rb") as test_file:
    X_test_2008 = pickle.load(test_file)

In [None]:
# sidequest to build avg plays per user

# for usersha1 in a list with all the users:
#     # pull all artists/plays for a given user
#     sql = ("SELECT lastfm_360k_sql.usersha1, lastfm_360k_sql.artmbid, lastfm_360k_sql.artname, lastfm_360k_sql.plays "
#            "FROM lastfm_360k_sql "
#            "WHERE lastfm_360k_sql.usersha1 = '{0}' ").format(usersha)
#     user1 = pd.read_sql(sql, con=con)

# Build our train and test sets from 2007 and 2008 data
- Note that this code cannot be run straight from the GitHub repository, because it relies on a 4.5 GB data file that GitHub cannot host.
- The general process followed below is to load Last.FM data for a user in two chunks: one from 2007 and one from 2008.  These are our train/test splits, which let us check whether our top recommendations for a user in 2007 appear among the genres they listened to in 2008.

In [None]:
# select n random userids from lastfm_1k
def select_random_users(num_users):
    """Return a random sample of distinct user ids from ``lastfm_1k_sql``.

    Relies on a module-level database connection named ``con``; the line that
    creates it is commented out in the import cell, so it must be opened first.

    Parameters
    ----------
    num_users : int or int-like
        Maximum number of user ids to return.

    Returns
    -------
    pandas.DataFrame
        One column, ``userid``, with at most ``num_users`` rows.

    Raises
    ------
    ValueError, TypeError
        If ``num_users`` cannot be converted to an integer.
    """
    # Coerce to int so only a number can ever reach the SQL text, and emit it
    # as a bare numeric LIMIT instead of the quoted, unspaced "LIMIT'n'".
    sql = ("SELECT DISTINCT userid "
           "FROM lastfm_1k_sql "
           "ORDER BY Random() "
           "LIMIT {0}").format(int(num_users))

    return pd.read_sql(sql, con=con)

# build a unit vector genre list
def select_unique_genres():
    """Return a one-row frame whose columns are the distinct genre names.

    Reads every distinct ``genre`` from ``artist_genre_percent_sql`` through the
    module-level connection ``con``, re-indexes the result by genre name and
    transposes it so the genres become columns.

    NOTE(review): passing a DataFrame together with ``index=`` re-indexes it
    rather than relabelling it, so the cells are likely all NaN - confirm.
    """
    query = ("SELECT DISTINCT genre "
             "FROM artist_genre_percent_sql ")
    genres = pd.read_sql(query, con=con)

    by_genre = pd.DataFrame(genres, index=genres['genre'])
    return by_genre.T

# Built once when this cell runs; select_unique_genres() queries the database,
# so the module-level connection `con` must already be open.
genre_vector = select_unique_genres()

# pull all artists/plays for a given user
def artists_time(userid):
    """Return every play event recorded for one user.

    Parameters
    ----------
    userid
        Value matched against ``lastfm_1k_sql.userid``; it is interpolated into
        the query as a quoted string.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` and ``artid``, one row per play, read through the
        module-level connection ``con``.
    """
    query_template = ("SELECT timestamp, artid "
                      "FROM lastfm_1k_sql "
                      "WHERE userid = '{0}' ")
    return pd.read_sql(query_template.format(userid), con=con)

In [None]:
# split dataset by user-defined year (through may 5 2009)
# default should be comparing 2007 and 2008

def build_user_vector(userid, year1, test_values=False, full_matrix=False, scale_by_plays=True):
    """Build one user's genre profile for a single calendar year.

    Queries the module-level connection ``con`` twice: once for the user's play
    count per artist in ``year1``, once for the genre percentages of those
    artists. The genre percentages are pivoted into an artist-by-genre matrix
    and, optionally, each artist row is multiplied by that artist's play count.

    Parameters
    ----------
    userid
        User id matched against ``lastfm_1k_sql.userid``.
    year1
        Year whose plays are used (interpolated into the timestamp bounds).
    test_values : bool
        Debug switch: return the per-artist row totals instead of the profile.
    full_matrix : bool
        Debug switch: return the whole artist-by-genre matrix.
    scale_by_plays : bool
        Multiply each artist's genre percentages by the artist's play count.

    Returns
    -------
    dict, pandas.Series, pandas.DataFrame or None
        ``None`` when the user has no plays, or no genre rows, in ``year1``.
        Otherwise a dict mapping genre -> summed value over all artists (default),
        the per-artist totals (``test_values``) or the matrix (``full_matrix``).
    """
    print(userid)
    sql1 = ("SELECT artid, COUNT(lastfm_1k_sql.artid) AS plays "
           "FROM lastfm_1k_sql "
           "WHERE userid = '{0}' AND "
           "(timestamp >= '{1}-01-01T00:00:00.000' AND "
           "timestamp <= '{1}-12-31T23:59:59.999')"
           "GROUP BY artid ").format(userid, year1)
    user1 = pd.read_sql(sql1, con=con)

    # no plays recorded for this user in that year
    if user1.empty:
        return None

    # pull the user's artist/genre mix
    sql2 = ("SELECT artist_genre_percent_sql.artmbid as id, "
           "artist_genre_percent_sql.artname, "
           "artist_genre_percent_sql.genre, "
           "artist_genre_percent_sql.genre_percent "
           "FROM artist_genre_percent_sql "
           "WHERE id in (SELECT artid "
           "FROM lastfm_1k_sql "
           "WHERE userid = '{0}' AND "
           "(timestamp >= '{1}-01-01T00:00:00.000' AND "
           "timestamp <= '{1}-12-31T23:59:59.999')"
           "GROUP BY artid )").format(userid, year1)
    artist_genre = pd.read_sql(sql2, con=con).rename(columns={'id': 'artmbid'})

    # none of the user's artists has genre information
    if artist_genre.empty:
        return None

    # pivot to one row per artist, one column per genre
    joined_matrix = artist_genre.pivot_table(values='genre_percent',
                                             index='artmbid',
                                             columns='genre',
                                             dropna=False)

    # scale the user's genre preference by # of plays of that artist
    if scale_by_plays:
        # One aligned multiply instead of a per-artist lookup loop. Artists that
        # have plays but no genre row are simply absent from the matrix, and a
        # matrix row without a play count keeps its value (factor 1), which is
        # what the old loop's skipped KeyError amounted to.
        plays_by_artist = user1.groupby('artid')['plays'].sum()
        scale = plays_by_artist.reindex(joined_matrix.index).fillna(1)
        joined_matrix = joined_matrix.mul(scale, axis=0)

    # testing code. these should be left false for production
    if test_values:
        joined_matrix['e'] = joined_matrix.sum(axis=1)
        return joined_matrix['e'][~joined_matrix['e'].isnull()]
    if full_matrix:
        return joined_matrix

    # total genre profile of the user, keyed by genre name
    return joined_matrix.sum().to_dict()

In [None]:
# build genre taste matrix for specified number of random users w/ a train/test split

def _collect_user_vectors(user_ids, year):
    """Build a user-by-genre frame for ``year``, skipping users with no data."""
    vectors = []
    kept_ids = []
    for user_id in tqdm(user_ids):
        user_vector = build_user_vector(user_id, year)
        # build_user_vector returns None when the user has nothing in that year
        if user_vector is not None:
            vectors.append(user_vector)
            kept_ids.append(user_id)
    return pd.DataFrame(vectors, index=kept_ids)


def build_taste_matrix(num_users, year1, year2):
    """Build train/test genre-taste matrices for a random sample of users.

    Parameters
    ----------
    num_users : int
        Number of random users to draw with ``select_random_users``.
    year1, year2
        Years used for the train and the test matrix respectively.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X_train`` and ``X_test``: one row per user (index named ``userid``),
        one column per genre, missing values filled with 0. Only users that have
        a profile in BOTH years are kept.
    """
    how_many = select_random_users(num_users)
    user_ids = list(how_many['userid'])

    # build user vectors for each user. if no data is available in a year, skip
    X_train = _collect_user_vectors(user_ids, year1)
    X_test = _collect_user_vectors(user_ids, year2)

    # remove users that didn't listen to music in BOTH years
    shared_users = list(set(X_train.index) & set(X_test.index))
    X_train = X_train[X_train.index.isin(shared_users)].fillna(0)
    X_test = X_test[X_test.index.isin(shared_users)].fillna(0)

    # name the index
    X_train.index.name = 'userid'
    X_test.index.name = 'userid'

    return X_train, X_test

In [None]:
# X_train_2007, X_test_2008 = build_taste_matrix(800, 2007, 2008)
# pickle.dump(X_train_2007, open('X_train_2007', "wb" ) )
# pickle.dump(X_test_2008, open('X_test_2008', "wb" ) )



# Run recommendations for everyone in our train set and check if they appear in test set
### Cosine Similarity w/ weighted recommendations

In [2]:
# pull all the relevant information for a random user or a user w/ given userid
def define_user(train, test, random=True, userid=None):
    """Collect the train/test listening data for one user.

    Parameters
    ----------
    train, test : pandas.DataFrame
        User-by-genre matrices indexed by user id.
    random : bool
        When true, draw one user with ``train.sample()``; otherwise use ``userid``.
    userid
        Index label of the user to look up when ``random`` is false.

    Returns
    -------
    tuple
        ``(user id, train row, train genres > 0 sorted descending,
        test row, test genres > 0 sorted descending)``. With ``random=True`` the
        train row is the one-row DataFrame returned by ``sample()``; otherwise it
        is the ``train.loc[userid]`` row.
    """
    def _listened(genre_row):
        # keep only genres with a positive weight, strongest first
        return genre_row[genre_row > 0].sort_values(ascending=False)

    if random:
        # sample 1 user from our list
        random_user = train.sample()
        random_usersha1 = random_user.index[0]
    else:
        random_usersha1 = userid
        random_user = train.loc[random_usersha1]
    random_user_test = test.loc[random_usersha1]

    return (random_usersha1,
            random_user,
            _listened(train.loc[random_usersha1]),
            random_user_test,
            _listened(random_user_test))

# define_user(X_train_2007, X_test_2008)

In [None]:
# build a similarity matrix, select our random user, 
# remove genres the user has listened to, and output recommendations

def cosine_recommendations(no_accounting_for_taste, random_usersha1):
    """Rank unheard genres for one user by similarity-weighted listening.

    Parameters
    ----------
    no_accounting_for_taste : pandas.DataFrame
        User-by-genre matrix indexed by user id.
    random_usersha1
        Index label of the user to recommend for.

    Returns
    -------
    pandas.Series
        Genres the user has a weight below 0.01 for, scored by the sum over all
        OTHER users of (cosine similarity to the target user * that user's genre
        weight), sorted descending.
    """
    # Only the target user's similarity column is needed, so compare every user
    # against that single row instead of building the full N x N matrix
    # (same values up to floating-point rounding).
    target_row = no_accounting_for_taste.loc[[random_usersha1]]
    sim_user = pd.Series(
        cosine_similarity(no_accounting_for_taste, target_row).ravel(),
        index=no_accounting_for_taste.index)

    # anything with (near) zero weight hasn't been listened to
    user_mask = no_accounting_for_taste.loc[random_usersha1] < 0.01

    user_genre_matrix = no_accounting_for_taste.loc[:, user_mask]  # remove genres user has listened to
    user_genre_matrix = user_genre_matrix.multiply(sim_user, axis='rows')  # weight genres by user similarity
    user_genre_matrix = user_genre_matrix.drop(random_usersha1, axis=0)  # drop user from own recs
    user_recommend = user_genre_matrix.sum().sort_values(ascending=False)  # sum genre totals and order
    return user_recommend

In [None]:
# jaccard function to compare our target users' train and test sets
def jaccard(train_recs, test_vecs):
    """Score recommended genres against the genres actually listened to.

    Parameters
    ----------
    train_recs : pandas.Series
        Recommendations, indexed by genre, best first.
    test_vecs : pandas.Series
        Newly listened genres, indexed by genre, most listened first.

    Returns
    -------
    (score, diff_items)
        ``score`` is 1 when the top recommendation equals the top listened genre
        ("bullseye"); otherwise the Jaccard index of the two genre sets, or 0
        when both sets are empty. ``diff_items`` is the number of listened
        genres that were not recommended.
    """
    # standard jaccard setup
    a = set(train_recs.index)
    b = set(test_vecs.index)
    diff_items = len(b.difference(a))

    # bullseye: our number one recommendation is the number one result in the
    # test set. Only comparable when both series have at least one entry.
    if len(train_recs) and len(test_vecs) and test_vecs.index[0] == train_recs.index[0]:
        return 1, diff_items

    # otherwise, return the score and the number of dissimilar items;
    # an empty union (nothing recommended, nothing listened) scores 0
    union = a.union(b)
    if not union:
        return 0, diff_items
    return len(a.intersection(b)) / len(union), diff_items

In [None]:
def eval_cos_recs(X_train_2007, X_test_2008):
    """Evaluate cosine-similarity recommendations for every user in the train set.

    For each user the top 20 recommendations built from the train matrix are
    compared (via ``jaccard``) with the top 20 genres the user listened to in
    the test matrix but not in the train matrix.

    Parameters
    ----------
    X_train_2007, X_test_2008 : pandas.DataFrame
        User-by-genre matrices; every train user must also be in the test index.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``jaccard`` and ``diff items``
        (an empty frame when the train set has no users).
    """
    # user id -> (jaccard score, number of unmatched new genres)
    results_list = {}

    # loop over each user in the train set
    for user in tqdm(X_train_2007.index):

        # build the rec for a given user
        cos_recs_2007 = cosine_recommendations(X_train_2007, user)

        # pull other listening information for the given user
        _, _, vector, _, vector_test = \
            define_user(X_train_2007, X_test_2008, random=False, userid=user)

        # compare 2007 recommendations ( = 2007 recs - 2007 listened)
        # to 2008 new ( = 2008  listened - 2007 listened) to see if new == recommended
        test_mask = ~vector_test.index.isin(vector.index)  # filter out 2007 listened from 2008 listened
        results_list[user] = jaccard(cos_recs_2007.head(20), vector_test[test_mask].head(20))

    # build the frame once, after the loop, instead of on every iteration
    results_df = pd.DataFrame(results_list).T
    return results_df.rename(columns={0: 'jaccard', 1: 'diff items'})

In [None]:
# results_list = eval_cos_recs(X_train_2007, X_test_2008)
# results_df = pd.DataFrame(results_list).T
# results_df.rename(columns={0:'jaccard',1:'diff items'}, inplace=True)
# pickle.dump(results_df, open('cos_results_df', "wb" ) )

In [None]:
# NOTE(review): `cos_results_df` is only assigned by the pickle.load in the next cell; on a
# fresh kernel this cell raises NameError unless that cell has already been run.
cos_results_df

In [3]:
# Load the cached cosine-similarity evaluation (one row per user).
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("cos_results_df", "rb") as results_file:
    cos_results_df = pickle.load(results_file)

n_users = len(cos_results_df)

# share of users with at least one recommended genre among their new genres
hit_ratio = len(cos_results_df[cos_results_df['jaccard'] > 0]) / n_users
display("hit ratio", hit_ratio)

# bullseye = top recommendation matched the top new genre (534 users in the stored run)
bullseyes = len(cos_results_df[cos_results_df['jaccard'] == 1])
bullseye_ratio = bullseyes / n_users
display("bullseye ratio", bullseye_ratio)

display("bullseyes", bullseyes)

'hit ratio'

0.5617977528089888

'bullseye ratio'

0.0149812734082397

'bullseyes'

8

### SVD w/ Cosine Similarity

In [3]:
# return all genres from a given user with the given threshold
def get_genres(row, no_accounting_for_taste, genre_thresh):
    """Return the genres whose weight for one user exceeds ``genre_thresh``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['userid']``, an index label of the taste matrix.
    no_accounting_for_taste : pandas.DataFrame
        User-by-genre matrix indexed by user id.
    genre_thresh : number
        Exclusive lower bound on the genre weight.

    Returns
    -------
    list
        Genre (column) labels in column order.
    """
    # look the user's row up once instead of once per genre column
    user_row = no_accounting_for_taste.loc[row['userid']]
    return user_row[user_row > genre_thresh].index.tolist()

# return all genres from a given user that they ever listened to
def get_all_genres(userid, no_accounting_for_taste):
    """Return every genre the user has a positive weight for.

    Parameters
    ----------
    userid
        Index label of the user in the taste matrix.
    no_accounting_for_taste : pandas.DataFrame
        User-by-genre matrix indexed by user id.

    Returns
    -------
    list
        Genre (column) labels in column order.
    """
    # look the user's row up once instead of once per genre column
    user_row = no_accounting_for_taste.loc[userid]
    return user_row[user_row > 0].index.tolist()

# look up a userid by index
def user_lookup(user_index, no_accounting_for_taste):
    """Translate a row position in the taste matrix into that row's index label."""
    selected_row = no_accounting_for_taste.iloc[user_index]
    return selected_row.name

In [4]:
# reducing from 11,188 genres down to 40 components still yields 97% variance explanation
# clearly, there are many genres that overlap, 
# and this should reduce some of that dimensionality

def SVD_results(no_accounting_for_taste, random_user, user_thresh, genre_thresh,
                n_components=40, random_state=None):
    """Recommend genres liked by the users closest to ``random_user`` in SVD space.

    The taste matrix is reduced with ``TruncatedSVD``, every user is compared to
    the target user by cosine similarity in the reduced space, and the genres
    that the nearest users weight above ``genre_thresh`` are collected, minus
    every genre the target user has listened to at all.

    Parameters
    ----------
    no_accounting_for_taste : pandas.DataFrame
        User-by-genre matrix indexed by user id.
    random_user : pandas.Series
        The target user's row of that matrix; ``random_user.name`` must be the
        user id.
    user_thresh : int
        Exclusive end of the neighbour slice: neighbours at positions
        ``1 .. user_thresh - 1`` of the similarity ranking are used (the
        shortlist holds 10 users, so larger values have no further effect).
    genre_thresh : number
        Minimum weight (exclusive) for a neighbour's genre to count.
    n_components : int
        Number of SVD components (default 40, the original fixed value).
    random_state : int, RandomState or None
        Forwarded to ``TruncatedSVD``; pass an int for reproducible results.

    Returns
    -------
    set or list
        Set of recommended genres. If looking up the neighbours raises
        ``ValueError`` the sentinel list ``['nothing here']`` is returned.
    """
    n_neighbours = 10  # size of the similarity shortlist, target user included

    svd = TruncatedSVD(n_components, random_state=random_state)
    svd_matrix = svd.fit_transform(no_accounting_for_taste)

    # transform() expects 2-D input (1 x n_genres); a bare Series is 1-D
    user_frame = random_user.to_frame().T
    svd_user = svd.transform(user_frame)

    # similarity between every user and the target user, flattened to 1-D.
    # at the moment this similarity takes into account ALL genres, even
    # low-value ones; those are only filtered out afterwards
    similarity = cosine_similarity(svd_matrix, svd_user).ravel()

    # the most similar users, by row position in the taste matrix.
    # position 0 is expected to be the target user itself
    results_matrix = (pd.DataFrame({'cosine_sim': similarity})
                      .sort_values('cosine_sim', ascending=False)
                      .head(n_neighbours))

    try:
        # look up userids and the genres each of them weights above the threshold.
        # Plain lists avoid DataFrame.apply(axis=1), whose handling of list
        # results can itself raise ValueError.
        neighbour_ids = [user_lookup(position, no_accounting_for_taste)
                         for position in results_matrix.index]
        neighbour_genres = [get_genres({'userid': neighbour_id}, no_accounting_for_taste, genre_thresh)
                            for neighbour_id in neighbour_ids]
    except ValueError:
        # kept from the original contract: callers receive this sentinel list
        return ['nothing here']

    # to reduce variance, compare the neighbours' curated lists to the target
    # user's ENTIRE list, so a genre the user has listened to a little
    # (below the threshold) is never recommended
    random_user_full_list = set(get_all_genres(random_user.name, no_accounting_for_taste))

    # union instead of summing lists: also well-defined when the slice is empty
    all_other_list = set().union(*neighbour_genres[1:user_thresh])

    # all items appearing in our similar users' top lists and not in our random user's full list
    return all_other_list.difference(random_user_full_list)

In [5]:
# jaccard function to compare our target user's train and test set
def jaccard_svd(train_recs, test_vecs):
    """Jaccard score between recommended genres and newly listened genres.

    Parameters
    ----------
    train_recs : iterable
        Recommended genre names.
    test_vecs : pandas.Series
        Newly listened genres, indexed by genre name.

    Returns
    -------
    (jaccard_score, diff_items)
        The Jaccard index of the two genre sets (0 when both are empty) and the
        number of listened genres that were not recommended.
    """
    recommended = set(train_recs)
    listened = set(test_vecs.index)

    shared = recommended & listened
    combined = recommended | listened
    missed = listened - recommended

    # nothing recommended and nothing listened: score 0 rather than dividing by zero
    if len(combined) == 0:
        jaccard_score = 0
    else:
        jaccard_score = len(shared) / len(combined)

    return jaccard_score, len(missed)

In [6]:
def eval_svd_recs(X_train_2007, X_test_2008, user_threshold, genre_threshold):
    """Evaluate SVD-based recommendations for every user in the train set.

    Parameters
    ----------
    X_train_2007, X_test_2008 : pandas.DataFrame
        User-by-genre matrices; every train user must also be in the test index.
    user_threshold, genre_threshold
        Passed to ``SVD_results`` as ``user_thresh`` and ``genre_thresh``.

    Returns
    -------
    dict
        user id -> ``(jaccard score, diff items)`` as returned by ``jaccard_svd``,
        comparing the recommendations with the genres that appear in the test
        row but not in the train row.
    """
    scores_by_user = {}

    for user in tqdm(X_train_2007.index):
        # listening information for this user in both years
        profile = define_user(X_train_2007, X_test_2008, random=False, userid=user)
        train_row, train_listened, test_listened = profile[1], profile[2], profile[4]

        # recommendations derived from the train year only
        recommended = SVD_results(X_train_2007, train_row,
                                  user_thresh=user_threshold,
                                  genre_thresh=genre_threshold)

        # keep only test-year genres that were not already listened to in the train year
        is_new = ~test_listened.index.isin(train_listened.index)
        scores_by_user[user] = jaccard_svd(recommended, test_listened[is_new])

    return scores_by_user

In [7]:
# set threshold for eliminating "bad" genres.  this is essentially our hyperparameter that filters for the 
# weighted number we've been carrying forward on our matrices, ranging from 0 to hundreds
# a threshold of 50 could mean 50 listens to an artist with 1 tag, or 100 listens to an artist with 2 equal tags, etc

# grid search over user_thresh=5, 10
# genre_thresh = 20, 30, 40, 50

# Silence DeprecationWarning output for the cells that follow.
# NOTE(review): presumably aimed at warnings emitted during the SVD grid search below -
# confirm which call raises them before relying on this filter.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
# results_dict1 = {}
# for user_thresh in (5,10):
#     for genre_thresh in (20,30):
#         results_dict1[(user_thresh,genre_thresh)] = eval_svd_recs(
#                                                         X_train_2007, 
#                                                         X_test_2008, 
#                                                         user_thresh, 
#                                                         genre_thresh)

In [None]:
# pickle.dump(results_dict1, open('results_dict1', "wb" ) )

In [None]:
# results_dict2 = {}
# for user_thresh in (5,10):
#     for genre_thresh in (40,50):
#         results_dict2[(user_thresh,genre_thresh)] = eval_svd_recs(
#                                                         X_train_2007, 
#                                                         X_test_2008, 
#                                                         user_thresh, 
#                                                         genre_thresh)

In [None]:
# pickle.dump(results_dict2, open('results_dict2', "wb" ) )

In [8]:
# user_thresh = 10

# results_dict11 = {}
# for genre_thresh in (20,30):
#     results_dict11[(user_thresh,genre_thresh)] = eval_svd_recs(
#                                                         X_train_2007, 
#                                                         X_test_2008, 
#                                                         user_thresh, 
#                                                         genre_thresh)

100%|██████████| 534/534 [6:49:47<00:00, 46.04s/it]  
100%|██████████| 534/534 [6:57:10<00:00, 46.87s/it]  


In [9]:
# pickle.dump(results_dict11, open('results_dict11', "wb" ) )

In [10]:
# user_thresh = 5

# results_dict22 = {}
# for genre_thresh in (40,50):
#     results_dict22[(user_thresh,genre_thresh)] = eval_svd_recs(
#                                                         X_train_2007, 
#                                                         X_test_2008, 
#                                                         user_thresh, 
#                                                         genre_thresh)

 16%|█▌        | 83/534 [1:05:05<5:53:40, 47.05s/it]

KeyboardInterrupt: 

In [None]:
# pickle.dump(results_dict22, open('results_dict22', "wb" ) )

In [12]:
# get the ratios from each of my dataframes

def gimme_ratios(svd_results_dict):
    """Turn one evaluation result dict into a frame plus its hit ratio.

    Parameters
    ----------
    svd_results_dict : dict
        user id -> ``(jaccard score, diff items)``.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with one row per user and columns ``jaccard`` / ``diff items``,
        and the fraction of users whose jaccard score is greater than 0.
    """
    per_user = pd.DataFrame(svd_results_dict).T
    per_user = per_user.rename(columns={0: 'jaccard', 1: 'diff items'})

    users_with_hit = per_user.loc[per_user['jaccard'] > 0]
    return per_user, len(users_with_hit) / len(per_user)

In [13]:
# Load the cached SVD grid-search results, keyed by (user_thresh, genre_thresh).
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("results_dict11", "rb") as results_file:
    results_dict11 = pickle.load(results_file)
with open("results_dict2", "rb") as results_file:
    results_dict2 = pickle.load(results_file)

# df520, hit_ratio520 = gimme_ratios(results_dict1[(5,20)])
# df530, hit_ratio530 = gimme_ratios(results_dict1[(5,30)])
df540, hit_ratio540 = gimme_ratios(results_dict2[(5,40)])
df550, hit_ratio550 = gimme_ratios(results_dict2[(5,50)])
df1020, hit_ratio1020 = gimme_ratios(results_dict11[(10,20)])
df1030, hit_ratio1030 = gimme_ratios(results_dict11[(10,30)])
df1040, hit_ratio1040 = gimme_ratios(results_dict2[(10,40)])
df1050, hit_ratio1050 = gimme_ratios(results_dict2[(10,50)])

# one column per (user_thresh, genre_thresh) combination, e.g. 'h1020' = (10, 20)
d = {
# df520: hit_ratio520,
# df530: hit_ratio530,
'h540': hit_ratio540,
'h550': hit_ratio550,
'h1020': hit_ratio1020,
'h1030': hit_ratio1030,
'h1040': hit_ratio1040,
'h1050': hit_ratio1050
}
df = pd.DataFrame(data=d, index=['hit_ratio'])
df

Unnamed: 0,h1020,h1030,h1040,h1050,h540,h550
hit_ratio,0.696629,0.59176,0.511236,0.432584,0.314607,0.2397
