In [1]:
import pandas as pd
import numpy as np

### Importing Data

In [146]:
#Reading the 'plays' dataset, turning all items into integers
users = pd.DataFrame(np.array([i.strip().split("\t") for i in open('lastfm_data/user_artists.dat', 'r', encoding='latin-1').readlines()]),
                       columns = ['userID', 'artistID', 'plays'])
users = users.drop(users.index[0])
users = users.apply(pd.to_numeric)
users.head()

#Reading 'artists' dataset, splitting the first column into three columns on the newline, turning artistid into int
artists = pd.DataFrame(open('lastfm_data/artists.dat', 'r',  encoding='latin-1').readlines())
artists = pd.DataFrame(artists[0].str.split("\t").values.tolist(),columns = ['artistID', 'name', 'url', 'pictureURL'])
artists = artists.drop(artists.index[0])
artists = artists.drop(columns=['pictureURL'])
artists['artistID'] = artists['artistID'].apply(pd.to_numeric)


### Check how many artists each user has listened to

In [141]:
df = users.groupby('userID')['artistID'].nunique()
df.head()

userID
2    50
3    50
4    50
5    50
6    50
Name: artistID, dtype: int64

In [147]:
print(users.head())
print()
print(artists.head())

   userID  artistID  plays
1       2        51  13883
2       2        52  11690
3       2        53  11351
4       2        54  10300
5       2        55   8983

   artistID               name                                         url
1         1       MALICE MIZER       http://www.last.fm/music/MALICE+MIZER
2         2    Diary of Dreams    http://www.last.fm/music/Diary+of+Dreams
3         3  Carpathian Forest  http://www.last.fm/music/Carpathian+Forest
4         4       Moi dix Mois       http://www.last.fm/music/Moi+dix+Mois
5         5        Bella Morte        http://www.last.fm/music/Bella+Morte


### Creating The Identity Matrix

In [148]:
# Create a utility matrix A by pivoting ratings.df
A = users.pivot(index = 'userID', columns = 'artistID', values = 'plays').fillna(0)
A.head()

artistID,1,2,3,4,5,6,7,8,9,10,...,18736,18737,18738,18739,18740,18741,18742,18743,18744,18745
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Mean Normalization on Utility Matrix

In [149]:
# Mean normalize dataframe A to numpy array A_norm
A_values = A.values
ratings_mean = np.mean(A_values, axis = 1)
A_norm = A_values - ratings_mean.reshape(-1,1)
A_norm[:10]

#Now we have an array for each user that has the mean of all of their plays subtracted from each rating


array([[-9.56992967, -9.56992967, -9.56992967, ..., -9.56992967,
        -9.56992967, -9.56992967],
       [-1.16271552, -1.16271552, -1.16271552, ..., -1.16271552,
        -1.16271552, -1.16271552],
       [-1.52733666, -1.52733666, -1.52733666, ..., -1.52733666,
        -1.52733666, -1.52733666],
       ...,
       [-1.45173548, -1.45173548, -1.45173548, ..., -1.45173548,
        -1.45173548, -1.45173548],
       [-1.61768376, -1.61768376, -1.61768376, ..., -1.61768376,
        -1.61768376, -1.61768376],
       [-2.43421053, -2.43421053, -2.43421053, ..., -2.43421053,
        -2.43421053, -2.43421053]])

### Matrix Factorization with SVDs

Create three matrices from original utility matrix.
* A = original utility matrix
* u = user features. These are how each user likes each feature.
* s = weights, a diagonal matrix.
* vt = movie features. How relevant each feature is to each movie

In [150]:
from scipy.sparse.linalg import svds

u,s,vt = svds(A_norm, k=60)

In [151]:
vt

array([[ 1.15609499e-04,  4.39285977e-05,  7.05967897e-05, ...,
         7.74246550e-05,  7.74083766e-05, -3.33742812e-05],
       [-3.27055579e-04, -3.08358837e-04, -3.53696723e-04, ...,
        -3.58202589e-04, -3.58189553e-04, -4.77813789e-04],
       [-2.32818108e-04, -1.34890574e-04, -2.60540091e-04, ...,
        -2.69039616e-04, -2.69029214e-04, -5.68397824e-04],
       ...,
       [-7.35426873e-05, -7.28547628e-05, -7.30640600e-05, ...,
        -7.30611205e-05, -7.30610978e-05, -7.29865703e-05],
       [-8.40224751e-05, -8.04596671e-05, -8.40432493e-05, ...,
        -8.40391538e-05, -8.40391282e-05, -8.39885129e-05],
       [-6.55655412e-05, -8.42195866e-05, -8.97258839e-05, ...,
        -8.97156635e-05, -8.97156914e-05, -8.95932244e-05]])

### Creating diagonal matrix for sigma factors
This is how we provide weights for each user feature-to-movie relevance pairing.

Now we have a 2D diagonal matrix of 50x50

In [152]:
print(s, s.shape)
sigma = np.diag(s)
sigma, sigma.shape

[ 45580.59997173  46601.44346045  46866.06935504  47634.78360995
  49068.92385253  49489.63017684  51016.21086369  51512.18973387
  51892.78436588  52638.97272624  53152.325196    54617.21501538
  55053.55601408  55674.11736015  56428.86911665  57149.17061034
  57251.10442122  58263.90660131  58808.97338725  59930.78126213
  61956.5194954   64181.01701669  67665.18963723  68714.5581343
  69637.2346044   70194.84155807  72979.04715761  77421.04229318
  79502.62218135  81993.17103901  84487.95159216  86937.37316232
  96108.78635272  96609.97240506  99884.94765218 101301.09459364
 103000.98267071 103502.1770617  109469.49320176 114535.95034243
 115498.3513451  121365.37000331 124723.73606083 132824.51859405
 134758.71833846 140273.58505092 146172.98121334 151172.54630454
 171161.84123895 179649.27672706 183282.2335299  232682.90111541
 247464.67255507 255387.53438738 258227.2051966  269388.22289065
 317571.78846536 323526.36832677 324895.90434521 405527.41904943] (60,)


(array([[ 45580.59997173,      0.        ,      0.        , ...,
              0.        ,      0.        ,      0.        ],
        [     0.        ,  46601.44346045,      0.        , ...,
              0.        ,      0.        ,      0.        ],
        [     0.        ,      0.        ,  46866.06935504, ...,
              0.        ,      0.        ,      0.        ],
        ...,
        [     0.        ,      0.        ,      0.        , ...,
         323526.36832677,      0.        ,      0.        ],
        [     0.        ,      0.        ,      0.        , ...,
              0.        , 324895.90434521,      0.        ],
        [     0.        ,      0.        ,      0.        , ...,
              0.        ,      0.        , 405527.41904943]]), (60, 60))

### Making Predictions from the Decomposed Matrices

We now just multiply u, s, and vt



In [153]:
predictions = np.dot(np.dot(u, sigma), vt) + ratings_mean.reshape(-1, 1)


In [154]:
predictions_df = pd.DataFrame(predictions, columns = A.columns)
predictions_df.head()

artistID,1,2,3,4,5,6,7,8,9,10,...,18736,18737,18738,18739,18740,18741,18742,18743,18744,18745
0,3.65459,7.138276,3.823821,3.91354,4.189498,3.381363,30.731426,5.800976,1.114285,2.832461,...,3.915284,3.884404,3.86969,3.866883,3.861172,3.859332,3.858655,3.857977,3.85788,3.705503
1,1.173797,1.201699,1.173851,1.173633,1.192049,1.208117,0.935349,1.148363,1.196357,1.177371,...,1.17729,1.176013,1.175405,1.175289,1.175053,1.174977,1.174949,1.174921,1.174917,1.173811
2,0.608429,0.681441,0.408766,0.437475,0.543157,0.947715,-16.863199,0.766027,2.153568,1.724729,...,0.416733,0.412291,0.410174,0.40977,0.408949,0.408684,0.408587,0.408489,0.408475,0.404656
3,-0.033741,0.173981,-0.061615,-0.021302,0.08543,0.491645,0.12085,0.09305,3.063983,2.507536,...,-0.050836,-0.055908,-0.058324,-0.058786,-0.059724,-0.060026,-0.060137,-0.060248,-0.060264,0.266436
4,0.049237,0.046018,0.049468,0.049634,0.049319,0.046179,0.336372,0.042641,0.043525,0.047553,...,0.049366,0.049414,0.049436,0.04944,0.049449,0.049452,0.049453,0.049454,0.049454,0.0505


### Making Recommendations

In [157]:
# Recommending top movies not yet rated by user
def recommender(predictions_df, UID, artists_df, original_plays_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row = UID-1 # UID starts at 1, not 0
    sorted_predictions = predictions_df.iloc[user_row].sort_values(ascending = False)
    
    # Get the original user data and merge in the movie information 
   # user_data = original_ratings_df.iloc[user_row]
    user_data = original_plays_df[original_plays_df.userID == (UID)]
    user_full = user_data.merge(artists_df, how ='left', left_on = 'artistID', right_on = 'artistID').sort_values(['plays'], ascending = False)

    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    # ~ all MIDS that are NOT in user_full MIS
    recommendations = artists_df[~artists_df['artistID'].isin(user_full['artistID'])] \
                      .merge(pd.DataFrame(sorted_predictions).reset_index(), 
                             how = 'left', left_on = 'artistID', right_on = 'artistID') \
                      .rename(columns = {user_row: 'Predictions'}) \
                      .sort_values('Predictions', ascending = False) \
                      .iloc[:num_recommendations, :-1]
    return f"The top {num_recommendations} artists for user {UID} are: ", recommendations

In [160]:
recommender(predictions_df, 45, artists, users, 10)

('The top 10 artists for user 45 are: ',       artistID                 name  \
 322        344         Taylor Swift   
 300        318          Hilary Duff   
 281        294          Leona Lewis   
 1400      1459     Carrie Underwood   
 288        304      David Archuleta   
 653        681          Demi Lovato   
 675        704  The Pretty Reckless   
 473        498             Paramore   
 866        903        Amy Winehouse   
 498        523        Lindsay Lohan   
 
                                                url  
 322          http://www.last.fm/music/Taylor+Swift  
 300           http://www.last.fm/music/Hilary+Duff  
 281           http://www.last.fm/music/Leona+Lewis  
 1400     http://www.last.fm/music/Carrie+Underwood  
 288       http://www.last.fm/music/David+Archuleta  
 653           http://www.last.fm/music/Demi+Lovato  
 675   http://www.last.fm/music/The+Pretty+Reckless  
 473              http://www.last.fm/music/Paramore  
 866         http://www.last.fm/