In [423]:
%pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [424]:
import pandas as pd
from surprise import Dataset,Reader,SVDpp, accuracy
from  pathlib import Path

In [425]:
df=pd.read_csv(Path("hetrec2011-lastfm-2k/user_artists.dat"),delimiter='\t',header=None)
df2=pd.read_csv(Path("hetrec2011-lastfm-2k/artists.dat"),delimiter='\t',header=None)
df2.columns=['artistID','name','url','pictureURL']
df.columns=['userID','artistID','weight']

In [426]:
df=df.iloc[1:]
df2=df2.iloc[1:]
df.reset_index(drop=True, inplace=True)
df.head()
df['weight']=df['weight'].astype(int)

## Collaborative Filtering (listen count)

### **model** - SVD++


In [427]:

#normalized_rating=min_rating+ max_listen_count−min_listen_count(listen_count−min_listen_count)×(max_rating−min_rating)
from sklearn.preprocessing import MinMaxScaler
import numpy as np
df['weight']=df['weight'].astype(int)
df['weight'] = np.log1p(df['weight'])
scaler=MinMaxScaler(feature_range=(1,5))
df['weight']=scaler.fit_transform(df[['weight']])

In [428]:
df.head()

Unnamed: 0,userID,artistID,weight
0,2,51,3.928868
1,2,52,3.871943
2,2,53,3.862199
3,2,54,3.83003
4,2,55,3.784734


In [429]:
reader=Reader(rating_scale=(1,5))

In [430]:
from surprise.model_selection import train_test_split


surprise_data= Dataset.load_from_df(df,reader=reader)
trainset, testset = train_test_split(surprise_data,test_size=0.25)

In [431]:
model=SVDpp()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2b12b7ec0>

In [432]:
predictions=model.test(testset)
for x in predictions[:10]:
    print(x)

user: 255        item: 4865       r_ui = 2.39   est = 2.59   {'was_impossible': False}
user: 579        item: 15         r_ui = 2.58   est = 2.39   {'was_impossible': False}
user: 1855       item: 431        r_ui = 2.52   est = 2.36   {'was_impossible': False}
user: 50         item: 220        r_ui = 2.60   est = 2.83   {'was_impossible': False}
user: 1248       item: 498        r_ui = 2.03   est = 2.59   {'was_impossible': False}
user: 951        item: 11316      r_ui = 2.81   est = 2.84   {'was_impossible': False}
user: 1666       item: 1059       r_ui = 2.98   est = 3.18   {'was_impossible': False}
user: 1955       item: 65         r_ui = 1.94   est = 2.02   {'was_impossible': False}
user: 114        item: 1369       r_ui = 2.03   est = 2.37   {'was_impossible': False}
user: 1619       item: 377        r_ui = 2.66   est = 2.95   {'was_impossible': False}


In [433]:
accuracy.rmse(predictions)

RMSE: 0.2688


0.26883490919862374

In [434]:

def get_top_n_recommendations(predictions, n=10):
    # Mapping user_id to song recommendations
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        # print(true_r,est)
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort and return top n recommendations
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [435]:
top_n_recommendations = get_top_n_recommendations(predictions, n=10)


In [436]:
for artist_id, rating in top_n_recommendations['2']:
    print(df2[df2['artistID']==artist_id]['name'] )

62    Madonna
Name: name, dtype: object
68    Café Del Mar
Name: name, dtype: object
83    Gorillaz
Name: name, dtype: object
64    Sade
Name: name, dtype: object
85    Kosheen
Name: name, dtype: object
65    Moby
Name: name, dtype: object
66    Dido
Name: name, dtype: object
81    Katie Melua
Name: name, dtype: object
53    Goldfrapp
Name: name, dtype: object
49    Hooverphonic
Name: name, dtype: object


## Content Based Filtering


In [437]:
artist_metadata= pd.read_csv(Path('hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat'),delimiter='\t')

In [438]:
artist_metadata.drop(columns=['timestamp'])

Unnamed: 0,userID,artistID,tagID
0,2,52,13
1,2,52,15
2,2,52,18
3,2,52,21
4,2,52,41
...,...,...,...
186474,2100,16437,4
186475,2100,16437,292
186476,2100,16437,2087
186477,2100,16437,2801


In [439]:
import pandas as pd


grouped = artist_metadata.groupby('artistID')['tagID'].apply(list).reset_index()

grouped['tag_string'] = grouped['tagID'].apply(lambda x: ' '.join(map(str, x)))

print(grouped[['artistID', 'tag_string']])


       artistID                                         tag_string
0             1  552 1219 139 141 2850 139 141 179 541 139 141 ...
1             2  30 127 575 61 30 30 179 575 14 233 727 179 575...
2             3                4 4 2092 3706 4117 4122 4 3706 8882
3             4  139 141 1 7561 9640 1 73 139 141 179 541 7 139...
4             5                       179 190 575 575 612 575 1097
...         ...                                                ...
12518     18737                                  79 187 3660 25 67
12519     18739                         370 18 21 73 78 79 187 121
12520     18740                                              61 86
12521     18741                                           33 11861
12522     18744  13 14 15 32 62 79 187 565 574 2525 3081 3660 3...

[12523 rows x 2 columns]


In [440]:
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
#using faiss similarity measure to find similar songs to a specific song

tfidf = TfidfVectorizer()
tag_matrix = tfidf.fit_transform(grouped['tag_string']).toarray().astype('float32')


dimension = tag_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(tag_matrix)

def get_similar_songs(artist_id, k=5):
    artist_idx = grouped.index[grouped['artistID'] == artist_id].tolist()[0]
    query_vector = tag_matrix[artist_idx:artist_idx+1]

    distances, indices = index.search(query_vector, k)

    similar_songs = grouped.iloc[indices[0]]['artistID'].values
    return similar_songs

similar_songs= get_similar_songs(artist_id=52, k=10)
print("Similar Artists:", similar_songs)



Similar Artists: [   52    54    81  4108 18198  5200  4316   238  7821  3100]


In [441]:

for artist_id in similar_songs:
    print(df2[df2['artistID']==str(artist_id)]['name'] )

47    Morcheeba
Name: name, dtype: object
49    Hooverphonic
Name: name, dtype: object
76    Portishead
Name: name, dtype: object
4027    Emancipator
Name: name, dtype: object
17164    Noon
Name: name, dtype: object
5082    Kid Loco
Name: name, dtype: object
4233    Tricky
Name: name, dtype: object
233    Massive Attack
Name: name, dtype: object
7639    Mandalay
Name: name, dtype: object
3081    Natalie Walker
Name: name, dtype: object


## Hybrid Reccomendations

In [442]:
def hybrid_recommendations(user_id, n=10):
    cf_predictions = [iid for (iid, _) in get_top_n_recommendations(predictions, n)[user_id]]
    content_recs = []
    # to find similar songs to the top10 songs recommended for the user from collaborative filtering
    for artist_id in cf_predictions:
        content_recs.extend(get_similar_songs(int(artist_id), k=5))
    # print(cf_predictions)
    # print(content_recs)
    combined_recommendations = list(set(cf_predictions + content_recs))
    
    return combined_recommendations[:n]

In [443]:
for artist_id in hybrid_recommendations('2',10):
    print(df2[df2['artistID']==str(artist_id)]['name'] )

895    Amy Winehouse
Name: name, dtype: object
9980    Şevval Sam
Name: name, dtype: object
647    Hadise
Name: name, dtype: object
7638    Anthea
Name: name, dtype: object
66    Dido
Name: name, dtype: object
4031    Lamb
Name: name, dtype: object
1548    Joss Stone
Name: name, dtype: object
149    Radiohead
Name: name, dtype: object
14993    Nancy Wilson
Name: name, dtype: object
81    Katie Melua
Name: name, dtype: object
