## <center> Artist Recommendation using Collaborative Filtering
By: Pranay Singla

In [1]:
# Loading libraries
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
pd.options.mode.chained_assignment = None

#### Data Cleaning and Wrangling

In [2]:
# Load the Last.fm listening records; the column names are supplied here via `names`
listen_columns = ['user','artist_id','artist','plays']
df1 = pd.read_csv('usersha1-artmbid-artname-plays.tsv', sep='\t', names=listen_columns)

In [3]:
df1.head()

Unnamed: 0,user,artist_id,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [4]:
# Load the user profile table; the column names are supplied here via `names`
profile_columns = ['user','gender','age','country','signup']
user_profile = pd.read_csv('usersha1-profile.tsv', sep='\t', names=profile_columns)

In [5]:
user_profile

Unnamed: 0,user,gender,age,country,signup
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"
...,...,...,...,...,...
359342,fffe7823f67b433b45f22056467db921c1d3d7d0,m,25.0,Germany,"Jun 24, 2006"
359343,fffe8637bd8234309e871409c7ebef99a720afc1,m,25.0,Brazil,"Sep 9, 2007"
359344,fffe8c7f952d9b960a56ed4dcb40a415d924b224,m,20.0,United States,"Aug 8, 2007"
359345,ffff9af9ae04d263dae91cb838b1f3a6725f5ffb,m,20.0,Russian Federation,"Dec 3, 2005"


In [6]:
# Keep only the profiles whose country is the United States or India
in_target_country = user_profile['country'].isin(['United States','India'])
user_ind_us = user_profile[in_target_country]

In [7]:
# Attach profile fields to each listening record; the inner join keeps only users present in user_ind_us
full_data = df1.merge(user_ind_us, how='inner', on='user')

In [8]:
# Artist labels treated as "unknown"; rows carrying one of these labels
# are filtered out of full_data in the following cell
unknown_artist = ['unknown','[unknown]']

In [9]:
# Discard every record whose artist label is one of the "unknown" placeholders
is_unknown = full_data['artist'].isin(unknown_artist)
full_data = full_data[~is_unknown]

In [10]:
full_data.head()

Unnamed: 0,user,artist_id,artist,plays,gender,age,country,signup
0,00007a47085b9aab8af55f52ec8846ac479ac4fe,0110e63e-0a9b-4818-af8e-41e180c20b9a,devendra banhart,456,m,28.0,United States,"Jan 27, 2006"
1,00007a47085b9aab8af55f52ec8846ac479ac4fe,69158f97-4c07-4c4e-baf8-4e4ab1ed666e,boards of canada,407,m,28.0,United States,"Jan 27, 2006"
2,00007a47085b9aab8af55f52ec8846ac479ac4fe,e162b2eb-6a42-4240-8c1b-c94d9a0acb73,cocorosie,386,m,28.0,United States,"Jan 27, 2006"
3,00007a47085b9aab8af55f52ec8846ac479ac4fe,f22942a1-6f70-4f48-866e-238cb2308fbd,aphex twin,213,m,28.0,United States,"Jan 27, 2006"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,0c751690-c784-4a4f-b1e4-c1de27d47581,animal collective,203,m,28.0,United States,"Jan 27, 2006"


In [11]:
# Keep only the columns needed for the user-artist matrix. An explicit copy is taken
# because the following cells modify edit_df; without it those writes target a slice
# of full_data and only run quietly because chained-assignment warnings are disabled.
edit_df = full_data[['user','artist','plays']].copy()

In [12]:
# Drop every row that has a missing value in any of its columns
edit_df = edit_df.dropna()

In [13]:
# Normalise artist names to lower case so they can be matched against Pranay's data
edit_df = edit_df.assign(artist=edit_df['artist'].str.lower())

In [14]:
# Count of distinct artist names remaining after cleaning
edit_df['artist'].nunique()

115409

In [15]:
# Load Pranay's Last.fm export; the column names are supplied here via `names`
scrobble_columns = ['artist','album','track','time']
pranay = pd.read_csv('Pranay_Singla.csv', names=scrobble_columns)

In [16]:
# Only the artist and track columns are used from here on
pranay = pranay.loc[:, ['artist','track']]

In [17]:
# Count the rows for each (artist, track) pair; with as_index=False the count
# is returned as a regular column (selected as 'size' further down)
track_groups = pranay.groupby(['artist','track'], as_index=False)
pranay_df = track_groups.size()

In [18]:
# Lower-case artist names so they line up with the lower-cased names in edit_df
pranay_df = pranay_df.assign(artist=pranay_df['artist'].str.lower())

In [19]:
# Distinct artist names in Pranay's history (the order of a set is arbitrary)
my_artists = list(set(pranay_df['artist']))

In [20]:
# number of artists in Pranay's data
len(my_artists)

191

In [21]:
# Distinct artist names in the cleaned Last.fm data (the order of a set is arbitrary)
all_artists = list(set(edit_df['artist']))

In [23]:
# Artists from Pranay's history that never occur in the Last.fm data.
# all_artists is a list, so each `in` test against it scans the whole list;
# a set gives constant-time lookups while producing the same result in the same order.
known_artist_lookup = set(all_artists)
remove_artist = [name for name in my_artists if name not in known_artist_lookup]

In [24]:
# Pranay's artists that do occur in the Last.fm data
my_artist_new = [artist_name for artist_name in my_artists if artist_name not in remove_artist]

In [25]:
# Total plays per artist; the ranking and the top-1000 cut happen in the following cells
plays_by_artist = edit_df.groupby('artist', as_index=False)['plays']
top_1000 = plays_by_artist.sum()

In [26]:
# Rank artists from most to least played
top_1000 = top_1000.sort_values(by='plays', ascending=False)

In [27]:
# Keep the first 1000 rows of the ranking and renumber them from 0
top_1000 = top_1000.head(1000).reset_index(drop=True)

In [28]:
# Artist names of the ranking, in ranking order
top_1000_artists = top_1000['artist'].tolist()

In [29]:
# Only the first 500 names of the ranked list are carried forward
top_500_artists = top_1000_artists[:500]

In [30]:
# Final artist universe: the 500 most-played artists plus Pranay's matched artists, de-duplicated
candidate_artists = top_500_artists + my_artist_new
final_artists = list(set(candidate_artists))

In [31]:
# Restrict Pranay's (artist, track) counts to artists in the final universe
in_final_universe = pranay_df['artist'].isin(final_artists)
pranay_final = pranay_df[in_final_universe]

In [32]:
# Total plays per artist: add up the per-track counts held in the 'size' column.
# The column is selected explicitly; the previous `.sum('size')` passed the string
# positionally into `numeric_only`, which only worked because the string is truthy.
pranay_final = pranay_final.groupby('artist', as_index=False)['size'].sum()

In [33]:
# Tag every row with a fixed user id so it can be stacked with the Last.fm users
pranay_final = pranay_final.assign(user='Pranay_Singla')

In [34]:
# Put the columns in the same order as edit_df (user, artist, then the count)
pranay_final = pranay_final.loc[:, ['user','artist','size']]

In [35]:
# Relabel the three columns positionally so the count column is called 'plays'
pranay_final = pranay_final.set_axis(['user','artist','plays'], axis=1)

In [36]:
pranay_final

Unnamed: 0,user,artist,plays
0,Pranay_Singla,*nsync,2
1,Pranay_Singla,a.r. rahman,1
2,Pranay_Singla,abba,2
3,Pranay_Singla,adele,2
4,Pranay_Singla,aerosmith,2
...,...,...,...
87,Pranay_Singla,the white stripes,3
88,Pranay_Singla,the who,9
89,Pranay_Singla,tom petty and the heartbreakers,2
90,Pranay_Singla,vengaboys,2


In [38]:
# Restrict the Last.fm records to artists in the final universe
keeps_artist = edit_df['artist'].isin(final_artists)
edit_2 = edit_df[keeps_artist]

In [40]:
# Stack the Last.fm records and Pranay's records row-wise
frames_to_stack = [edit_2, pranay_final]
combined_df = pd.concat(frames_to_stack)

In [41]:
# Renumber the rows 0..n-1; drop=True discards the old index labels instead of keeping them as a column
combined_df = combined_df.reset_index(drop = True)

In [42]:
combined_df

Unnamed: 0,user,artist,plays
0,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456
1,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407
2,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386
3,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203
...,...,...,...
1390415,Pranay_Singla,the white stripes,3
1390416,Pranay_Singla,the who,9
1390417,Pranay_Singla,tom petty and the heartbreakers,2
1390418,Pranay_Singla,vengaboys,2


In [43]:
# Reshape to a user x artist matrix of plays; repeated (user, artist) pairs are
# combined with pivot_table's default aggregation (mean)
lastfm = combined_df.pivot_table(values='plays', index='user', columns='artist')

In [44]:
# A missing cell means the user has no record for that artist: store it as 0
lastfm = lastfm.fillna(0)

In [45]:
lastfm

artist,*nsync,2pac,3 doors down,30 seconds to mars,311,50 cent,a day to remember,a perfect circle,a tribe called quest,a.r. rahman,...,why?,wilco,wu-tang clan,yann tiersen,yeah yeah yeahs,yo la tengo,zz top,植松伸夫,菅野よう子,동방신기
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00007a47085b9aab8af55f52ec8846ac479ac4fe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0001a57568309b287363e72dc682e9a170ba6dc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00024b5b85c40f990c28644d53257819980bf6bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002dd2154072434d26e5409faa591bfb260a01e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00032c7933e0eb05f2258f1147ef81a90f2d4d6c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffa9294e858a7c863b5ad363c748c2330d9bd45,0.0,0.0,0.0,0.0,1228.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffa9d62caff0f038c7a35db70f109b1bba04a1d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,322.0,0.0,0.0,0.0,0.0,230.0,0.0,0.0,0.0,0.0
fffaf6f9a1a3ad8bd0dff7b48b2eb9eef030fdee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,354.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Replace play counts with a simple user-artist indicator: every value > 0 becomes 1,
# all other values are left unchanged. DataFrame.mask does this in one vectorised step
# instead of building a Python list element by element for every column.
lastfm_df = lastfm.mask(lastfm > 0, 1)

In [47]:
# Per-user count of Pranay's artists (my_artist_new) that the user has also played
shared_artist_flags = lastfm_df.loc[:, my_artist_new]
lastfm_df['my_artists_sum'] = shared_artist_flags.sum(axis='columns')

In [48]:
# Subset used for the recommendation: only users who share more than 4
# (i.e. at least 5) artists with Pranay
shares_enough_artists = lastfm_df['my_artists_sum'] > 4
lastfm_subset = lastfm_df.loc[shares_enough_artists]

In [49]:
# The helper count column has served its purpose; remove it from the subset
lastfm_subset = lastfm_subset.drop(columns=['my_artists_sum'])

In [73]:
# Remove the helper count column from the full indicator matrix as well
lastfm_df = lastfm_df.drop(columns=['my_artists_sum'])

In [50]:
lastfm_subset.to_csv('lastfm1.csv')

In [74]:
lastfm_df.to_csv('lastfm.csv')

## Collaborative Filtering

In [52]:
# Reading Saved File
lastfm_subset = pd.read_csv('lastfm1.csv')

In [54]:
# Drop the user id so only the artist indicator columns remain.
# `columns=` is used because passing the axis positionally (`drop('user', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
data_items = lastfm_subset.drop(columns='user')

In [55]:
# Euclidean length of each user's row; dividing the data by it (next cell)
# gives every user an equal contribution
squared_items = data_items ** 2
magnitude = np.sqrt(squared_items.sum(axis=1))

In [56]:
# Scale each user's row by its own magnitude (values are aligned on the row index)
data_items = data_items.div(magnitude, axis=0)

In [57]:
# calculating similarity (cosine) for collaborative filtering
def calculate_similarity(data_items):
    """Return the column-by-column cosine similarity of `data_items`.

    data_items: DataFrame whose columns are the items to compare.
    Returns a square DataFrame whose row and column labels are both
    `data_items.columns`.
    """
    item_labels = data_items.columns
    # Transpose so that every original column becomes one row vector,
    # which is what cosine_similarity compares pairwise
    item_vectors = sparse.csr_matrix(data_items).transpose()
    pairwise = cosine_similarity(item_vectors)
    return pd.DataFrame(data=pairwise, index=item_labels, columns=item_labels)

In [58]:
# Build the similarity matrix
data_matrix = calculate_similarity(data_items)

In [59]:
# top 11 similar artists for queen
print(data_matrix.loc['queen'].nlargest(11))

queen                 1.000000
the beatles           0.425528
pink floyd            0.341404
led zeppelin          0.335855
the who               0.311618
the rolling stones    0.298438
david bowie           0.297885
aerosmith             0.296539
billy joel            0.294072
ac/dc                 0.287834
elton john            0.273003
Name: queen, dtype: float64


In [60]:
# top 11 similar artists for rihanna
print(data_matrix.loc['rihanna'].nlargest(11))

rihanna               1.000000
beyoncé               0.477514
britney spears        0.456275
lady gaga             0.385008
mariah carey          0.369828
chris brown           0.364835
justin timberlake     0.355938
ne-yo                 0.341059
kelly clarkson        0.336007
christina aguilera    0.329707
kanye west            0.312174
Name: rihanna, dtype: float64


In [61]:
# top 11 similar artists for eminem
print(data_matrix.loc['eminem'].nlargest(11))

eminem              1.000000
50 cent             0.394701
2pac                0.368348
kanye west          0.363127
jay-z               0.360949
ludacris            0.354819
nas                 0.333485
t.i.                0.333454
linkin park         0.330362
the game            0.321912
notorious b.i.g.    0.316832
Name: eminem, dtype: float64


In [62]:
# top 11 similar artists for deadmau5
print(data_matrix.loc['deadmau5'].nlargest(11))

deadmau5                 1.000000
david guetta             0.170055
justice                  0.164870
daft punk                0.145766
the chemical brothers    0.121593
bastille                 0.119534
the prodigy              0.116139
röyksopp                 0.110895
infected mushroom        0.108269
dj antoine               0.102677
calvin harris            0.099177
Name: deadmau5, dtype: float64


In [64]:
# top 11 similar artists for a.r. rahman
print(data_matrix.loc['a.r. rahman'].nlargest(11))

a.r. rahman         1.000000
bryan adams         0.324203
kk                  0.286371
enrique iglesias    0.255081
rashid ali          0.255039
coldplay            0.170972
linkin park         0.166314
blue                0.163718
backstreet boys     0.135465
u2                  0.134447
pink floyd          0.124986
Name: a.r. rahman, dtype: float64


User Item Calculations

In [65]:
# For every artist, record the labels of its 10 highest-similarity entries.
# Each artist's own label is part of its similarity column, so it takes part in the ranking too.
neighbour_ranks = range(1,11)
data_neighbours = pd.DataFrame(index=data_matrix.columns, columns=neighbour_ranks)
for position in range(data_matrix.shape[1]):
    ranked = data_matrix.iloc[:, position].sort_values(ascending=False)
    data_neighbours.iloc[position, :10] = ranked.index[:10]

In [66]:
# Locate Pranay's row in the subset; the first matching index label is used
user = 'Pranay_Singla'
rows_for_user = lastfm_subset.loc[lastfm_subset['user'] == user]
user_index = rows_for_user.index.tolist()[0]

In [67]:
# Artists the user has played: the labels of the positive entries in the user's row
user_row = data_items.iloc[user_index]
known_user_likes = user_row.loc[user_row > 0].index.values

In [68]:
# Build the neighbourhood from the items most similar to the ones the user already likes.
most_similar_to_likes = data_neighbours.loc[known_user_likes]
# Flatten the neighbour table into unique artist names. sorted() is used instead of
# list(set(...)) because the iteration order of a set of strings changes between
# kernel restarts, which would change the row order of the results from run to run.
similar_list = sorted({artist for row in most_similar_to_likes.values.tolist() for artist in row})
# Square similarity sub-matrix restricted to the neighbourhood artists
neighbourhood = data_matrix.loc[similar_list, similar_list]

In [69]:
# The user's row restricted to the neighbourhood artists, in the same order as similar_list
full_user_row = data_items.iloc[user_index]
user_vector = full_user_row.loc[similar_list]

In [70]:
# Score per neighbourhood artist: similarity-weighted sum of the user's values,
# divided by that artist's total similarity within the neighbourhood
weighted_votes = neighbourhood.dot(user_vector)
similarity_totals = neighbourhood.sum(axis=1)
score = weighted_votes.div(similarity_totals)
# Artists the user already plays are not recommendations; remove them
score = score.drop(labels=known_user_likes)

In [79]:
# Heading, one blank line, then the 20 highest-scoring artists
print('Top recommendations for Pranay', end='\n\n')
top_recommendations = score.nlargest(20)
print(top_recommendations)

Top recommendations for Pranay

eric clapton                0.038022
avril lavigne               0.037608
enya                        0.037544
michael jackson             0.037474
kylie minogue               0.037427
3 doors down                0.037276
electric light orchestra    0.037193
jethro tull                 0.037085
madonna                     0.036843
black eyed peas             0.036817
van halen                   0.036734
chris brown                 0.036730
billy joel                  0.036697
ac/dc                       0.036664
ne-yo                       0.036634
beyoncé                     0.036559
elton john                  0.036256
50 cent                     0.036205
boa                         0.036194
usher                       0.036137
dtype: float64


In [77]:
data_neighbours.to_csv('similar_artist.csv')