In [5]:
import pandas as pd
import numpy as np

artist_data = pd.read_csv('datasets/ydata-ymusic-user-artist-rating.csv')
artist_name_data = pd.read_csv('datasets/ydata-ymusic-artist-names.csv')


In [6]:
#Data preprocessing
def data_preprocessing(artist_data):
    
    #to show popular artist
    avg_rating = pd.DataFrame(artist_data.groupby('artist_id')['rating'].mean().reset_index())
    avg_rating.sort_values('rating', ascending = False).head(20)
    
    #creating pivot table
    user_rating = pd.pivot_table(data=artist_data, values='rating', index='user_id', columns='artist_id')
    
    return user_rating

In [7]:
def data_cleaning(recommended_artist):
    
     #data cleaning and sorting
    recommended_artist_corr = pd.DataFrame(recommended_artist, columns=['Correlation'])
    recommended_artist_corr.dropna(inplace=True)
    recommended_artist_corr = recommended_artist_corr.sort_values('Correlation', ascending=False).reset_index()
    
    return recommended_artist_corr

In [11]:
def get_top_n_artist(recommended_artist_corr, n):
    
    #get top n artist list and artist names
    artist_list = recommended_artist_corr.head(n)['artist_id'].tolist()
    artist_names = artist_name_data[artist_name_data['artist_id'].isin(artist_list)]
    
    return artist_names

In [12]:
def collaborative_filtering(artist_data, artist_id, user_rating, n):
    
    #calling data preprocessing
    user_rating = data_preprocessing(artist_data)
    
    #artist_id 
    #see artists rated by user
    artist_user_rating = user_rating[artist_id]
    
    #applying correlation for the entire user rating dataframe
    #Collaborative Filtering Song Recommendation
    #Pearson Correlation
    recommended_artist = user_rating.corrwith(artist_user_rating)
    
    #data cleaning and sorting
    recommended_artist_corr = data_cleaning(recommended_artist)
    
    #get top n artist list and artist names
    artist_names = get_top_n_artist(recommended_artist_corr, n)
    
    return artist_names

In [13]:
recommended_artist_name = collaborative_filtering(artist_data,1058037, user_rating, 5)
recommended_artist_name

Unnamed: 0,artist_id,artist
15486,1015911,Lords Of Acid
17287,1017742,Melvins
34356,1035342,Borknagar
39172,1040238,Mindless Self Indulgence
39633,1040701,Speak No Evil


In [3]:
#preparing data table for analysis
user_rating = pd.pivot_table(data=artist_data, values='rating', index='user_id', columns='artist_id')
user_rating.head()

artist_id,24538,1000004,1000006,1000010,1000012,1000015,1000016,1000018,1000021,1000023,...,1101383,1101399,1101400,1101401,1101416,1101471,1101613,1101630,1101671,1101719
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [2]:
#calculating mean for every artist
avg_rating = pd.DataFrame(artist_data.groupby('artist_id')['rating'].mean().reset_index())
avg_rating.columns
avg_rating

Unnamed: 0,artist_id,rating
0,24538,90.000000
1,1000004,44.731183
2,1000006,56.207692
3,1000010,52.000000
4,1000012,43.884956
...,...,...
14153,1101471,5.555556
14154,1101613,90.000000
14155,1101630,7.454545
14156,1101671,68.292683


In [4]:
avg_rating.sort_values('rating', ascending = False).head(20)

Unnamed: 0,artist_id,rating
1563,1005698,255.0
11833,1068512,255.0
4654,1017725,255.0
9348,1038538,255.0
11596,1060726,255.0
343,1001221,255.0
11930,1072678,255.0
13883,1099694,255.0
11567,1059589,255.0
14079,1100722,255.0


In [5]:
#artist_id = 1005698
#see artist 1058037 rated by user
artist = 1058037
artist_user_rating = user_rating[artist]
artist_user_rating.head()

user_id
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: 1058037, dtype: float64

In [6]:
#applying correlation for the entire user rating dataframe
#Collaborative Filtering Song Recommendation
#Pearson Correlation
recommended_artist = user_rating.corrwith(artist_user_rating)
recommended_artist

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


artist_id
24538     NaN
1000004   NaN
1000006   NaN
1000010   NaN
1000012   NaN
           ..
1101471   NaN
1101613   NaN
1101630   NaN
1101671   NaN
1101719   NaN
Length: 14158, dtype: float64

In [7]:
#removing nan values
recommended_artist_corr = pd.DataFrame(recommended_artist, columns=['Correlation'])
recommended_artist_corr.dropna(inplace=True)

In [8]:
recommended_artist_corr.reset_index()

Unnamed: 0,artist_id,Correlation
0,1000264,0.469095
1,1000335,1.000000
2,1000362,0.775796
3,1000656,1.000000
4,1000903,1.000000
...,...,...
183,1098588,1.000000
184,1098640,1.000000
185,1098924,1.000000
186,1100041,1.000000


In [9]:
#sort the recommended artist correlation
recommended_artist_corr = recommended_artist_corr.sort_values('Correlation', ascending=False).reset_index()
recommended_artist_corr

Unnamed: 0,artist_id,Correlation
0,1017742,1.0
1,1035342,1.0
2,1015911,1.0
3,1040701,1.0
4,1040238,1.0
...,...,...
183,1021912,-1.0
184,1021815,-1.0
185,1049360,-1.0
186,1019576,-1.0


In [10]:
print(type(recommended_artist_corr.head(5)))
artist_list = recommended_artist_corr.head(5)['artist_id'].tolist()
artist_list

<class 'pandas.core.frame.DataFrame'>


[1017742, 1035342, 1015911, 1040701, 1040238]

In [None]:
# TODO:
#filtering still need to be done
# print artist names
# use same concept for user/song/rating
# refactor and cleanup

In [11]:
artist_names = artist_name_data[artist_name_data['artist_id'].isin(artist_list)]
artist_names

Unnamed: 0,artist_id,artist
15486,1015911,Lords Of Acid
17287,1017742,Melvins
34356,1035342,Borknagar
39172,1040238,Mindless Self Indulgence
39633,1040701,Speak No Evil
