# 13-3. 데이터 탐색하기와 전처리

In [27]:
import pandas as pd
import os
# Limit pandas to showing at most 10 rows when displaying a frame or series.
pd.set_option('display.max_rows', 10)

### 데이터 준비

In [28]:
# Read the Last.fm 360K play-count table. The file is tab-separated and is
# read as headerless, so the column names are supplied explicitly.
col_names = ['user_id', 'artist_MBID', 'artist', 'play']
fname = "./data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"
data = pd.read_csv(fname, sep='\t', header=None, names=col_names)
data.head(2)

Unnamed: 0,user_id,artist_MBID,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099


In [3]:
# Keep only the columns that are used from here on.
using_cols = ['user_id', 'artist', 'play']
data = data.loc[:, using_cols]
data.head(2)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099


In [4]:
# Normalise artist names to lower case (e.g. 'die Ärzte' becomes 'die ärzte').
data['artist'] = data['artist'].str.lower()
data.head(2)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099


In [5]:
# Select every row whose user_id equals the user_id of the row labelled 0.
condition = data['user_id'].eq(data.loc[0, 'user_id'])
data[condition]

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
...,...,...,...
44,00000c289a1829a808ac09c00daf10bc3c4e223b,betty,135
45,00000c289a1829a808ac09c00daf10bc3c4e223b,l7,135
46,00000c289a1829a808ac09c00daf10bc3c4e223b,bif naked,134
47,00000c289a1829a808ac09c00daf10bc3c4e223b,girlschool,134


### 데이터 탐색

In [6]:
# Number of distinct user ids.
data['user_id'].nunique()

358868

In [7]:
# Number of distinct artist names.
data['artist'].nunique()

291346

In [8]:
# Popular artists: count the user_id entries recorded for each artist,
# then show the 30 artists with the highest counts.
artist_count = data.groupby('artist')['user_id'].count()
artist_count.sort_values(ascending=False).iloc[:30]

artist
radiohead                77254
the beatles              76245
coldplay                 66658
red hot chili peppers    48924
muse                     46954
                         ...  
nine inch nails          28946
sigur rós                28901
green day                28732
massive attack           28691
moby                     28232
Name: user_id, Length: 30, dtype: int64

In [9]:
# Summary statistics of how many artist entries each user has.
user_count = data.groupby('user_id')['artist'].count()
user_count.describe()

count    358868.000000
mean         48.863234
std           8.524272
min           1.000000
25%          46.000000
50%          49.000000
75%          51.000000
max         166.000000
Name: artist, dtype: float64

In [10]:
# Summary statistics of each user's median play count.
user_median = data.groupby('user_id')['play'].median()
user_median.describe()

count    358868.000000
mean        142.187676
std         213.089902
min           1.000000
25%          32.000000
50%          83.000000
75%         180.000000
max       50142.000000
Name: play, dtype: float64

In [11]:
# Hand-made listening history for a new user, 'zimin', added to the data set.
# NOTE(review): 'maroon5' has no space, while the recommendation output of this
# notebook contains 'maroon 5' — check which spelling the data set uses.
my_favorite = ['black eyed peas' , 'maroon5' ,'jason mraz' ,'coldplay' ,'beyoncé']

# Every favourite artist gets the same play count of 30.
my_playlist = pd.DataFrame({'user_id': ['zimin']*5, 'artist': my_favorite, 'play':[30]*5})

# Add the rows only when 'zimin' is not present yet, so re-running this cell
# does not duplicate them.
if not data['user_id'].eq('zimin').any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # NOTE(review): the added rows keep their own 0-4 labels, so index labels
    # are no longer unique afterwards; pass ignore_index=True if that matters.
    data = pd.concat([data, my_playlist])

data.tail(10)       # Check that the rows were added.

Unnamed: 0,user_id,artist,play
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10
17535654,"sep 20, 2008",the smiths,10
0,zimin,black eyed peas,30
1,zimin,maroon5,30
2,zimin,jason mraz,30
3,zimin,coldplay,30
4,zimin,beyoncé,30


### 모델에 활용하기 위한 전처리 (실습)

In [12]:
# Distinct user ids and artist names.
user_unique = data['user_id'].unique()
artist_unique = data['artist'].unique()
print(user_unique)

# Map every distinct value to its position in the unique array, which gives
# each user and each artist a 0-based integer index.
user_to_idx = {user: position for position, user in enumerate(user_unique)}
artist_to_idx = {artist: position for position, artist in enumerate(artist_unique)}

['00000c289a1829a808ac09c00daf10bc3c4e223b'
 '00001411dc427966b17297bf4d69e7e193135d89'
 '00004d2ac9316e22dc007ab2243d6fcb239e707d' ...
 'ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac' 'sep 20, 2008' 'zimin']


In [13]:
# Spot-check the mappings for one user and one artist.
print(user_to_idx['zimin'])
print(artist_to_idx['black eyed peas'])

358868
376


In [14]:
# Replace the raw user ids with their integer index. Values without a mapping
# come back as missing and are dropped, so a result shorter than the frame
# means some rows could not be indexed; in that case the column is left as is.
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(data):
    print("failed!")
else:
    data['user_id'] = temp_user_data

# Same procedure for the artist column.
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) != len(data):
    print('artist column indexing Fail!!')
else:
    data['artist'] = temp_artist_data

data.head(10)

Unnamed: 0,user_id,artist,play
0,0,0,2137
1,0,1,1099
2,0,2,897
3,0,3,717
4,0,4,706
5,0,5,691
6,0,6,545
7,0,7,507
8,0,8,424
9,0,9,403


# 13-4. 사용자의 명시적/암묵적 평가

In [15]:
# Share of rows with a play count below 2 (played only once) -> 0.84%
only_one = data.loc[data['play'] < 2]
one = len(only_one)
all_data = len(data)
print(f'{one},{all_data}')
print(f'Ratio of only_one over all data is {one/all_data:.2%}')

# Rule used here: having listened even once counts as a preference.
# Artists that were played many times are weighted more heavily, i.e. treated
# as a more certain preference.

147740,17535660
Ratio of only_one over all data is 0.84%


# 13-5. Matrix Factorization(MF)

In [16]:
# Preview the first 10 rows of the frame.
data.head(10)

Unnamed: 0,user_id,artist,play
0,0,0,2137
1,0,1,1099
2,0,2,897
3,0,3,717
4,0,4,706
5,0,5,691
6,0,6,545
7,0,7,507
8,0,8,424
9,0,9,403


# 13-6. CSR(Compressed Sparse Row) Matrix

In [25]:
from scipy.sparse import csr_matrix

# Matrix dimensions: number of distinct user indices by number of distinct
# artist indices.
num_user = data['user_id'].nunique()
num_artist = data['artist'].nunique()

# Build the sparse matrix from (value, (row, column)) triplets: play counts
# placed at (user_id, artist).
csr_data = csr_matrix(
    (data['play'], (data['user_id'], data['artist'])),
    shape=(num_user, num_artist),
)
csr_data

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# 13-7. MF 모델 학습하기

In [31]:
# Set the threading-related environment variables BEFORE importing the numeric
# libraries in this cell, so they are already in place when those libraries
# load their native dependencies.
# NOTE(review): if numpy was already loaded earlier in the session (pandas
# imports it), the thread limits may only take full effect after a kernel
# restart with these lines executed first — confirm.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

from implicit.als import AlternatingLeastSquares
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [35]:
# Transposed copy of csr_data (rows and columns swapped).
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<291347x358869 sparse matrix of type '<class 'numpy.int64'>'
	with 17535578 stored elements in Compressed Sparse Column format>

In [36]:
# Train the model.
# NOTE(review): the transposed matrix is passed to fit() here; which
# orientation fit() expects depends on the installed implicit release — confirm.
als_model.fit(csr_data_transpose)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [06:20<00:00, 25.36s/it]


In [38]:
# Look up the integer indices, then fetch the learned factor vectors for that
# user and that artist.
zimin = user_to_idx['zimin']
black_eyed_peas = artist_to_idx['black eyed peas']
zimin_vector = als_model.user_factors[zimin]
black_eyed_peas_vector = als_model.item_factors[black_eyed_peas]


In [39]:
# Display the factor vector of user 'zimin'.
# NOTE(review): this cell contained the undefined name `lll`, which raises
# NameError; the captured output is a factor vector, so zimin_vector is
# presumably what was displayed — confirm.
zimin_vector

array([-6.82741225e-01, -4.51749891e-01, -6.68880763e-03,  1.09809160e+00,
        1.24244237e+00,  5.46773709e-02, -1.41059473e-01, -1.22781873e+00,
       -3.36654074e-02,  1.06558251e+00, -1.41561821e-01, -2.80093104e-01,
       -2.31101409e-01, -8.85194719e-01, -1.62519626e-02,  6.09681904e-01,
       -5.68659663e-01, -3.17254007e-01, -1.45243132e+00, -6.77698374e-01,
       -5.12126684e-01, -9.90648866e-02, -1.17330027e+00,  7.97321200e-01,
        2.13634402e-01, -6.27029121e-01,  8.56349766e-01, -3.45457256e-01,
       -1.68307021e-01, -2.29514286e-01,  1.05634224e+00, -7.01098442e-01,
        3.52719814e-01,  1.07494843e+00, -2.44173944e-01, -1.02151036e-01,
        7.49237120e-01, -3.81744355e-01,  6.66681051e-01, -3.77167575e-02,
       -5.44626713e-01,  6.10720694e-01,  1.75165102e-01, -4.92711157e-01,
        3.45343977e-01, -4.29317266e-01,  4.56836581e-01, -8.75505567e-01,
        4.67610747e-01,  1.20897626e-03, -7.78588951e-01, -4.94835764e-01,
        6.50472462e-01, -

In [40]:
# Dot product of the user's and the artist's factor vectors.
zimin_vector.dot(black_eyed_peas_vector)

0.5025513

In [42]:
# Same dot product for 'queen', an artist that is not in the hand-made playlist.
queen = artist_to_idx['queen']
queen_vector = als_model.item_factors[queen]
zimin_vector.dot(queen_vector)

0.3154115

# 13-8. 비슷한 아티스트 찾기 + 유저에게 추천하기

### 비슷한 아티스트 찾기

In [43]:
# Ask the model for the 15 items most similar to 'coldplay'.
favorite_artist = 'coldplay'
artist_id = artist_to_idx[favorite_artist]
similar_artist = als_model.similar_items(artist_id,N=15)
similar_artist

[(62, 1.0000001),
 (28, 0.98468745),
 (277, 0.9846277),
 (5, 0.9732633),
 (490, 0.9668596),
 (473, 0.9630841),
 (217, 0.9623782),
 (247, 0.95906943),
 (418, 0.95637816),
 (694, 0.94768274),
 (910, 0.9423468),
 (268, 0.94006795),
 (1018, 0.93940294),
 (55, 0.9393958),
 (531, 0.93695796)]

In [44]:
# Invert the name -> index mapping so indices can be turned back into names,
# then translate the first element of every result entry.
idx_to_artist = {index: name for name, index in artist_to_idx.items()}
[idx_to_artist[entry[0]] for entry in similar_artist]

['coldplay',
 'the killers',
 'muse',
 'red hot chili peppers',
 'oasis',
 'placebo',
 'radiohead',
 'the beatles',
 'u2',
 'foo fighters',
 'nirvana',
 'pink floyd',
 'the smashing pumpkins',
 'arctic monkeys',
 'depeche mode']

In [45]:
# Wrap the similar-artist lookup in a function.
def get_similar_artist(artist_name: str, n: int = 15):
    """Return the names of the artists the model rates most similar.

    Args:
        artist_name: Artist to look up. It is lower-cased before the lookup
            because the artist names in the data were lower-cased before
            ``artist_to_idx`` was built.
        n: Number of results requested from the model. Defaults to 15, the
            value that used to be hard-coded.

    Returns:
        A list of artist names, in the order returned by
        ``als_model.similar_items``. In the outputs captured in this notebook
        the queried artist itself comes first.

    Raises:
        KeyError: If the lower-cased name is not a key of ``artist_to_idx``.
    """
    artist_id = artist_to_idx[artist_name.lower()]
    similar_artist = als_model.similar_items(artist_id, N=n)
    # Each entry's first element is the artist index; map it back to a name.
    return [idx_to_artist[i[0]] for i in similar_artist]

In [46]:
# Try the helper with another artist.
get_similar_artist('2pac')

['2pac',
 'dr. dre',
 'notorious b.i.g.',
 'nas',
 '50 cent',
 'the game',
 'snoop dogg',
 'jay-z',
 'busta rhymes',
 'gang starr',
 'bone thugs-n-harmony',
 'dmx',
 'common',
 'mobb deep',
 'lil wayne']

### 유저에게 아티스트 추천하기

In [47]:
# Request 20 recommendations for 'zimin', filtering out already-liked items.
user = user_to_idx['zimin']
artist_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
artist_recommended

[(350, 0.4623099),
 (550, 0.44580868),
 (1800, 0.4442072),
 (627, 0.44298732),
 (369, 0.4426257),
 (2249, 0.43213886),
 (354, 0.42232025),
 (274, 0.40859118),
 (564, 0.4058333),
 (355, 0.40429854),
 (901, 0.38799876),
 (5555, 0.38627428),
 (391, 0.38360944),
 (409, 0.37078768),
 (618, 0.35498422),
 (724, 0.3529605),
 (24, 0.34990603),
 (637, 0.34600568),
 (418, 0.3455781),
 (247, 0.3453387)]

In [48]:
# Translate the first element (artist index) of every recommendation into a name.
recommended_artist = [idx_to_artist[entry[0]] for entry in artist_recommended]
recommended_artist

['rihanna',
 'britney spears',
 'lady gaga',
 'maroon 5',
 'justin timberlake',
 'katy perry',
 'nelly furtado',
 'michael jackson',
 'kanye west',
 'madonna',
 'pink',
 'timbaland',
 'christina aguilera',
 'amy winehouse',
 'the pussycat dolls',
 'lily allen',
 'jack johnson',
 'akon',
 'u2',
 'the beatles']

In [49]:
# Ask the model to explain the 'rihanna' recommendation for this user.
# NOTE(review): the next cell reads element [1] of the result as
# (artist index, score) pairs — confirm against the installed implicit release.
rihanna = artist_to_idx['rihanna']
explain = als_model.explain(user,csr_data,itemid=rihanna)

In [50]:
# Pair each contributing artist's name with the second element of its entry.
[(idx_to_artist[entry[0]], entry[1]) for entry in explain[1]]

[('beyoncé', 0.23411359444097785),
 ('black eyed peas', 0.1382050014031267),
 ('coldplay', 0.05058591236304576),
 ('jason mraz', 0.04188409251085925),
 ('maroon5', -0.0001019827813230946)]