# 협업 필터링 (Collaborative filtering)

In [55]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):

    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("../arena_data/" + parent)
    with io.open("../arena_data/" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))



Custom evaluating (weak)

In [56]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


# if __name__ == "__main__":
#     fire.Fire(ArenaEvaluator)


In [57]:
from collections import Counter

import numpy as np
import pandas as pd

import scipy.sparse as spr
import pickle

In [58]:
path = '/Volumes/Seagate Backup Plus Drive/KIE/dataset/melon/'
song_meta = pd.read_json(path + "song_meta.json")
train = pd.read_json(path + "train.json")
test = pd.read_json(path + "val.json")

playlist, song, tag의 id(각각 nid, sid, tid)를 새로 생성하는 이유는, 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문입니다.

- plylst_id_nid : playlist id -> nid
- plylst_nid_id : playlist nid -> id
- song_id_sid : song id -> sid
- song_sid_id : song sid -> id
- tag_id_tid : tag id -> tid
- tag_tid_id : tag tid -> id
- song_dict : song id -> count
- tag_dict : tag id -> count

In [59]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [60]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4


In [61]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
  tag_id_tid[t] = i
  tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
  song_id_sid[t] = i
  song_sid_id[i] = t

n_songs = len(song_dict)

plylst의 songs와 tags를 새로운 id로 변환하여 DataFrame에 추가합니다

In [62]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

In [63]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,songs_id,tags_id
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0]
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]"
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]"
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]"
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15]


In [64]:
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

In [65]:
plylst_use.head()

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],19,1
1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]",42,2
2,1,2017-08-28 07:09:34.000,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]",28,2
3,1,2019-12-05 15:15:18.000,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]",38,10
4,1,2011-10-25 13:54:56.000,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15],53,1


In [66]:
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

test set에서 샘플 300개만 뽑아 테스트해봅니다.

In [76]:
# sample test
np.random.seed(33)
n_sample = 300

test = plylst_test.iloc[np.random.choice(range(n_test), n_sample, replace=False),:]

# real test
test = plylst_test
print(len(test))

23015


In [77]:
test

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
115071,0,2019-05-27 14:14:33.000,"[82770, 73350, 49850, 283466, 38811, 14654, 83...",[],27,0
115072,0,2014-07-16 15:24:24.000,[],[],0,0
115073,0,2008-06-21 23:26:22.000,"[42084, 86991, 615142, 615143, 66432, 191918, ...",[],14,0
115074,0,2017-10-30 18:15:43.000,"[19289, 156274, 92524, 5729, 9179, 4694, 3233,...",[],17,0
115075,0,2017-02-07 11:40:42.000,"[72186, 47442, 47461, 24939, 209259, 81164, 24...",[],8,0
...,...,...,...,...,...,...
138081,0,2015-12-17 14:06:05.000,"[5607, 1025, 9650, 543806, 1424, 7372, 2234, 2...",[4],48,1
138082,0,2020-04-16 21:35:44.000,"[638333, 244876, 108022, 420983, 20258, 595078...","[11913, 335, 3162, 455, 23086]",100,5
138083,0,2019-03-27 15:27:40.000,"[1435, 718, 2659, 2773, 1359, 8731, 696, 697, ...",[],12,0
138084,0,2015-11-18 11:49:09.000,"[3091, 308295, 428975, 80278, 35027, 234993, 8...",[],9,0


row가 playlist(nid)이고 column이 item(sid or tid)인 sparse matrix A를 만듭니다.

In [78]:
row = np.repeat(range(n_train), plylst_train['num_songs'])
col = [song for songs in plylst_train['songs_id'] for song in songs]
dat = np.repeat(1, plylst_train['num_songs'].sum())
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

In [79]:
train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T = train_tags_A.T.tocsr()

In [80]:
train_songs_A

<115071x638336 sparse matrix of type '<class 'numpy.int64'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [193]:
from tqdm import tqdm
import os, sys
sys.path.append('..')

from arena_util import most_popular
from arena_util import remove_seen

train = load_json(path + 'train.json')
_, song_mp = most_popular(train, "songs", 100)

def rec(pids):
  tt = 1

  res = []

  for pid in tqdm(pids):
    p = np.zeros((n_songs,1))
    p[test.loc[pid,'songs_id']] = 1

    val = train_songs_A.dot(p).reshape(-1)

    songs_already = test.loc[pid, "songs_id"]
    tags_already = test.loc[pid, "tags_id"]

    cand_song = train_songs_A_T.dot(val)
    cand_song_idx = cand_song.reshape(-1).argsort()[-150:][::-1]

    cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]
    rec_song_idx += remove_seen(rec_song_idx + songs_already, song_mp)
    rec_song_idx = rec_song_idx[:100]

    cand_tag = train_tags_A_T.dot(val)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]

    cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })
    
    if tt % 1000 == 0:
      print(tt)

    tt += 1
  return res

In [194]:
answers = rec(test.index)

  4%|▍         | 1002/23015 [01:36<34:24, 10.66it/s]1000
  9%|▊         | 2002/23015 [03:15<33:20, 10.50it/s]2000
 13%|█▎        | 3002/23015 [04:48<31:12, 10.69it/s]3000
 17%|█▋        | 4000/23015 [06:24<31:02, 10.21it/s]4000
 22%|██▏       | 5001/23015 [07:57<27:05, 11.08it/s]5000
 26%|██▌       | 6001/23015 [09:33<27:03, 10.48it/s]6000
 30%|███       | 7002/23015 [11:08<24:21, 10.96it/s]7000
 35%|███▍      | 8002/23015 [12:45<23:16, 10.75it/s]8000
 39%|███▉      | 9002/23015 [14:20<22:03, 10.59it/s]9000
 43%|████▎     | 10001/23015 [15:54<21:36, 10.04it/s]10000
 48%|████▊     | 11002/23015 [17:30<19:11, 10.43it/s]11000
 52%|█████▏    | 12001/23015 [19:07<17:37, 10.41it/s]12000
 56%|█████▋    | 13001/23015 [20:40<19:10,  8.71it/s]13000
 61%|██████    | 14000/23015 [22:16<14:56, 10.05it/s]14000
 65%|██████▌   | 15002/23015 [23:54<13:24,  9.95it/s]15000
 70%|██████▉   | 16001/23015 [25:32<10:34, 11.05it/s]16000
 74%|███████▍  | 17002/23015 [27:04<09:07, 10.98it/s]17000
 78%|███████▊  

In [195]:

write_json(answers, "results/results.json")

In [128]:
tmp = answers

In [141]:
for idx, answer in enumerate(tmp):
    answer = answer['songs'] + song_mp
    tmp[idx]['songs'] = answer[:100]

TypeError: list indices must be integers or slices, not str

In [135]:
write_json(answers, "results/results.json")

In [54]:
evaluator = CustomEvaluator()
evaluator.evaluate("../arena_data/answers/val.json", "../arena_data/results/results.json")

28228


In [154]:
a = pd.read_json("../arena_data/results/tmp_results.json")

In [184]:
arr = []
for i in range(len(a)):
    tmp = a.loc[i]['songs'][0]
    a.loc[i]['songs'] = list(set(tmp))

    

In [189]:
a['songs'] = a['songs'].apply(lambda x : list(set(x[0]))[:100])

In [192]:
a.to_json("../arena_data/results/results.json", orient='records')

In [191]:
print(len(a.loc[0]['songs']))

100


In [190]:
a

Unnamed: 0,id,songs,tags
0,118598,"[585728, 169984, 422915, 21512, 476680, 680970...","[OST, 기분전환, 디즈니, 애니메이션, 영화, 팝, 겨울, 추억, 휴식, 힐링]"
1,131447,"[169984, 422915, 434694, 341513, 639501, 46615...","[지미맥길, 장르음악, r_lum_r, honne, 네오라인, NAO, 국힙알엔비,..."
2,51464,"[585216, 442368, 169984, 422915, 687110, 47258...","[발라드, 추억, 회상, 이별, 슬픔, 설렘, 사랑, 잔잔한, 싸이월드, 가을]"
3,45144,"[169984, 683520, 422915, 118788, 628232, 34151...","[발라드, 감성, 이별, 기분전환, 사랑, 카페, 드라이브, 잔잔한, 새벽, 휴식]"
4,79929,"[412672, 169984, 642051, 494083, 342021, 66919...","[CCM, 찬양, 은혜, 사랑, 예배, 국내ccm, 은혜로운, 위로, 찬송가, 교회]"
...,...,...,...
23010,101722,"[169984, 422915, 341513, 133143, 261659, 39682...","[발라드, 기분전환, 감성, 사랑, 이별, 새벽, 휴식, 밤, 추억, 드라이브]"
23011,122127,"[169984, 422915, 413189, 341513, 549392, 13314...","[힐링, 휴식, 기분전환, 감성, 잔잔한, 발라드, 새벽, 사랑, 밤, 추억]"
23012,77438,"[169984, 422915, 380423, 476680, 341513, 67943...","[기분전환, 팝, 드라이브, 휴식, 힐링, 감성, Pop, 잔잔한, 팝송, 새벽]"
23013,36231,"[169984, 422915, 166405, 525830, 269830, 34151...","[클래식, 휴식, 힐링, 잔잔한, 팝, 추억, 기분전환, 올드팝, 감성, 피아노]"
