In [1]:
import pickle
import pandas as pd
import os
import io
import json
import distutils.dir_util
import numpy as np

# 평가

> Data Load

In [2]:
# json write & load 함수 정의
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath(parent)
    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)
        
def load_json(fname):
    with open(fname, encoding='utf-8') as f:
        json_obj = json.load(f)

    return json_obj

In [4]:
with open('../0_data/mfl_col.pkl', 'rb') as f:
    mfl_col = pickle.load(f)

len(mfl_col)

24666

In [5]:
song_len = 22798
# song = 22798, tag = 1868

In [8]:
q_test = pd.read_json('../0_data/q_test.json')
q_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tags          11456 non-null  object
 1   id            11456 non-null  int64 
 2   plylst_title  11456 non-null  object
 3   songs         11456 non-null  object
 4   like_cnt      11456 non-null  int64 
 5   updt_date     11456 non-null  object
dtypes: int64(2), object(4)
memory usage: 537.1+ KB


In [9]:
q_test.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],110887,발라드 미디엄 위주의 경쾌 발라드,"[389159, 226331, 597375, 586653, 613020, 50379...",14,2016-07-07 21:20:09.000
1,"[Pop, 힐링, 기분전환, 퇴근길]",113079,지치고 고된 하루끝 퇴근시간에 듣기 좋은 POP,[],339,2020-03-23 20:03:51.000
2,[],80316,달빛 비추는 밤 나의 감성을 자극할 노래,"[413189, 47106, 317362, 63533, 422807, 342803,...",567,2019-12-02 18:15:25.000
3,[슬픔],32338,추운 겨울 아무 이유없이 땡기는 노래,[],22,2010-11-22 22:33:54.000
4,"[휴일, 취향저격DJ]",91698,홈캉스 필수템 우아한 트렌디 POP,[],67,2018-08-08 12:23:09.000


> one-hot encoding

In [7]:
# 각 플레이리스트에 해당하는 곡, 태그 좌표에 1 부여

def create_onehot(df,column_name):
    zero_matrix=np.zeros((len(df),len(column_name)))
    zero_df=pd.DataFrame(zero_matrix,columns=column_name,index=df['id'])
    for i in range(len(df)):
        for j in df.iloc[i,0]+df.iloc[i,3] :
            if j in column_name:
                zero_df.iloc[i,column_name.index(j)]=1
    return zero_df

In [10]:
# create_onehot 함수 활용 test data >> onehot encoding
q_test_onehot = create_onehot(q_test, mfl_col)
q_test_onehot.shape

(11456, 24666)

In [11]:
with open('../0_data/q_test_onehot.pkl', 'wb') as f:
    pickle.dump(q_test_onehot, f)

In [12]:
with open('../0_data/q_test_onehot.pkl', 'rb') as f:
    q_test_onehot = pickle.load(f)

q_test_onehot.shape

(11456, 24666)

> predict

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [14]:
model_7 = tf.keras.models.load_model('results/model_7.h5')

In [15]:
predict_plist=model_7.predict(q_test_onehot)



In [16]:
df_id = list(q_test['id'])
col= mfl_col
ori_song = col[:song_len]
ori_tag = col[song_len:]

song_predict = predict_plist[:,:song_len] # song output(추천곡)
tag_predict = predict_plist[:,song_len:] # tag output(추천태그)

In [17]:
result=[]
n=0
for i in df_id:
    dic={}
    dic['id']=i

    plist_song=song_predict[n].argsort()[-100:] # predict한 song output 중 상위 100개
    p_song=[]
    for song in plist_song:
        p_song.append(ori_song[song])
    dic['songs']=p_song

    plist_tag=tag_predict[n].argsort()[-10:] # predict한 tag output 중 상위 10개
    p_tag=[]
    for tag in plist_tag:
        p_tag.append(ori_tag[tag])
    dic['tags']=p_tag
    n+=1
    result.append(dic)

In [18]:
write_json(result,'results/result_model_7_q.json')

---

# 평가

In [19]:
class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)
        if len(gt)>100:
            gt = gt[:100]
        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])
        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]
        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate_with_save(self, gt_fname, rec_fname, model_file_path, default_file_path):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        with open(f'{default_file_path}/results.txt','a') as f:
            f.write(model_file_path)
            f.write(f"\nMusic nDCG: {music_ndcg:.6}\n")
            f.write(f"Tag nDCG: {tag_ndcg:.6}\n")
            f.write(f"Score: {score:.6}\n\n")
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        # except Exception as e:
        #     print(e)

    def evaluate(self, gt_fname, rec_fname):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        print(f"Music nDCG: {music_ndcg:.6}")
        print(f"Tag nDCG: {tag_ndcg:.6}")
        print(f"Score: {score:.6}")

In [20]:
gt_fname = '../0_data/a_test.json'
rec_fname = 'results/result_model_7_q.json'
arena_evaluator = ArenaEvaluator()
arena_evaluator.evaluate(gt_fname, rec_fname)

Music nDCG: 0.0685702
Tag nDCG: 0.20947
Score: 0.0897052


> rerank 후 평가

In [22]:
with open('../0_data/count_song.pkl', 'rb') as f:
    count_song = pickle.load(f)

len(count_song)

44674

In [21]:
with open('../0_data/count_tag.pkl', 'rb') as f:
    count_tag = pickle.load(f)

len(count_tag)

3400

In [25]:
results = pd.read_json('./results/result_model_7_q.json', typ = 'frame', encoding='utf-8')
results.head()

Unnamed: 0,id,songs,tags
0,110887,"[498452, 253755, 610933, 461476, 216696, 17391...","[가을, 휴식, 힐링, 드라이브, 아이돌, 잔잔한, 밤, 매장음악, 발라드, 기분전환]"
1,113079,"[422091, 191915, 512148, 40937, 349727, 263588...","[추억, 잔잔한, 명곡, 신나는, 팝송, 감성, Pop, 휴식, 힐링, 기분전환]"
2,80316,"[305045, 339004, 34548, 173912, 24016, 185174,...","[겨울, 댄스, 잔잔한, 가을, 드라이브, 새벽, 여름, 기분전환, 밤, 발라드]"
3,32338,"[465905, 351342, 164371, 643070, 314149, 60838...","[드라이브, 댄스, 밤, 인디, 감성, 사랑, 발라드, 비오는날, 이별, 슬픔]"
4,91698,"[205910, 449244, 484234, 610933, 231334, 21343...","[드라이브, 인디, 일렉, 비오는날, 락, 매장음악, 댄스, 팝, 발라드, 기분전환]"


In [26]:
pl_ids = results['id']
p_songs = results['songs']
p_tags = results['tags']

len(pl_ids), len(p_songs), len(p_songs)

(11456, 11456, 11456)

In [27]:
p_songs_reranked = []
for p_song in p_songs :
    score = []
    for song in p_song :
        score.append(count_song[song])
    p_song_reranked = np.array(p_song)[np.array(score).argsort()].tolist()
    p_songs_reranked.append(p_song_reranked)
len(p_songs_reranked)

11456

In [28]:
p_tags_reranked = []
for p_tag in p_tags :
    score = []
    for tag in p_tag :
        score.append(count_tag[tag])
    p_tag_reranked = np.array(p_tag)[np.array(score).argsort()].tolist()
    p_tags_reranked.append(p_tag_reranked)
len(p_tags_reranked)

11456

In [29]:
result=[]
for i in range(len(pl_ids)) :
    dic={}
    dic['id']=pl_ids[i]
    dic['songs']=p_songs_reranked[i]
    dic['tags']=p_tags_reranked[i]
    result.append(dic)

In [32]:
write_json(result,'./results/result_model_7_q_rrk.json')

In [33]:
result = load_json('./results/result_model_7_q_rrk.json')

In [34]:
gt_fname = '../0_data/a_test.json'
rec_fname = 'results/result_model_7_q_rrk.json'
arena_evaluator = ArenaEvaluator()
arena_evaluator.evaluate(gt_fname, rec_fname)

Music nDCG: 0.0794174
Tag nDCG: 0.232161
Score: 0.102329
