# AutoEncoder Basic: Model 5

> model_5
- 앞의 총 4개의 모델 중 히든 레이어를 추가하지 않은 버전이 가장 성능이 좋았으므로 model_1에서 파인튜닝 실시
- input_dim = 24666 #len(mfl_col)
- hidden_dim = 64
- dropout = 0.2
- BatchNormalization(input_shape=(hidden_dim,), trainable=False)
- LeakyReLU(alpha=0.01)
- epoch = 10
- batch_size = 64
- loss_function = binary_crossentropy
- learning_rate = 0.0005
- optimizer = adam(learning_rate=learning_rate)

In [1]:
import pickle
import pandas as pd
import os
import io
import json
import distutils.dir_util
import numpy as np

> Data Load

In [7]:
# json write & load 함수 정의
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath(parent)
    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)
        
def load_json(fname):
    with open(fname, encoding='utf-8') as f:
        json_obj = json.load(f)

    return json_obj

In [8]:
with open('../0_data/train_onehot.pkl', 'rb') as f:
    train_onehot = pickle.load(f)

train_onehot.shape

(45824, 24666)

In [9]:
with open('../0_data/test_onehot.pkl', 'rb') as f:
    test_onehot = pickle.load(f)

test_onehot.shape

(11456, 24666)

In [10]:
with open('../0_data/mfl_col.pkl', 'rb') as f:
    mfl_col = pickle.load(f)

len(mfl_col)

24666

In [11]:
song_len = 22798
# song = 22798, tag = 1868

In [12]:
test = pd.read_json('../0_data/test.json')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tags          11456 non-null  object
 1   id            11456 non-null  int64 
 2   plylst_title  11456 non-null  object
 3   songs         11456 non-null  object
 4   like_cnt      11456 non-null  int64 
 5   updt_date     11456 non-null  object
dtypes: int64(2), object(4)
memory usage: 537.1+ KB


> Modeling

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [15]:
# model_5
input_dim = 24666 #len(mfl_col)
hidden_dim = 64
dropout_rate = 0.2

inputs = Input(shape=(input_dim,))
encoded = Dropout(0.2)(inputs)
encoded = Dense(hidden_dim)(encoded)
encoded = BatchNormalization(input_shape=(hidden_dim,), trainable=False)(encoded)
encoded = LeakyReLU(alpha=0.01)(encoded)

decoded = Dense(input_dim,activation='sigmoid')(encoded)
model_5 = Model(inputs, decoded, name='autoencoder')
model_5.summary()

Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 24666)]           0         
                                                                 
 dropout_3 (Dropout)         (None, 24666)             0         
                                                                 
 dense_6 (Dense)             (None, 64)                1578688   
                                                                 
 batch_normalization_3 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 24666)             1603290   
                                                       

In [16]:
learning_rate = 0.0005
optimizer = Adam(learning_rate=learning_rate)
model_5.compile(optimizer=optimizer,loss='binary_crossentropy')

In [17]:
model_5.fit(train_onehot, train_onehot, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1cff3065570>

In [18]:
model_5.save('results/model_5.h5')

In [19]:
model_5 = tf.keras.models.load_model('results/model_5.h5')

In [20]:
predict_plist=model_5.predict(test_onehot)



In [21]:
df_id = list(test['id'])
col= mfl_col
ori_song = col[:song_len]
ori_tag = col[song_len:]

song_predict = predict_plist[:,:song_len] # song output(추천곡)
tag_predict = predict_plist[:,song_len:] # tag output(추천태그)

In [22]:
result=[]
n=0
for i in df_id:
    dic={}
    dic['id']=i

    plist_song=song_predict[n].argsort()[-100:] # predict한 song output 중 상위 100개
    p_song=[]
    for song in plist_song:
        p_song.append(ori_song[song])
    dic['songs']=p_song

    plist_tag=tag_predict[n].argsort()[-10:] # predict한 tag output 중 상위 10개
    p_tag=[]
    for tag in plist_tag:
        p_tag.append(ori_tag[tag])
    dic['tags']=p_tag
    n+=1
    result.append(dic)

In [23]:
write_json(result,'results/result_model_5.json')

---

# 평가

In [24]:
class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)
        if len(gt)>100:
            gt = gt[:100]
        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])
        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]
        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate_with_save(self, gt_fname, rec_fname, model_file_path, default_file_path):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        with open(f'{default_file_path}/results.txt','a') as f:
            f.write(model_file_path)
            f.write(f"\nMusic nDCG: {music_ndcg:.6}\n")
            f.write(f"Tag nDCG: {tag_ndcg:.6}\n")
            f.write(f"Score: {score:.6}\n\n")
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        # except Exception as e:
        #     print(e)

    def evaluate(self, gt_fname, rec_fname):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        print(f"Music nDCG: {music_ndcg:.6}")
        print(f"Tag nDCG: {tag_ndcg:.6}")
        print(f"Score: {score:.6}")

In [25]:
gt_fname = '../0_data/test.json'
rec_fname = 'results/result_model_5.json'
arena_evaluator = ArenaEvaluator()
arena_evaluator.evaluate(gt_fname, rec_fname)

Music nDCG: 0.0558455
Tag nDCG: 0.239343
Score: 0.08337
