<a href="https://colab.research.google.com/github/s2ul2/melon_playlist_rec/blob/main/collabo_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/비타민/복습프로젝트

/content/drive/MyDrive/비타민/복습프로젝트


# 협업 필터링 (Collaborative filtering)

In [3]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))



Custom evaluating (weak)

In [4]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


# if __name__ == "__main__":
#     fire.Fire(ArenaEvaluator)


In [5]:
from collections import Counter

import numpy as np
import pandas as pd

import scipy.sparse as spr
import pickle

In [6]:
song_meta = pd.read_json("song_meta.json")
train = pd.read_json("train.json")
test = pd.read_json("val.json")

playlist, song, tag의 id(각각 nid, sid, tid)를 새로 생성하는 이유는, 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문입니다.

- plylst_id_nid : playlist id -> nid
- plylst_nid_id : playlist nid -> id
- song_id_sid : song id -> sid
- song_sid_id : song sid -> id
- tag_id_tid : tag id -> tid
- tag_tid_id : tag tid -> id
- song_dict : song id -> count
- tag_dict : tag id -> count

In [7]:
train['istrain'] = 1
test['istrain'] = 0

In [8]:
n_train = len(train)
n_test = len(test)

In [9]:
n_test

23015

In [10]:
# train + test
plylst = pd.concat([train, test], ignore_index=True)

In [11]:
plylst.shape

(138086, 7)

## 플레이리스트 아이디

In [12]:
# playlist id
plylst["nid"] = range(n_train + n_test)

In [13]:
plylst.tail(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid
138083,[],77438,,"[625875, 464051, 11657, 236393, 358186, 213435...",0,2019-03-27 15:27:40.000,0,138083
138084,[],36231,,"[161094, 665833, 688145, 432735, 439938, 12665...",31,2015-11-18 11:49:09.000,0,138084
138085,[],65189,,"[643070, 132994, 98223, 293236, 513129, 650494...",19,2017-04-23 16:50:58.000,0,138085


In [14]:
# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [15]:
plylst_id_nid

{61281: 0,
 10532: 1,
 76951: 2,
 147456: 3,
 27616: 4,
 69252: 5,
 45339: 6,
 36557: 7,
 70741: 8,
 10288: 9,
 31804: 10,
 151693: 11,
 112060: 12,
 89809: 13,
 131092: 14,
 8316: 15,
 1516: 16,
 5348: 17,
 75342: 18,
 88143: 19,
 24126: 20,
 12460: 21,
 100613: 22,
 62696: 23,
 122843: 24,
 6143: 25,
 136677: 26,
 144318: 27,
 93836: 28,
 106145: 29,
 24779: 30,
 93275: 31,
 144591: 32,
 1239: 33,
 53737: 34,
 125194: 35,
 107911: 36,
 135100: 37,
 74655: 38,
 43665: 39,
 35178: 40,
 105328: 41,
 131098: 42,
 109678: 43,
 69903: 44,
 14674: 45,
 46905: 46,
 92257: 47,
 61024: 48,
 82383: 49,
 111617: 50,
 63703: 51,
 28153: 52,
 13100: 53,
 134542: 54,
 137737: 55,
 116573: 56,
 67565: 57,
 16037: 58,
 137153: 59,
 46684: 60,
 33019: 61,
 53375: 62,
 124665: 63,
 45163: 64,
 118405: 65,
 67808: 66,
 139038: 67,
 57360: 68,
 26774: 69,
 12956: 70,
 149384: 71,
 131376: 72,
 132680: 73,
 127671: 74,
 141483: 75,
 97458: 76,
 86409: 77,
 23075: 78,
 140397: 79,
 94227: 80,
 140918: 81,


In [16]:
plylst_nid_id

{0: 61281,
 1: 10532,
 2: 76951,
 3: 147456,
 4: 27616,
 5: 69252,
 6: 45339,
 7: 36557,
 8: 70741,
 9: 10288,
 10: 31804,
 11: 151693,
 12: 112060,
 13: 89809,
 14: 131092,
 15: 8316,
 16: 1516,
 17: 5348,
 18: 75342,
 19: 88143,
 20: 24126,
 21: 12460,
 22: 100613,
 23: 62696,
 24: 122843,
 25: 6143,
 26: 136677,
 27: 144318,
 28: 93836,
 29: 106145,
 30: 24779,
 31: 93275,
 32: 144591,
 33: 1239,
 34: 53737,
 35: 125194,
 36: 107911,
 37: 135100,
 38: 74655,
 39: 43665,
 40: 35178,
 41: 105328,
 42: 131098,
 43: 109678,
 44: 69903,
 45: 14674,
 46: 46905,
 47: 92257,
 48: 61024,
 49: 82383,
 50: 111617,
 51: 63703,
 52: 28153,
 53: 13100,
 54: 134542,
 55: 137737,
 56: 116573,
 57: 67565,
 58: 16037,
 59: 137153,
 60: 46684,
 61: 33019,
 62: 53375,
 63: 124665,
 64: 45163,
 65: 118405,
 66: 67808,
 67: 139038,
 68: 57360,
 69: 26774,
 70: 12956,
 71: 149384,
 72: 131376,
 73: 132680,
 74: 127671,
 75: 141483,
 76: 97458,
 77: 86409,
 78: 23075,
 79: 140397,
 80: 94227,
 81: 140918,


## 태그

In [17]:
plylst['tags']

0                                                       [락]
1                                                  [추억, 회상]
2                                                 [까페, 잔잔한]
3         [연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...
4                                                      [댄스]
                                ...                        
138081                                                [잔잔한]
138082                          [어머니, 힘들때, 아빠, 가족, 위로받고싶을때]
138083                                                   []
138084                                                   []
138085                                                   []
Name: tags, Length: 138086, dtype: object

In [18]:
plylst_tag = plylst['tags']

In [19]:
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs]) # 하나의 플레이리스트에 태그가 여러개 존재하는 경우도 있으므로 이중 for문을 돌려 하나의 태그(tg)를 뽑음
# Counter를 통해 tag 개수 세기

In [20]:
tag_counter

Counter({'락': 4076,
         '추억': 6913,
         '회상': 4864,
         '까페': 3012,
         '잔잔한': 10782,
         '연말': 497,
         '눈오는날': 110,
         '캐럴': 81,
         '분위기': 2639,
         '따듯한': 117,
         '크리스마스캐럴': 7,
         '겨울노래': 221,
         '크리스마스': 998,
         '겨울왕국': 35,
         '크리스마스송': 7,
         '댄스': 3402,
         '운동': 2810,
         '드라이브': 10725,
         'Pop': 3435,
         '트로피컬하우스': 76,
         '힐링': 10344,
         '기분전환': 17421,
         '2017': 110,
         '팝': 6623,
         '트렌드': 147,
         '일렉': 2268,
         '짝사랑': 291,
         '취향저격': 1446,
         '슬픔': 3750,
         '고백': 523,
         '사랑': 9205,
         '이별': 5716,
         '일렉트로니카': 1052,
         '포크': 375,
         '메탈': 345,
         '인디': 4739,
         '록': 828,
         'Metal': 103,
         '이일우': 1,
         'M에센셜': 131,
         'Rock': 669,
         'kpop': 326,
         '걸그룹댄스': 13,
         '스트레스해소': 222,
         '새해': 198,
         '여행': 4806,
         '

In [21]:
# 위에서 만든 counter 자료형을 dict형으로 변환
tag_dict = {x: tag_counter[x] for x in tag_counter}

In [22]:
tag_dict

{'락': 4076,
 '추억': 6913,
 '회상': 4864,
 '까페': 3012,
 '잔잔한': 10782,
 '연말': 497,
 '눈오는날': 110,
 '캐럴': 81,
 '분위기': 2639,
 '따듯한': 117,
 '크리스마스캐럴': 7,
 '겨울노래': 221,
 '크리스마스': 998,
 '겨울왕국': 35,
 '크리스마스송': 7,
 '댄스': 3402,
 '운동': 2810,
 '드라이브': 10725,
 'Pop': 3435,
 '트로피컬하우스': 76,
 '힐링': 10344,
 '기분전환': 17421,
 '2017': 110,
 '팝': 6623,
 '트렌드': 147,
 '일렉': 2268,
 '짝사랑': 291,
 '취향저격': 1446,
 '슬픔': 3750,
 '고백': 523,
 '사랑': 9205,
 '이별': 5716,
 '일렉트로니카': 1052,
 '포크': 375,
 '메탈': 345,
 '인디': 4739,
 '록': 828,
 'Metal': 103,
 '이일우': 1,
 'M에센셜': 131,
 'Rock': 669,
 'kpop': 326,
 '걸그룹댄스': 13,
 '스트레스해소': 222,
 '새해': 198,
 '여행': 4806,
 '프로필음악': 4,
 '카카오톡': 14,
 '소원': 10,
 '프로필': 9,
 '소망': 7,
 '다짐': 10,
 '카톡': 8,
 '듣고': 5,
 '우울': 1205,
 '이거': 1,
 '힘내': 137,
 '힙합': 6830,
 '느낌있는': 845,
 '밤': 8059,
 '새벽': 8793,
 'RnB': 1071,
 '감각적인': 926,
 '국내': 288,
 '그루브한': 46,
 '가을': 4257,
 '재즈': 3332,
 '감성': 12105,
 '질리지않는': 19,
 '나만알고싶은': 156,
 '봄': 3604,
 '설렘': 5189,
 '비오는날': 3150,
 '누군가생각날때': 1,
 '스밍': 13,
 '목록': 3,
 '폐

In [23]:
for i, t in enumerate(tag_dict):
    print(i,t)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
25197 그리움_안녕
25198 잘지내길바래
25199 브리트니비치
25200 고음발라드
25201 폭풍오열
25202 Seohyun
25203 서현콘서트
25204 LoveStill
25205 현지트
25206 쉬운영어
25207 교육
25208 영어교육
25209 LowlyPalace
25210 방탄소년단수록곡
25211 ToniBraxton
25212 팝알앤비
25213 음악파트너
25214 토니블랙스톤
25215 Dojacat
25216 초대
25217 work
25218 캐슬린페리어
25219 인생역전
25220 KathleenFerrier
25221 오늘선곡
25222 네오_소울
25223 yasisi
25224 2월Yasisi인디뮤직
25225 촬영음악
25226 뮤직비디오음악
25227 머리_흔들어_재껴
25228 해드뱅잉
25229 중독성_갑
25230 3flow
25231 와인펍
25232 쓰리플로우
25233 신논현역
25234 북상하는장마전선
25235 시작은언제나비
25236 쏟아지는비가그치면떠나자
25237 무명시절
25238 유명가수
25239 준니
25240 다프트_펑크
25241 레드_핫_칠리_페퍼스
25242 사랑해봅시다
25243 올드앤뉴
25244 해피쏭
25245 올댓스케이트
25246 스케이트
25247 프라임보이
25248 마지막독백
25249 친구의죽음
25250 오디오게임
25251 deepmoood
25252 아메리칸송북
25253 백전노장
25254 5월하늘에
25255 방탄제이홉
25256 정호석
25257 노래방추천
25258 별빛이내린다
25259 더뮤지션pick한곡모음
25260 하프시코드
25261 외국힙합_1
25262 외국힙합_2
25263 남자_노래방
25264 붕어빵
25265 파운드케익
25266 크로스오버재즈
25267 thelonelyisland
25268 왜안들어
25269 절대들어
25270 천상의

In [24]:
tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
  tag_id_tid[t] = i
  tag_tid_id[i] = t

In [25]:
tag_tid_id

{0: '락',
 1: '추억',
 2: '회상',
 3: '까페',
 4: '잔잔한',
 5: '연말',
 6: '눈오는날',
 7: '캐럴',
 8: '분위기',
 9: '따듯한',
 10: '크리스마스캐럴',
 11: '겨울노래',
 12: '크리스마스',
 13: '겨울왕국',
 14: '크리스마스송',
 15: '댄스',
 16: '운동',
 17: '드라이브',
 18: 'Pop',
 19: '트로피컬하우스',
 20: '힐링',
 21: '기분전환',
 22: '2017',
 23: '팝',
 24: '트렌드',
 25: '일렉',
 26: '짝사랑',
 27: '취향저격',
 28: '슬픔',
 29: '고백',
 30: '사랑',
 31: '이별',
 32: '일렉트로니카',
 33: '포크',
 34: '메탈',
 35: '인디',
 36: '록',
 37: 'Metal',
 38: '이일우',
 39: 'M에센셜',
 40: 'Rock',
 41: 'kpop',
 42: '걸그룹댄스',
 43: '스트레스해소',
 44: '새해',
 45: '여행',
 46: '프로필음악',
 47: '카카오톡',
 48: '소원',
 49: '프로필',
 50: '소망',
 51: '다짐',
 52: '카톡',
 53: '듣고',
 54: '우울',
 55: '이거',
 56: '힘내',
 57: '힙합',
 58: '느낌있는',
 59: '밤',
 60: '새벽',
 61: 'RnB',
 62: '감각적인',
 63: '국내',
 64: '그루브한',
 65: '가을',
 66: '재즈',
 67: '감성',
 68: '질리지않는',
 69: '나만알고싶은',
 70: '봄',
 71: '설렘',
 72: '비오는날',
 73: '누군가생각날때',
 74: '스밍',
 75: '목록',
 76: '폐막식',
 77: '올림픽',
 78: '엑소',
 79: '조용히',
 80: '혼자',
 81: '또는',
 82: '새벽감성',
 83: '고민',
 

In [26]:
n_tags = len(tag_dict)

## 노래

In [27]:
plylst_song = plylst['songs']

In [28]:
plylst_song

0         [525514, 129701, 383374, 562083, 297861, 13954...
1         [432406, 675945, 497066, 120377, 389529, 24427...
2         [83116, 276692, 166267, 186301, 354465, 256598...
3         [394031, 195524, 540149, 287984, 440773, 10033...
4         [159327, 553610, 5130, 645103, 294435, 100657,...
                                ...                        
138081    [75842, 26083, 244183, 684715, 500593, 508608,...
138082    [450275, 487671, 561031, 663944, 628672, 59121...
138083    [625875, 464051, 11657, 236393, 358186, 213435...
138084    [161094, 665833, 688145, 432735, 439938, 12665...
138085    [643070, 132994, 98223, 293236, 513129, 650494...
Name: songs, Length: 138086, dtype: object

In [29]:
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])

In [30]:
# song_dict : 노래 id와 해당 노래의 빈도수
song_dict = {x: song_counter[x] for x in song_counter}
song_dict

{525514: 8,
 129701: 3,
 383374: 1,
 562083: 15,
 297861: 58,
 139541: 25,
 351214: 2,
 650298: 50,
 531057: 42,
 205238: 4,
 706183: 1,
 127099: 1,
 660493: 4,
 461973: 7,
 121455: 1,
 72552: 64,
 223955: 2,
 324992: 1,
 50104: 3,
 432406: 56,
 675945: 51,
 497066: 1188,
 120377: 18,
 389529: 342,
 244277: 42,
 461062: 142,
 696302: 37,
 442765: 19,
 532114: 931,
 586541: 451,
 33389: 225,
 244000: 109,
 692078: 19,
 37741: 31,
 645653: 41,
 571802: 50,
 200183: 33,
 61435: 24,
 204499: 33,
 41749: 13,
 129258: 12,
 413920: 9,
 117205: 3,
 6546: 1201,
 152422: 1327,
 602724: 117,
 425946: 217,
 173634: 79,
 631268: 2,
 409869: 1,
 97749: 2,
 395416: 418,
 103741: 6,
 181101: 9,
 472144: 7,
 414721: 25,
 75801: 28,
 315216: 27,
 192882: 44,
 383960: 273,
 548636: 88,
 83116: 199,
 276692: 93,
 166267: 391,
 186301: 18,
 354465: 42,
 256598: 23,
 233195: 9,
 666852: 108,
 686560: 23,
 556426: 14,
 142974: 262,
 331878: 79,
 195141: 52,
 32017: 46,
 617795: 31,
 396532: 24,
 623704: 30,


In [31]:
song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
  song_id_sid[t] = i
  song_sid_id[i] = t

In [32]:
song_id_sid

{525514: 0,
 129701: 1,
 383374: 2,
 562083: 3,
 297861: 4,
 139541: 5,
 351214: 6,
 650298: 7,
 531057: 8,
 205238: 9,
 706183: 10,
 127099: 11,
 660493: 12,
 461973: 13,
 121455: 14,
 72552: 15,
 223955: 16,
 324992: 17,
 50104: 18,
 432406: 19,
 675945: 20,
 497066: 21,
 120377: 22,
 389529: 23,
 244277: 24,
 461062: 25,
 696302: 26,
 442765: 27,
 532114: 28,
 586541: 29,
 33389: 30,
 244000: 31,
 692078: 32,
 37741: 33,
 645653: 34,
 571802: 35,
 200183: 36,
 61435: 37,
 204499: 38,
 41749: 39,
 129258: 40,
 413920: 41,
 117205: 42,
 6546: 43,
 152422: 44,
 602724: 45,
 425946: 46,
 173634: 47,
 631268: 48,
 409869: 49,
 97749: 50,
 395416: 51,
 103741: 52,
 181101: 53,
 472144: 54,
 414721: 55,
 75801: 56,
 315216: 57,
 192882: 58,
 383960: 59,
 548636: 60,
 83116: 61,
 276692: 62,
 166267: 63,
 186301: 64,
 354465: 65,
 256598: 66,
 233195: 67,
 666852: 68,
 686560: 69,
 556426: 70,
 142974: 71,
 331878: 72,
 195141: 73,
 32017: 74,
 617795: 75,
 396532: 76,
 623704: 77,
 516930:

In [33]:
n_songs = len(song_dict)

plylst의 songs와 tags를 새로운 id로 변환하여 DataFrame에 추가합니다

In [34]:
plylst.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2


In [35]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

In [36]:
plylst.tail(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,songs_id,tags_id
138083,[],77438,,"[625875, 464051, 11657, 236393, 358186, 213435...",0,2019-03-27 15:27:40.000,0,138083,"[1435, 718, 2659, 2773, 1359, 8731, 696, 697, ...",[]
138084,[],36231,,"[161094, 665833, 688145, 432735, 439938, 12665...",31,2015-11-18 11:49:09.000,0,138084,"[3091, 308295, 428975, 80278, 35027, 234993, 8...",[]
138085,[],65189,,"[643070, 132994, 98223, 293236, 513129, 650494...",19,2017-04-23 16:50:58.000,0,138085,"[13816, 3775, 10500, 17947, 8519, 3783, 3795, ...",[]


In [37]:
# num_songs(플리의 노래 수), num_tags(플리의 태그 수) 컬럼 만들기
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,istrain,nid,updt_date,songs_id,tags_id,num_songs,num_tags
0,1,0,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],19,1
1,1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]",42,2
2,1,2,2017-08-28 07:09:34.000,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]",28,2
3,1,3,2019-12-05 15:15:18.000,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]",38,10
4,1,4,2011-10-25 13:54:56.000,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15],53,1
...,...,...,...,...,...,...,...
138081,0,138081,2015-12-17 14:06:05.000,"[5607, 1025, 9650, 543806, 1424, 7372, 2234, 2...",[4],48,1
138082,0,138082,2020-04-16 21:35:44.000,"[638333, 244876, 108022, 420983, 20258, 595078...","[11913, 335, 3162, 455, 23086]",100,5
138083,0,138083,2019-03-27 15:27:40.000,"[1435, 718, 2659, 2773, 1359, 8731, 696, 697, ...",[],12,0
138084,0,138084,2015-11-18 11:49:09.000,"[3091, 308295, 428975, 80278, 35027, 234993, 8...",[],9,0


In [38]:
plylst_use = plylst_use.set_index('nid')
plylst_use.head(2)

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],19,1
1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]",42,2


In [39]:
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

test set에서 랜덤으로 샘플 300개만 뽑아 테스트해봅니다.

In [40]:
# sample test
np.random.seed(33)
n_sample = 300

test = plylst_test.iloc[np.random.choice(range(n_test), n_sample, replace=False),:]

# real test
# test = plylst_test
# print(len(test))

In [41]:
test

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
124803,0,2010-04-11 10:50:25.000,"[625209, 479246, 463289, 556519, 240419, 43638...",[],19,0
122133,0,2014-11-25 17:42:03.000,"[121068, 18863, 917, 263984, 148786, 7404, 190...",[],26,0
126073,0,2017-09-15 20:56:11.000,"[40061, 76488, 5950, 24720, 3305, 34443, 5904,...",[15529],35,1
134537,0,2014-05-15 16:26:42.000,"[10835, 48733, 42865, 42291, 5026, 5154, 15750...","[20, 31]",12,2
117942,0,2018-06-30 14:16:20.000,"[424959, 13187, 218305, 33287, 83254, 12968, 6...",[],13,0
...,...,...,...,...,...,...
126719,0,2018-05-29 09:49:05.000,[],"[110, 54, 1242, 83]",0,4
132146,0,2013-12-07 08:26:52.000,"[16360, 103980, 103073, 9216, 683, 53047, 1154...",[],9,0
125393,0,2016-07-18 17:52:43.000,"[124498, 72312, 29225, 2969, 74082, 2072, 1221...","[5097, 29629, 7146]",13,3
128840,0,2013-01-13 23:12:06.000,"[29688, 529981, 64366, 629382, 629383, 276816,...",[],30,0


### train_songs_A, train_tags_A 만들기 
* row가 playlist(nid)이고 column이 item(sid or tid)인 sparse matrix A를 만듭니다.
* sparse matrix는 주로 scipy의 CSR로 저장한다.


In [42]:
plylst_train['num_songs']

nid
0          19
1          42
2          28
3          38
4          53
         ... 
115066     12
115067     11
115068     11
115069     55
115070    200
Name: num_songs, Length: 115071, dtype: int64

In [43]:
plylst_train['num_songs'].sum()

5285871

In [44]:
n_train

115071

* np.repeat(반복할 값, 몇번 반복)
* 두번째 인자가 아래처럼 리스트일 경우 원소별로 반복할 횟수를 다르게 지정한다.
* 위에서 plylst_train['num_songs']값은 [19, 42, 28, 38, 53, ,,,]인데 * 이는 즉 첫번째 원소인 0은 19번 반복, 두번째 원소인 1은 42번 반복 을 의미한다.

In [45]:
row = np.repeat(range(n_train), plylst_train['num_songs'])
row

array([     0,      0,      0, ..., 115070, 115070, 115070])

In [46]:
plylst_train['songs_id']

nid
0         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1         [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...
2         [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...
3         [89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...
4         [127, 128, 129, 130, 131, 132, 133, 134, 135, ...
                                ...                        
115066    [156225, 119784, 144788, 28312, 615129, 321240...
115067    [148864, 143636, 311523, 266259, 8989, 389684,...
115068    [70287, 13559, 7387, 140158, 168013, 168422, 1...
115069    [123250, 92055, 170131, 86695, 54428, 120047, ...
115070    [16335, 16267, 26291, 22614, 30836, 30404, 229...
Name: songs_id, Length: 115071, dtype: object

In [47]:
col = [song for songs in plylst_train['songs_id'] for song in songs]
col

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [48]:
dat = np.repeat(1, plylst_train['num_songs'].sum())
dat

array([1, 1, 1, ..., 1, 1, 1])

In [49]:
len(dat) # 플레이리스트에 나온 노래들의 총 개수(몇몇 노래는 여러 플레이리스트에 중복되어 나오므로 n_songs보다 숫자가 큼)

5285871

In [50]:
n_songs # 총 노래 개수

638336

In [51]:
# row가 playlist(nid)이고 column이 sid인 sparse matrix 
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))
train_songs_A

<115071x638336 sparse matrix of type '<class 'numpy.longlong'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [52]:
# 위 과정을 마찬가지로 tag에도 적용한다. (train_tags_A 생성)
row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
train_tags_A

<115071x30197 sparse matrix of type '<class 'numpy.longlong'>'
	with 476331 stored elements in Compressed Sparse Row format>

In [53]:
# transpose
train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T = train_tags_A.T.tocsr()

In [54]:
train_songs_A_T

<638336x115071 sparse matrix of type '<class 'numpy.longlong'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [55]:
# argsort 예시
a = np.array([1.5, 0.2, 4.2, 2.5])
s = a.argsort()

In [56]:
s

array([1, 0, 3, 2])

In [57]:
s[-2:]

array([3, 2])

In [58]:
from tqdm import tqdm

def rec(pids):
  tt = 1

  res = [] # 추천 결과를 담는 곳

  for pid in pids:
    p = np.zeros((n_songs,1))
    p[test.loc[pid,'songs_id']] = 1

    # pid 플레이리스트 내에 있는 곡이  train 데이터 셋의 각 플레이리스트에는 몇개 담겨있는지 개수 구하기. val값이 클수록 해당 플레이리스트와 유사도가 높은 플레이리스트임.
    # 즉 val은 train 데이터 셋의 각 플레이리스트에 대한 유사도 가중치
    val = train_songs_A.dot(p).reshape(-1)
    # songs_already : 이미 플레이리스트 내에 존재하는 곡 아이디
    songs_already = test.loc[pid, "songs_id"]
    # tags_already : 이미 플레이리스트 내에 태그된 태그 아이디
    tags_already = test.loc[pid, "tags_id"]

    ##### song ######
    # val(유사도 가중치)를 바탕으로 각 곡에 대한 추천 점수 구하기
    cand_song = train_songs_A_T.dot(val)
    # 값이 큰 상위 150개의 song index 추출
    cand_song_idx = cand_song.reshape(-1).argsort()[-150:][::-1]

    # 노래 중복 확인 --> 중복이 아닌 cand_song_idx 중에서 100개의 song index 추출
    cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
    # 진짜 노래의 id 추출 (이때까지 사용한건 sid이지만 이것은 가짜 노래의 id임)
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]  # <-- 최종 추천 노래 id

    #### tag ########
    # train데이터 셋에 들어있는 플레이리스트와의 유사도 구하는건가?
    cand_tag = train_tags_A_T.dot(val)
    # 값이 큰 상위 15개의 tag index 추출 
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]
    # 태그 중복 확인 --> 중복이 아닌 tag index 중에서 10개의 tag index 추출
    cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
    # 진짜 태그명 추출 (이때까지 사용한건 태그의 id)   
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx] # <-- 최종 추천 태그 id

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })
    
    if tt % 1000 == 0:
      print(tt)

    tt += 1
  return res

#### pid(플레이리스트 id)가 124803 일 경우 

In [59]:
pid = 124803

In [60]:
p = np.zeros((n_songs,1))
p.shape

(638336, 1)

In [61]:
# 플레이리스트 id가 124803인 플레이리스트의 songs_id
test.loc[pid,'songs_id']

[625209,
 479246,
 463289,
 556519,
 240419,
 43638,
 71944,
 232538,
 625210,
 40923,
 120073,
 477851,
 252757,
 15831,
 252764,
 79972,
 505967,
 6642,
 353254]

In [62]:
# 위 결과를 바탕으로 625209, 479246, ,,, 353254번째 데이터는 1로 바꾼다.(이는 즉 플레이리스트 id가 124803인 플레이리스트에는 어떤 곡이 존재하는지를 알려주기 위함이다.)
p[test.loc[pid,'songs_id']] = 1
p

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [63]:
# id가 124803인 플레이리스트에 있는 곡이 train 데이터 셋의 각 플레이리스트에는 몇개 담겨있는지 개수 구하기. 
val = train_songs_A.dot(p).reshape(-1)
val

array([0., 0., 0., ..., 0., 0., 0.])

In [64]:
val[val != 0]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 3., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [65]:
val[val != 0].shape # train 플레이리스트 중 총 231개의 플레이리스트는 id가 124803인 플레이리스트에 수록된 곡과 중복된다. 만일 val값이 높을 수록 해당 플레이리스트는 id가 124803인 플레이리스트와 유사하다고 볼 수 있다.

(231,)

In [66]:
val.shape

(115071,)

In [67]:
train_songs_A_T.shape

(638336, 115071)

In [68]:
cand_song = train_songs_A_T.dot(val)
cand_song

array([0., 0., 0., ..., 0., 0., 0.])

In [69]:
cand_song.shape

(638336,)

In [70]:
# 값이 큰 상위 150개의 song index
cand_song.reshape(-1).argsort()[-150:][::-1]

array([ 43638,  40923,   6642,  71944,   3594,  14557,   5574,   2678,
          683,   5573,   1223,   1683,    769,   2298,   3597,   3756,
        15831,  13470,  95112,    693,   5585, 252757, 120073,   2659,
          899,  22706,  63457,   3664,   2301,   3589,   8643,  13458,
         9235,  16265,  35238,   3583,  10968,   8329,  30836,  26647,
        24678,  11186,    233,  71001,  69927,  11922,  16563,   1435,
        39794,   3582,  30124,   4844,  64264,  55407,  32991,   3596,
         5116,   9957,  10926, 122664,    709,  10989,    665,  52507,
        16267,  34322,  13879, 197852,    676,   2306,   1324,   5540,
        16310,  42420,   2034,   5333,  79972,    694,  40510,  40502,
        74367,    662,    657,  35242,   8320,  12065,  17129,  22032,
        22622,  40694,   9846,    908,   9053,    695,  16509,  45746,
         9804, 350629,  15653,  15680,  16526,   2287, 220263,   9217,
        14621,  35279, 146488,    898,    658, 345946, 363968, 106382,
      

In [115]:
cand_song_idx = cand_song.reshape(-1).argsort()[-150:][::-1]

In [116]:
# songs_already : 이미 플레이리스트 내에 존재하는 곡 아이디
songs_already = test.loc[pid, "songs_id"]
# tags_already : 이미 플레이리스트 내에 태그된 태그 아이디
tags_already = test.loc[pid, "tags_id"]

In [118]:
np.isin(cand_song_idx, songs_already)
# 150개의 song_id중에서(cand_song_idx) 이미 플레이리스트 내에 존재하는 곡(songs_already)이 있는지 확인

array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [119]:
# 노래 중복 확인 --> 중복이 아닌 cand_song_idx 중에서 100개의 song index 추출
cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]

In [120]:
# 진짜 노래의 id 추출 (이때까지 사용한건 sid이지만 이것은 가짜 노래의 id임)
rec_song_idx = [song_sid_id[i] for i in cand_song_idx]
rec_song_idx


[37298,
 572238,
 622548,
 207558,
 177460,
 175230,
 439301,
 689088,
 280915,
 347872,
 505710,
 144856,
 494037,
 507545,
 331055,
 624673,
 11657,
 326424,
 645162,
 532771,
 346967,
 436693,
 28832,
 335757,
 312626,
 425904,
 220139,
 355067,
 595717,
 426804,
 550374,
 135272,
 676988,
 385098,
 142557,
 116573,
 474318,
 642282,
 592021,
 525949,
 625875,
 156833,
 284554,
 451310,
 336013,
 278886,
 142360,
 490266,
 550695,
 12397,
 101480,
 101972,
 401146,
 472296,
 140867,
 129018,
 342491,
 456354,
 219729,
 263582,
 596195,
 655888,
 506919,
 76888,
 62200,
 645602,
 52237,
 535146,
 11924,
 99287,
 258806,
 211325,
 306522,
 258508,
 663256,
 442077,
 72222,
 634718,
 423708,
 21125,
 73864,
 374865,
 267159,
 406349,
 455407,
 310974,
 596414,
 657888,
 470684,
 251692,
 543046,
 352459,
 92755,
 492580,
 499957,
 480973,
 516066,
 672232,
 294385,
 304937]

In [121]:
#### tag ########
# 유사도 구하는 건가?
cand_tag = train_tags_A_T.dot(val)
# 값이 큰 상위 15개의 tag index 추출 
cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]
# 태그 중복 확인 --> 중복이 아닌 tag index 중에서 10개의 tag index 추출
cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
# 진짜 태그명 추출 (이때까지 사용한건 태그의 id)
rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

In [122]:
rec_tag_idx

['락', '팝', '기분전환', '드라이브', '휴식', '힐링', '새벽', '잔잔한', '감성', '비오는날']

In [123]:
res = []
res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })
# pid(플레이리스트 id)가 124803 일 경우 추천된 태그와 곡
res 

[{'id': 28228,
  'songs': [37298,
   572238,
   622548,
   207558,
   177460,
   175230,
   439301,
   689088,
   280915,
   347872,
   505710,
   144856,
   494037,
   507545,
   331055,
   624673,
   11657,
   326424,
   645162,
   532771,
   346967,
   436693,
   28832,
   335757,
   312626,
   425904,
   220139,
   355067,
   595717,
   426804,
   550374,
   135272,
   676988,
   385098,
   142557,
   116573,
   474318,
   642282,
   592021,
   525949,
   625875,
   156833,
   284554,
   451310,
   336013,
   278886,
   142360,
   490266,
   550695,
   12397,
   101480,
   101972,
   401146,
   472296,
   140867,
   129018,
   342491,
   456354,
   219729,
   263582,
   596195,
   655888,
   506919,
   76888,
   62200,
   645602,
   52237,
   535146,
   11924,
   99287,
   258806,
   211325,
   306522,
   258508,
   663256,
   442077,
   72222,
   634718,
   423708,
   21125,
   73864,
   374865,
   267159,
   406349,
   455407,
   310974,
   596414,
   657888,
   470684,
   251692

In [55]:
test.sample(5)

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
121851,0,2016-05-06 02:15:00.000,"[103576, 444258, 68556, 31900, 1850, 48760, 42...",[],22,0
129643,0,2017-11-03 10:22:24.000,"[18728, 47836, 286881, 39012, 122486, 141381, ...","[59, 278, 328]",10,3
134584,0,2019-11-12 22:57:18.000,"[415, 15580, 4940, 6879, 839, 29362, 29711, 72...",[],21,0
120499,0,2014-10-13 20:16:09.000,"[163599, 35496, 27774, 220217, 203026, 48679, ...",[2],95,1
123167,0,2018-03-10 22:15:10.000,"[126461, 171207, 71693, 74641, 327279, 74642, ...",[],19,0


In [56]:
test.index

Int64Index([124803, 122133, 126073, 134537, 117942, 133307, 134839, 124076,
            120998, 116400,
            ...
            134835, 132066, 124523, 127382, 130621, 126719, 132146, 125393,
            128840, 129643],
           dtype='int64', name='nid', length=300)

## 결과

In [79]:
answers = rec(test.index)

In [72]:
answers # 300개의 플레이리스트에 각각에 대해 100개의 곡과 10개의 태그 예측한 결과값

[{'id': 28228,
  'songs': [37298,
   572238,
   622548,
   207558,
   177460,
   175230,
   439301,
   689088,
   280915,
   347872,
   505710,
   144856,
   494037,
   507545,
   331055,
   624673,
   11657,
   326424,
   645162,
   532771,
   346967,
   436693,
   28832,
   335757,
   312626,
   425904,
   220139,
   355067,
   595717,
   426804,
   550374,
   135272,
   676988,
   385098,
   142557,
   116573,
   474318,
   642282,
   592021,
   525949,
   625875,
   156833,
   284554,
   451310,
   336013,
   278886,
   142360,
   490266,
   550695,
   12397,
   101480,
   101972,
   401146,
   472296,
   140867,
   129018,
   342491,
   456354,
   219729,
   263582,
   596195,
   655888,
   506919,
   76888,
   62200,
   645602,
   52237,
   535146,
   11924,
   99287,
   258806,
   211325,
   306522,
   258508,
   663256,
   442077,
   72222,
   634718,
   423708,
   21125,
   73864,
   374865,
   267159,
   406349,
   455407,
   310974,
   596414,
   657888,
   470684,
   251692

In [73]:
len(answers)

300

In [74]:
write_json(answers, "results/results.json")

In [75]:
evaluator = CustomEvaluator()
evaluator.evaluate("val.json", "arena_data/results/results.json")

float division by zero


In [61]:
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")


[Errno 2] No such file or directory: 'arena_data/answers/val.json'


In [80]:
test

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
124803,0,2010-04-11 10:50:25.000,"[625209, 479246, 463289, 556519, 240419, 43638...",[],19,0
122133,0,2014-11-25 17:42:03.000,"[121068, 18863, 917, 263984, 148786, 7404, 190...",[],26,0
126073,0,2017-09-15 20:56:11.000,"[40061, 76488, 5950, 24720, 3305, 34443, 5904,...",[15529],35,1
134537,0,2014-05-15 16:26:42.000,"[10835, 48733, 42865, 42291, 5026, 5154, 15750...","[20, 31]",12,2
117942,0,2018-06-30 14:16:20.000,"[424959, 13187, 218305, 33287, 83254, 12968, 6...",[],13,0
...,...,...,...,...,...,...
126719,0,2018-05-29 09:49:05.000,[],"[110, 54, 1242, 83]",0,4
132146,0,2013-12-07 08:26:52.000,"[16360, 103980, 103073, 9216, 683, 53047, 1154...",[],9,0
125393,0,2016-07-18 17:52:43.000,"[124498, 72312, 29225, 2969, 74082, 2072, 1221...","[5097, 29629, 7146]",13,3
128840,0,2013-01-13 23:12:06.000,"[29688, 529981, 64366, 629382, 629383, 276816,...",[],30,0


In [83]:
test = plylst_test

In [84]:
test

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
115071,0,2019-05-27 14:14:33.000,"[82770, 73350, 49850, 283466, 38811, 14654, 83...",[],27,0
115072,0,2014-07-16 15:24:24.000,[],[],0,0
115073,0,2008-06-21 23:26:22.000,"[42084, 86991, 615142, 615143, 66432, 191918, ...",[],14,0
115074,0,2017-10-30 18:15:43.000,"[19289, 156274, 92524, 5729, 9179, 4694, 3233,...",[],17,0
115075,0,2017-02-07 11:40:42.000,"[72186, 47442, 47461, 24939, 209259, 81164, 24...",[],8,0
...,...,...,...,...,...,...
138081,0,2015-12-17 14:06:05.000,"[5607, 1025, 9650, 543806, 1424, 7372, 2234, 2...",[4],48,1
138082,0,2020-04-16 21:35:44.000,"[638333, 244876, 108022, 420983, 20258, 595078...","[11913, 335, 3162, 455, 23086]",100,5
138083,0,2019-03-27 15:27:40.000,"[1435, 718, 2659, 2773, 1359, 8731, 696, 697, ...",[],12,0
138084,0,2015-11-18 11:49:09.000,"[3091, 308295, 428975, 80278, 35027, 234993, 8...",[],9,0


In [77]:
plylst_test.index

Int64Index([115071, 115072, 115073, 115074, 115075, 115076, 115077, 115078,
            115079, 115080,
            ...
            138076, 138077, 138078, 138079, 138080, 138081, 138082, 138083,
            138084, 138085],
           dtype='int64', name='nid', length=23015)

In [85]:
answers2 = rec(test.index)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000


In [86]:
write_json(answers, "results/results.json")

In [87]:
evaluator = CustomEvaluator()
evaluator.evaluate("val.json", "arena_data/results/results.json")

float division by zero
