## split_data code 분석

In [1]:
import copy
import random

import fire
import numpy as np

from arena_util import load_json
from arena_util import write_json

### dataset load

In [2]:
# Load the training playlists and shuffle them in place. No random seed is set
# in this cell, so unless one is set elsewhere the order differs between runs.
print("Reading data...\n")
playlists = load_json('res/train.json')
random.shuffle(playlists)
print(f"Total playlists: {len(playlists)}")

Reading data...

Total playlists: 115071


In [3]:
# Hold out the tail of the list for validation: train:val = 8:2.
tot = len(playlists)
train, val = playlists[:int(tot*0.80)], playlists[int(tot*0.80):]

### masking

In [5]:
# Rebind `playlists` to a deep copy so that later modifications do not touch
# the playlist dicts that were loaded originally; `tot` is recomputed from it.
playlists = copy.deepcopy(playlists)
tot = len(playlists)

In [6]:
# Split the whole playlist list into 4 consecutive parts, each of which is
# preprocessed differently: first 30%, next 50%, next 15%, and the last 5%.
song_only = playlists[:int(tot * 0.3)]
song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
title_only = playlists[int(tot * 0.95):]

In [7]:
# Report the size of each of the 4 parts next to the total.
print(f"Total: {len(playlists)}, "
        f"Song only: {len(song_only)}, "
        f"Song & Tags: {len(song_and_tags)}, "
        f"Tags only: {len(tags_only)}, "
        f"Title only: {len(title_only)}")

Total: 115071, Song only: 34521, Song & Tags: 57535, Tags only: 17261, Title only: 5754


#### masking 함수

In [8]:
def mask(playlists, mask_cols, del_cols):
    """Build masked "question" and "answer" copies of *playlists*.

    The input is never modified; both results are deep copies.

    Args:
        playlists: List of playlist dicts whose ``mask_cols`` / ``del_cols``
            entries are lists.
        mask_cols: Columns to split at random. ``len // 2`` items of each
            column stay in the question; the remaining items go to the answer.
        del_cols: Columns emptied in the question. In the answer, ``'songs'``
            is truncated to its first 100 items and ``'tags'`` to its first
            10; any other column is left untouched in the answer.

    Returns:
        A ``(questions, answers)`` tuple of two lists, aligned index by index
        with *playlists*.

    Note:
        A column is expected to appear in only one of ``mask_cols`` and
        ``del_cols``.
    """
    q_pl = copy.deepcopy(playlists)
    a_pl = copy.deepcopy(playlists)

    for source, question, answer in zip(playlists, q_pl, a_pl):
        for del_col in del_cols:
            question[del_col] = []
            if del_col == 'songs':
                answer[del_col] = answer[del_col][:100]
            elif del_col == 'tags':
                answer[del_col] = answer[del_col][:10]

        for col in mask_cols:
            n_items = len(source[col])
            # Boolean selector with exactly n_items // 2 True entries in a
            # random order; True means "keep in the question".
            keep = np.full(n_items, False)
            keep[:n_items // 2] = True
            np.random.shuffle(keep)

            # Select with zip instead of converting the column to a NumPy
            # array: the elements keep their native Python types (int / str)
            # rather than becoming np.int64 / np.str_, which the standard
            # json encoder cannot serialise in the int case.
            question[col] = [v for v, k in zip(question[col], keep) if k]
            answer[col] = [v for v, k in zip(answer[col], keep) if not k]

    return q_pl, a_pl

In [9]:
# Use mask() on each part to
# adjust the number of songs / tags kept in the answers and
# mask the questions.
song_q, song_a = mask(song_only, ['songs'], ['tags'])  # split songs, empty tags
songtag_q, songtag_a = mask(song_and_tags, ['songs', 'tags'], [])  # split both columns
tag_q, tag_a = mask(tags_only, ['tags'], ['songs'])  # split tags, empty songs
title_q, title_a = mask(title_only, [], ['songs', 'tags'])  # empty both columns

In [10]:
# Join the four parts in the same order on both sides so that q[i] and a[i]
# keep describing the same playlist.
q = [*song_q, *songtag_q, *tag_q, *title_q]
a = [*song_a, *songtag_a, *tag_a, *title_a]

In [11]:
# Index array 0 .. len(q) - 1; the bare expression below displays it.
shuffle_indices = np.arange(len(q))
shuffle_indices

array([     0,      1,      2, ..., 115068, 115069, 115070])

In [12]:
# Permute the index array in place, then display the resulting order.
np.random.shuffle(shuffle_indices)
shuffle_indices

array([93724, 13917,  2453, ..., 35402,  2068, 84189])

In [13]:
# Reorder questions and answers with the same permutation so that the pairs
# stay aligned index by index.
q = [q[idx] for idx in shuffle_indices]
a = [a[idx] for idx in shuffle_indices]

In [14]:
# Sanity check: display both lengths, which should be equal.
len(q), len(a)

(115071, 115071)

In [15]:
# Hand the reordered question and answer lists to arena_util.write_json,
# each with its own target path.
write_json(q, "questions/train.json")
write_json(a, "answers/train.json")