## split_data code 분석

In [1]:
import copy
import random

import fire
import numpy as np

from arena_util import load_json
from arena_util import write_json

### dataset load

In [3]:
# Load the playlists through the project helper and shuffle them in place so
# the positional slices taken in later cells are random.
# NOTE(review): no seed is set in this cell, so the order differs between
# runs; ArenaSplitter.run further down calls random.seed(777) before shuffling.
print("Reading data...\n")
playlists = load_json('res/train.json')
random.shuffle(playlists)
print(f"Total playlists: {len(playlists)}")

Reading data...

Total playlists: 115071


In [4]:
tot = len(playlists)
# train:val = 8:2
# The first int(tot * 0.80) playlists of the shuffled list become train and
# the remainder becomes val; both slices use the same cut index, so no
# playlist is dropped or duplicated.
train = playlists[:int(tot*0.80)]
val = playlists[int(tot*0.80):]

In [5]:
# Persist the unmasked train/val splits through the project helper.
# NOTE(review): where write_json roots these relative paths is not visible
# in this file.
print("Original train...")
write_json(train, "orig/train.json")
print("Original val...")
write_json(val, "orig/val.json")

Original train...
Original val...


### masking

In [6]:
# 객체 복사
playlists = copy.deepcopy(playlists)
tot = len(playlists)

In [7]:
# playlists 전체를 4개의 part로 구분해서 각각 다르게 전처리
song_only = playlists[:int(tot * 0.3)]
song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
title_only = playlists[int(tot * 0.95):]

In [8]:
# 4개 part별 확인
print(f"Total: {len(playlists)}, "
        f"Song only: {len(song_only)}, "
        f"Song & Tags: {len(song_and_tags)}, "
        f"Tags only: {len(tags_only)}, "
        f"Title only: {len(title_only)}")

Total: 115071, Song only: 34521, Song & Tags: 57535, Tags only: 17261, Title only: 5754


#### masking 함수

In [9]:
def mask(playlists, mask_cols, del_cols):
    """Build question/answer versions of ``playlists`` by hiding column values.

    Args:
        playlists: Sequence of playlist dicts. Every column named in
            ``mask_cols`` or ``del_cols`` must map to a list.
        mask_cols: Columns to split at random. ``len // 2`` of the values go
            to the question and the rest to the answer, each side keeping the
            original relative order.
        del_cols: Columns hidden completely in the question (set to ``[]``).
            In the answer, ``'songs'`` is cut to its first 100 entries and
            ``'tags'`` to its first 10; other column names are left as is.

    Returns:
        ``(q_pl, a_pl)``: two lists of deep-copied playlist dicts, index
        aligned with ``playlists``. The input is not modified.

    Note:
        The call sites in this file pass disjoint ``mask_cols`` and
        ``del_cols``; a column listed in both is not a supported use.
    """
    q_pl = copy.deepcopy(playlists)
    a_pl = copy.deepcopy(playlists)

    for playlist, q_entry, a_entry in zip(playlists, q_pl, a_pl):
        for del_col in del_cols:
            q_entry[del_col] = []
            if del_col == 'songs':
                a_entry[del_col] = a_entry[del_col][:100]
            elif del_col == 'tags':
                a_entry[del_col] = a_entry[del_col][:10]

        for col in mask_cols:
            mask_len = len(playlist[col])
            # Exactly mask_len // 2 True flags, placed at random positions.
            # The global NumPy RNG is used in the same way as before, so a
            # given np.random.seed still yields the same split.
            keep = np.full(mask_len, False)
            keep[:mask_len // 2] = True
            np.random.shuffle(keep)
            flags = keep.tolist()

            # Select with plain Python instead of round-tripping the values
            # through np.array: that turned every element into a NumPy scalar
            # (np.int64 / np.str_), which the standard json module cannot
            # serialise without a custom converter.
            q_entry[col] = [v for v, kept in zip(q_entry[col], flags) if kept]
            a_entry[col] = [v for v, kept in zip(a_entry[col], flags) if not kept]

    return q_pl, a_pl

In [12]:
# mask 함수 활용하여
# answer의 song, tag 개수 조정
# question masking
song_q, song_a = mask(song_only, ['songs'], ['tags'])
songtag_q, songtag_a = mask(song_and_tags, ['songs', 'tags'], [])
tag_q, tag_a = mask(tags_only, ['tags'], ['songs'])
title_q, title_a = mask(title_only, [], ['songs', 'tags'])

In [13]:
# Concatenate the four parts in the same order on both sides so that q[i]
# and a[i] still describe the same playlist.
q = song_q + songtag_q + tag_q + title_q
a = song_a + songtag_a + tag_a + title_a

In [15]:
# Start from the identity permutation 0..len(q)-1; the bare expression on
# the last line only displays the array as the cell output.
shuffle_indices = np.arange(len(q))
shuffle_indices

array([     0,      1,      2, ..., 115068, 115069, 115070])

In [17]:
# Shuffle the index array in place with NumPy's global RNG, then display it.
np.random.shuffle(shuffle_indices)
shuffle_indices

array([110855,  13207,  14967, ...,  25416,  84238,  61752])

In [19]:
# Reorder questions and answers with the same permutation so the pairs stay
# aligned while the four parts get interleaved.
q = list(np.array(q)[shuffle_indices])
a = list(np.array(a)[shuffle_indices])

In [21]:
# Sanity check: both lists must have the same number of playlists.
len(q), len(a)

(115071, 115071)

In [24]:
# NOTE(review): train_q / train_a are not assigned in any cell visible in
# this file (the masked lists built above are named q and a), and the output
# recorded under this cell comes from different code. Confirm which
# variables were meant before re-running.
write_json(train_q, "questions/train_q.json")
write_json(train_a, "answers/train_a.json")

Masked val...


NameError: name '_mask_data' is not defined

In [None]:
class ArenaSplitter:
    """Split a playlist file into train/val and build a masked validation set."""

    def _split_data(self, playlists):
        """Return ``(train, val)``: the first 80% of ``playlists`` and the rest."""
        cut = int(len(playlists) * 0.80)
        return playlists[:cut], playlists[cut:]

    def _mask(self, playlists, mask_cols, del_cols):
        """Build question/answer versions of ``playlists`` by hiding values.

        Args:
            playlists: Sequence of playlist dicts. Every column named in
                ``mask_cols`` or ``del_cols`` must map to a list.
            mask_cols: Columns to split at random. ``len // 2`` of the values
                go to the question and the rest to the answer, each side
                keeping the original relative order.
            del_cols: Columns hidden completely in the question (set to
                ``[]``). In the answer, ``'songs'`` is cut to its first 100
                entries and ``'tags'`` to its first 10.

        Returns:
            ``(q_pl, a_pl)``: two lists of deep-copied playlist dicts, index
            aligned with ``playlists``. The input is not modified.

        Note:
            ``_mask_data`` always passes disjoint ``mask_cols`` and
            ``del_cols``; a column listed in both is not a supported use.
        """
        q_pl = copy.deepcopy(playlists)
        a_pl = copy.deepcopy(playlists)

        for playlist, q_entry, a_entry in zip(playlists, q_pl, a_pl):
            for del_col in del_cols:
                q_entry[del_col] = []
                if del_col == 'songs':
                    a_entry[del_col] = a_entry[del_col][:100]
                elif del_col == 'tags':
                    a_entry[del_col] = a_entry[del_col][:10]

            for col in mask_cols:
                mask_len = len(playlist[col])
                # Exactly mask_len // 2 True flags at random positions.
                keep = np.full(mask_len, False)
                keep[:mask_len // 2] = True
                np.random.shuffle(keep)
                flags = keep.tolist()

                # Select with plain Python rather than via np.array, which
                # turned every value into a NumPy scalar (np.int64 / np.str_)
                # that the standard json module cannot serialise as is.
                q_entry[col] = [v for v, kept in zip(q_entry[col], flags) if kept]
                a_entry[col] = [v for v, kept in zip(a_entry[col], flags) if not kept]

        return q_pl, a_pl

    def _mask_data(self, playlists):
        """Mask ``playlists`` in four positional groups and shuffle the result.

        The groups are taken in order: 30% keep half of their songs and lose
        all tags, 50% keep half of their songs and half of their tags, 15%
        keep half of their tags and lose all songs, and the remaining 5% lose
        both (only the other fields, such as the title, stay).

        Returns:
            ``(q, a)``: question and answer playlist lists, shuffled with the
            same permutation so ``q[i]`` and ``a[i]`` belong together.
        """
        playlists = copy.deepcopy(playlists)
        tot = len(playlists)
        song_end = int(tot * 0.3)
        songtag_end = int(tot * 0.8)
        tag_end = int(tot * 0.95)

        song_only = playlists[:song_end]
        song_and_tags = playlists[song_end:songtag_end]
        tags_only = playlists[songtag_end:tag_end]
        title_only = playlists[tag_end:]

        print(f"Total: {len(playlists)}, "
              f"Song only: {len(song_only)}, "
              f"Song & Tags: {len(song_and_tags)}, "
              f"Tags only: {len(tags_only)}, "
              f"Title only: {len(title_only)}")

        song_q, song_a = self._mask(song_only, ['songs'], ['tags'])
        songtag_q, songtag_a = self._mask(song_and_tags, ['songs', 'tags'], [])
        tag_q, tag_a = self._mask(tags_only, ['tags'], ['songs'])
        title_q, title_a = self._mask(title_only, [], ['songs', 'tags'])

        q = song_q + songtag_q + tag_q + title_q
        a = song_a + songtag_a + tag_a + title_a

        # One permutation applied to both lists interleaves the groups while
        # keeping question/answer pairs aligned. Plain indexing avoids
        # building a NumPy object array out of the playlist dicts.
        shuffle_indices = np.arange(len(q))
        np.random.shuffle(shuffle_indices)

        q = [q[i] for i in shuffle_indices]
        a = [a[i] for i in shuffle_indices]

        return q, a

    def run(self, fname, seed=777):
        """Split ``fname`` 80/20 and write the original and masked outputs.

        Args:
            fname: Path of the playlist JSON file, passed to ``load_json``.
            seed: Seed for both random number generators (default 777, the
                value previously hard-coded for ``random``).

        Writes ``orig/train.json``, ``orig/val.json``, ``questions/val.json``
        and ``answers/val.json`` through ``write_json``.
        """
        random.seed(seed)
        # The masking and the final shuffle draw from NumPy's global RNG, so
        # it has to be seeded as well; seeding only `random` left the written
        # question/answer files different on every run.
        np.random.seed(seed)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")


# Script entry point: hand the class to python-fire, which exposes its
# methods (e.g. `run`) as command-line commands.
if __name__ == "__main__":
    fire.Fire(ArenaSplitter)
