# 3. Masking data: question, answer data 생성

In [39]:
import io
import os
import json
import pandas as pd
import numpy as np
import random
import copy
import distutils.dir_util
from matplotlib import pyplot as plt

In [40]:
# JSON write & load helper functions
def write_json(data, fname):
    """Serialize *data* as UTF-8 JSON and write it to *fname*.

    Missing parent directories of *fname* are created. NumPy values that the
    standard encoder rejects are converted to their plain Python equivalents.

    Args:
        data: Object to serialize (any nesting of dict/list/str/number, plus
            NumPy scalars or arrays).
        fname: Destination file path.

    Raises:
        TypeError: If *data* contains an object that cannot be encoded.
    """
    def _conv(o):
        # json.dumps only calls this for objects it cannot encode natively.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable"
        )

    parent = os.path.dirname(fname)
    if parent:
        # os.makedirs('') raises, so skip it when fname has no directory part.
        os.makedirs(parent, exist_ok=True)

    # Serialize before opening the file so that a failed dump never truncates
    # an already existing output file.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)
    with io.open(fname, "w", encoding="utf-8") as f:
        f.write(json_str)
        
def load_json(fname):
    """Parse the UTF-8 encoded JSON file at *fname* and return the result."""
    with open(fname, encoding='utf-8') as src:
        return json.load(src)

In [42]:
# Load the test playlists from the shared data directory.
test = load_json('../0_data/test.json')

In [43]:
# Work on a deep copy so the loaded `test` object itself is never modified.
playlists = copy.deepcopy(test)
tot = len(playlists)  # total number of playlists, used for the split boundaries
tot  # bare expression: displays the count as the notebook cell output

11456

In [44]:
# Partition the playlists by position into four consecutive,
# non-overlapping subsets (30% / 50% / 15% / 5% of the total).
song_only, song_and_tags, tags_only, title_only = (
    playlists[:int(tot * 0.3)],                  # first 30%
    playlists[int(tot * 0.3):int(tot * 0.8)],    # next 50%
    playlists[int(tot * 0.8):int(tot * 0.95)],   # next 15%
    playlists[int(tot * 0.95):],                 # remaining 5%
)

# Report the size of every subset on a single line.
print(
    f"Total: {len(playlists)}, "
    f"Song only: {len(song_only)}, "
    f"Song & Tags: {len(song_and_tags)}, "
    f"Tags only: {len(tags_only)}, "
    f"Title only: {len(title_only)}"
)

Total: 11456, Song only: 3436, Song & Tags: 5728, Tags only: 1719, Title only: 573


In [None]:
# NOTE(review): alternative split. Here `song_and_tags` spans ALL playlists,
# so it overlaps the other three subsets. Presumably an experiment to mask
# songs and tags for every playlist -- confirm before running, because
# concatenating the four subsets afterwards would include playlists twice.
song_only = playlists[:int(tot * 0.3)] # first 30%
song_and_tags = playlists[:int(tot)] # every playlist (1.0, not 0.5)
tags_only = playlists[int(tot * 0.8):int(tot * 0.95)] # 15%
title_only = playlists[int(tot * 0.95):] # last 5%

# Only the total and the (full-size) song & tags subset are reported here.
print(f"Total: {len(playlists)}, "
        f"Song & Tags: {len(song_and_tags)}")

In [45]:
def mask(playlists, mask_cols, del_cols):
    """Split playlists into a masked "question" set and its "answer" set.

    For every playlist, each column in ``del_cols`` is emptied in the question
    copy, and each column in ``mask_cols`` is randomly split: the question
    keeps ``len // 2`` randomly chosen items and the answer keeps the
    remaining ones. Item order and the original Python item types are
    preserved in both halves. Randomness comes from the global NumPy RNG, so
    ``np.random.seed`` makes the split reproducible.

    Args:
        playlists: List of playlist dicts. Every column named in ``mask_cols``
            or ``del_cols`` must be present and hold a list.
        mask_cols: Column names to split between question and answer.
        del_cols: Column names hidden entirely from the question. In the
            answer, ``'songs'`` is capped at the first 100 items and
            ``'tags'`` at the first 10; other columns are left untouched.

    Returns:
        A tuple ``(q_pl, a_pl)`` of two new lists of playlist dicts. The input
        ``playlists`` is not modified.

    Raises:
        ValueError: If a column appears in both ``mask_cols`` and ``del_cols``.
    """
    overlap = set(mask_cols) & set(del_cols)
    if overlap:
        raise ValueError(
            f"columns cannot be both masked and deleted: {sorted(overlap)}"
        )

    q_pl = copy.deepcopy(playlists)
    a_pl = copy.deepcopy(playlists)

    for q_item, a_item in zip(q_pl, a_pl):
        for del_col in del_cols:
            q_item[del_col] = []
            if del_col == 'songs':
                a_item[del_col] = a_item[del_col][:100]
            elif del_col == 'tags':
                a_item[del_col] = a_item[del_col][:10]

        for col in mask_cols:
            n_items = len(q_item[col])
            # Boolean selector with exactly n_items // 2 True entries at
            # random positions; True means "goes to the question".
            to_question = np.full(n_items, False)
            to_question[:n_items // 2] = True
            np.random.shuffle(to_question)

            # Filter the plain lists directly instead of round-tripping
            # through np.array, so no NumPy scalar types leak into the output.
            q_item[col] = [
                x for x, keep in zip(q_item[col], to_question) if keep
            ]
            a_item[col] = [
                x for x, keep in zip(a_item[col], to_question) if not keep
            ]

    return q_pl, a_pl

In [None]:
# NOTE(review): this unexecuted cell performs the same steps as the executed
# cell that follows it -- presumably a leftover; consider removing one.

# Build question/answer pairs per subset: each call masks the first list of
# columns and hides the second list of columns from the question.
song_q, song_a = mask(song_only, ['songs'], ['tags'])
songtag_q, songtag_a = mask(song_and_tags, ['songs', 'tags'], [])
tag_q, tag_a = mask(tags_only, ['tags'], ['songs'])
title_q, title_a = mask(title_only, [], ['songs', 'tags'])

# Concatenate in the same subset order so q[i] and a[i] stay paired.
q = song_q + songtag_q + tag_q + title_q
a = song_a + songtag_a + tag_a + title_a

# Draw one random permutation and apply it to both lists to keep them aligned.
shuffle_indices = np.arange(len(q))
np.random.shuffle(shuffle_indices)

q = list(np.array(q)[shuffle_indices])
a = list(np.array(a)[shuffle_indices])

In [46]:
# Build question/answer pairs for each subset. The call order is kept as is
# because mask() draws from the global NumPy RNG.
song_q, song_a = mask(song_only, mask_cols=['songs'], del_cols=['tags'])
songtag_q, songtag_a = mask(song_and_tags, mask_cols=['songs', 'tags'], del_cols=[])
tag_q, tag_a = mask(tags_only, mask_cols=['tags'], del_cols=['songs'])
title_q, title_a = mask(title_only, mask_cols=[], del_cols=['songs', 'tags'])

# Same subset order on both sides keeps question i paired with answer i.
q = [*song_q, *songtag_q, *tag_q, *title_q]
a = [*song_a, *songtag_a, *tag_a, *title_a]

# A single random permutation, applied to both lists, shuffles the playlists
# without breaking the question/answer pairing.
shuffle_indices = np.arange(len(q))
np.random.shuffle(shuffle_indices)

q = [q[idx] for idx in shuffle_indices]
a = [a[idx] for idx in shuffle_indices]

In [47]:
# Persist the shuffled question and answer sets as separate JSON files.
write_json(q, "../0_data/q_test.json")
write_json(a, "../0_data/a_test.json")