In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
import distutils.dir_util
import pickle
import io
import os

# 데이터 로드

In [2]:
# JSON write & load helper definitions
def write_json(data, fname):
    """Serialize ``data`` to ``fname`` as UTF-8 JSON, creating parent directories.

    Args:
        data: JSON-serializable object. NumPy integer / floating / boolean
            scalars and NumPy arrays nested inside it are converted to the
            equivalent native Python values.
        fname: Destination file path. Missing parent directories are created.

    Raises:
        TypeError: If ``data`` contains an object that is neither natively
            JSON-serializable nor one of the NumPy types listed above.
    """
    def _conv(o):
        # ``json`` calls this hook only for objects it cannot encode itself.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(o).__name__)
        )

    # Serialize before touching the filesystem so that a serialization error
    # cannot truncate a file that already exists at ``fname``.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    # ``distutils`` was removed in Python 3.12; ``os.makedirs`` is the stdlib
    # replacement. A bare filename has an empty dirname, which makedirs rejects.
    parent = os.path.dirname(fname)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with io.open(fname, "w", encoding="utf-8") as f:
        f.write(json_str)
        
def load_json(fname):
    """Read the UTF-8 encoded JSON file at ``fname`` and return the parsed object."""
    with open(fname, encoding='utf-8') as src:
        return json.loads(src.read())

In [5]:
# Load the train and test playlist data (read in that order)
train, test = (
    pd.read_json(json_path)
    for json_path in ("../0_data/train.json", "../0_data/test.json")
)

In [6]:
# DataFrame.info() prints its summary and returns None, so call it as two
# statements; wrapping both calls in a tuple only echoes "(None, None)".
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45824 entries, 0 to 45823
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tags          45824 non-null  object
 1   id            45824 non-null  int64 
 2   plylst_title  45824 non-null  object
 3   songs         45824 non-null  object
 4   like_cnt      45824 non-null  int64 
 5   updt_date     45824 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tags          11456 non-null  object
 1   id            11456 non-null  int64 
 2   plylst_title  11456 non-null  object
 3   songs         11456 non-null  object
 4   like_cnt      11456 non-null  int64 
 5   updt_date     11456 non-null  object
dtypes: int64(2), object(4)
memory usage: 537.1+ KB


(None, None)

# One-Hot Encoding

> Song, tag 빈도 수에 따른 의미있는 feature 선정

In [7]:
# Songs: the per-playlist song-id lists, then every song id flattened into one list
t_song = train['songs']
song_list = [song_id for playlist in t_song for song_id in playlist]

# Tags: the per-playlist tag lists, then every tag flattened into one list
t_tag = train['tags']
tag_list = [tag_name for playlist in t_tag for tag_name in playlist]

In [8]:
# Occurrence count of every song and of every tag in the flattened lists
count_song, count_tag = Counter(song_list), Counter(tag_list)

In [9]:
# with open('../0_data/count_song.pkl', 'wb') as f:
#     pickle.dump(count_song, f)

In [10]:
# with open('../0_data/count_tag.pkl', 'wb') as f:
#     pickle.dump(count_tag, f)

In [7]:
# Summary statistics of the per-song occurrence counts held in count_song
pd.Series(count_song).describe()

count    44674.000000
mean        25.668286
std         36.829526
min          1.000000
25%          9.000000
50%         14.000000
75%         26.000000
max        848.000000
dtype: float64

=> 곡의 빈도수 편차가 심하다(평균 25.7, 표준편차 36.8, 최댓값 848). 따라서 빈도수가 중앙값(50% 분위수 = 14회) 미만인 곡은 마이너한 곡으로 해석하고, 해당 곡은 학습에 사용하지 않는다.

In [8]:
# Summary statistics of the per-tag occurrence counts held in count_tag
pd.Series(count_tag).describe()

count    3400.000000
mean       55.305588
std       292.691021
min         1.000000
25%         5.000000
50%         8.000000
75%        20.000000
max      6743.000000
dtype: float64

=> 태그의 빈도수는 편차가 더욱 심하다(평균 55.3, 표준편차 292.7, 최댓값 6743). 따라서 빈도수가 중앙값(50% 분위수 = 8회) 미만인 태그는 일반적인 태그가 아니라고 해석하고, 해당 태그는 학습에 사용하지 않는다.

- count_song < 14 인 song은 학습에서 제외 (count_song >= 14 인 song만 사용)
- count_tag < 8 인 tag는 학습에서 제외 (count_tag >= 8 인 tag만 사용)

In [9]:
# Threshold equals the 50% quantile of the song counts shown by describe() above
song_thr = 14
# mfl ("meaningful") maps each kept song id to its count; songs below the threshold are dropped
mfl = {song_id: freq for song_id, freq in count_song.items() if freq >= song_thr}
song_len = len(mfl)
song_len

22798

In [10]:
# Threshold equals the 50% quantile of the tag counts shown by describe() above
tag_thr = 8
# Append the kept tags to the same mapping that already holds the kept songs
mfl.update((tag_name, freq) for tag_name, freq in count_tag.items() if freq >= tag_thr)
# Tags added = total entries minus the songs counted in the previous step
tag_len = len(mfl) - song_len
tag_len

1868

In [12]:
mfl_col = list(mfl.keys())
len(mfl_col) # 유의미한 song과 tag의 총 개수 (song_len+tag_len)

24666

In [27]:
# Persist the list of selected song/tag column labels to disk
with open('../0_data/mfl_col.pkl', 'wb') as f:
    pickle.dump(mfl_col, f)

In [29]:
# Reload the column-label list from disk and display its length.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook itself produced.
with open('../0_data/mfl_col.pkl', 'rb') as f:
    mfl_col = pickle.load(f)

len(mfl_col)

24666

> zero matrix 생성

In [13]:
# Zero matrix with one row per training playlist and one column per
# meaningful song / tag
zero_matrix = np.zeros(shape=(train.shape[0], len(mfl_col)))
zero_matrix.shape

(45824, 24666)

In [14]:
# Convert the matrix to a DataFrame: rows labelled by playlist id, columns by song / tag
df_zero = pd.DataFrame(data=zero_matrix, index=train['id'], columns=mfl_col)
df_zero.head()

Unnamed: 0_level_0,649965,143660,640591,411147,497957,222610,6054,146449,509038,200182,...,이국적인,음식,질리지않는,여름음악,흐린,색다른,deep,교향곡,히트,어버이날
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> One-Hot Encoding

- train

In [15]:
# Set a 1 at the (playlist, song/tag) coordinates that occur in each playlist

def create_onehot(df, column_name, dtype=np.float64):
    """Build a playlist x (song / tag) indicator matrix.

    Args:
        df: DataFrame with an ``id`` column plus list-valued tag and song
            columns. The ``tags`` / ``songs`` columns are used when present;
            otherwise the columns at positions 0 and 3 are used.
        column_name: Sequence of song ids / tag strings that become the output
            columns. Items of a playlist that are not in this sequence are
            ignored.
        dtype: Element type of the result. The default ``np.float64`` keeps the
            historical output; a narrower type such as ``np.uint8`` shrinks the
            dense matrix, which is ``len(df) * len(column_name)`` elements.

    Returns:
        DataFrame indexed by ``df['id']`` with one column per entry of
        ``column_name``; a cell is 1 when the playlist contains that song or
        tag and 0 otherwise.
    """
    columns = list(column_name)

    # Label -> column position, built once so each lookup is O(1) instead of a
    # linear scan of the column list. setdefault keeps the first position of a
    # repeated label, matching list.index().
    col_pos = {}
    for pos, label in enumerate(columns):
        col_pos.setdefault(label, pos)

    tags = df['tags'] if 'tags' in df.columns else df.iloc[:, 0]
    songs = df['songs'] if 'songs' in df.columns else df.iloc[:, 3]

    # Fill a plain ndarray and wrap it once at the end; per-cell DataFrame
    # assignment is far slower than ndarray assignment.
    matrix = np.zeros((len(df), len(columns)), dtype=dtype)
    for row, (row_tags, row_songs) in enumerate(zip(tags, songs)):
        for item in (*row_tags, *row_songs):
            pos = col_pos.get(item)
            if pos is not None:
                matrix[row, pos] = 1
    return pd.DataFrame(matrix, columns=columns, index=df['id'])

In [16]:
# One-hot encode the training playlists with create_onehot
train_onehot = create_onehot(df=train, column_name=mfl_col)
train_onehot.shape

(45824, 24666)

In [18]:
# Persist the one-hot encoded training frame to disk
with open('../0_data/train_onehot.pkl', 'wb') as f:
    pickle.dump(train_onehot, f)

In [24]:
# Reload the one-hot encoded training frame from disk and display its shape.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook itself produced.
with open('../0_data/train_onehot.pkl', 'rb') as f:
    train_onehot = pickle.load(f)

train_onehot.shape

(45824, 24666)

- test

In [20]:
# One-hot encode the test playlists with create_onehot, using the same
# column list as the training frame
test_onehot = create_onehot(df=test, column_name=mfl_col)
test_onehot.shape

(11456, 24666)

In [25]:
# Persist the one-hot encoded test frame to disk
with open('../0_data/test_onehot.pkl', 'wb') as f:
    pickle.dump(test_onehot, f)

In [26]:
# Reload the one-hot encoded test frame from disk and display its shape.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook itself produced.
with open('../0_data/test_onehot.pkl', 'rb') as f:
    test_onehot = pickle.load(f)

test_onehot.shape

(11456, 24666)

---