## 데이터 전처리
### 기본 설정

In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

import pickle

In [2]:
# 경로 설정
data_path = './data'
saved_path = './code/saved'
output_path = './code/submission'

### 데이터 불러오기
- history_data : 시청 시작 데이터
- profile_data : 프로필 정보 
- meta_data : 콘텐츠 일반 메타 정보

In [3]:
# 데이터 불러오기 
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')
search_df = pd.read_csv(os.path.join(data_path, 'search_data.csv'), encoding='utf-8')
profile_df = pd.read_csv(os.path.join(data_path, 'profile_data.csv'), encoding='utf-8')
meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')
we_df = pd.read_csv(os.path.join(data_path, 'watch_e_data.csv'), encoding='utf-8')
buy_df = pd.read_csv(os.path.join(data_path, 'buy_data.csv'), encoding='utf-8')

## 앨범 관심 시청별 최다 키워드

In [4]:
### 키워드 별 인원을 나타내기위해 하나로 모으기
keyword1 = profile_df[['profile_id', 'pr_interest_keyword_cd_1', 'ch_interest_keyword_cd_1']]
keyword2 = profile_df[['profile_id', 'pr_interest_keyword_cd_2', 'ch_interest_keyword_cd_2']]
keyword3 = profile_df[['profile_id', 'pr_interest_keyword_cd_3', 'ch_interest_keyword_cd_3']]

In [5]:
### cbind하기 전 이름 통일
keyword1.columns = ["profile_id","pr_int","ch_int"]
keyword2.columns = ["profile_id","pr_int","ch_int"]
keyword3.columns = ["profile_id","pr_int","ch_int"]

In [6]:
### 위/아래로 합치기 - 행 기준
keyword = pd.concat([keyword1, keyword2, keyword3], axis = 0).dropna(axis=0)

In [7]:
keyword

Unnamed: 0,profile_id,pr_int,ch_int
0,3,P02,K01
1,5,P07,K05
2,7,P05,K06
3,12,P03,K09
4,16,P03,K01
...,...,...,...
8305,33019,P01,K09
8307,33023,P07,K05
8308,33026,P08,K06
8309,33027,P06,K05


In [7]:
new = we_df[['profile_id', 'album_id']].drop_duplicates(['profile_id', 'album_id'])

In [8]:
new

Unnamed: 0,profile_id,album_id
0,3,15
1,3,16
2,3,17
3,3,18
4,3,19
...,...,...
892777,33032,0
892778,33032,1725
892780,33032,3128
892782,33032,3625


In [9]:
new_meta = meta_df.drop_duplicates('album_id')[['album_id', 'genre_mid']]

In [10]:
new_k = pd.merge(new, new_meta, on = 'album_id', how = 'left')

In [11]:
new_k

Unnamed: 0,profile_id,album_id,genre_mid
0,3,15,노래율동
1,3,16,노래율동
2,3,17,노래율동
3,3,18,노래율동
4,3,19,노래율동
...,...,...,...
347977,33032,0,외국어
347978,33032,1725,노래율동
347979,33032,3128,외국어
347980,33032,3625,노래율동


In [12]:
kk = pd.merge(new_k, keyword, on = 'profile_id', how = 'inner')

In [13]:
kk

Unnamed: 0,profile_id,album_id,genre_mid,pr_int,ch_int
0,3,15,노래율동,P02,K01
1,3,15,노래율동,P04,K03
2,3,15,노래율동,P07,K04
3,3,16,노래율동,P02,K01
4,3,16,노래율동,P04,K03
...,...,...,...,...,...
845738,33032,3625,노래율동,P05,K05
845739,33032,3625,노래율동,P07,K09
845740,33032,375,놀이교실,P03,K01
845741,33032,375,놀이교실,P05,K05


In [25]:
k = pd.merge(new, keyword, on = 'profile_id', how = 'inner')

In [24]:
k

Unnamed: 0,profile_id,album_id,pr_int,ch_int
0,3,15,P02,K01
1,3,15,P04,K03
2,3,15,P07,K04
3,3,16,P02,K01
4,3,16,P04,K03
...,...,...,...,...
845738,33032,3625,P05,K05
845739,33032,3625,P07,K09
845740,33032,375,P03,K01
845741,33032,375,P05,K05


In [14]:
### 키워드 별 인원수 세기
kpcount = keyword.groupby('pr_int')['profile_id'].count()
kccount = keyword.groupby('ch_int')['profile_id'].count()

In [26]:
kpcount

pr_int
P01    2102
P02    3112
P03    2881
P04    1773
P05    1837
P06    3780
P07    3238
P08    1560
Name: profile_id, dtype: int64

In [15]:
### 부모, 자녀 키워드 앨범, 키워드별 각각 인원수 세기
kk2 = pd.DataFrame(kk.groupby(['genre_mid','pr_int']).count()).reset_index()
kk3 = pd.DataFrame(kk.groupby(['genre_mid','ch_int']).count()).reset_index()

In [16]:
kk3

Unnamed: 0,genre_mid,ch_int,profile_id,album_id,pr_int
0,MCN,K01,84,84,84
1,MCN,K02,159,159,159
2,MCN,K03,99,99,99
3,MCN,K04,111,111,111
4,MCN,K05,71,71,71
...,...,...,...,...,...
227,한자,K02,1,1,1
228,한자,K03,3,3,3
229,한자,K04,1,1,1
230,한자,K05,1,1,1


In [17]:
### 키워드를 열로 옮기기
par_with_genre_cnt = kk2.pivot(
    index = 'genre_mid',
    columns = 'pr_int',
    values = 'profile_id'
)

chi_with_genre_cnt = kk3.pivot(
    index = 'genre_mid',
    columns = 'ch_int',
    values = 'profile_id'
)

### 없는값은 0으로 채우기
### 앨범 별 키워드 인원 
par_with_genre_cnt = par_with_genre_cnt.reset_index().fillna(0)
chi_with_genre_cnt = chi_with_genre_cnt.reset_index().fillna(0)

In [19]:
kccount

ch_int
K01    4364
K02    2973
K03    3297
K04    2135
K05    1756
K06    1891
K07    1012
K08    1613
K09    1242
Name: profile_id, dtype: int64

In [20]:
chi_with_genre_cnt

ch_int,genre_mid,K01,K02,K03,K04,K05,K06,K07,K08,K09
0,MCN,84.0,159.0,99.0,111.0,71.0,70.0,39.0,45.0,67.0
1,TV만화,55067.0,35050.0,32134.0,17706.0,17600.0,19927.0,10403.0,22102.0,13631.0
2,극장판 애니,100.0,88.0,62.0,26.0,34.0,25.0,22.0,37.0,23.0
3,노래 율동,1025.0,750.0,454.0,244.0,291.0,350.0,138.0,397.0,265.0
4,노래율동,87269.0,48258.0,35601.0,13862.0,17212.0,22917.0,12043.0,24833.0,15132.0
5,놀이교실,14958.0,11608.0,10437.0,9151.0,7284.0,8233.0,3673.0,6679.0,5048.0
6,다큐멘터리,92.0,97.0,62.0,55.0,64.0,52.0,20.0,44.0,23.0
7,독서동화,602.0,438.0,452.0,269.0,165.0,248.0,163.0,156.0,142.0
8,드라마,143.0,110.0,122.0,112.0,50.0,73.0,91.0,26.0,43.0
9,수학과학,59.0,96.0,190.0,56.0,175.0,112.0,12.0,17.0,27.0


In [21]:
### 선택 인원수 별로 나누어줌
for i in range(8):
    par_with_genre_cnt.iloc[:,i+1] = par_with_genre_cnt.iloc[:,i+1]/kpcount[i]
    
for j in range(9):
    chi_with_genre_cnt.iloc[:,j+1] = chi_with_genre_cnt.iloc[:,j+1]/kccount[j]

In [22]:
### 다시 melt
par_with_genre_melt = par_with_genre_cnt.melt(
    id_vars = "genre_mid",
    value_vars = ['P01','P02','P03','P04','P05','P06','P07','P08'],
    value_name = 'viewpersel',
    var_name = "pr_int"
)

chi_with_genre_melt = chi_with_genre_cnt.melt(
    id_vars = "genre_mid",
    value_vars = ['K01','K02','K03','K04','K05','K06','K07','K08','K09'],
    value_name = 'viewpersel',
    var_name = "ch_int"
)

In [23]:
chi_with_genre_melt

Unnamed: 0,genre_mid,ch_int,viewpersel
0,MCN,K01,0.019248
1,TV만화,K01,12.618469
2,극장판 애니,K01,0.022915
3,노래 율동,K01,0.234876
4,노래율동,K01,19.997479
...,...,...,...
247,책,K09,7.291465
248,체육안전,K09,0.008857
249,코미디,K09,0.085346
250,한글,K09,0.026570


In [26]:
### 각 앨범별 최대값과 그에 해당하는 선택 키워드
par_with_genre_max = par_with_genre_melt.loc[par_with_genre_melt.groupby(['genre_mid'])['viewpersel'].idxmax()]
chi_with_genre_max = chi_with_genre_melt.loc[chi_with_genre_melt.groupby(['genre_mid'])['viewpersel'].idxmax()]

In [27]:
pr_genre_dict = par_with_genre_max[['genre_mid', 'pr_int']].set_index('genre_mid').to_dict()
ch_genre_dict = chi_with_genre_max[['genre_mid', 'ch_int']].set_index('genre_mid').to_dict()

In [28]:
ch_genre_dict

{'ch_int': {'MCN': 'K09',
  'TV만화': 'K08',
  '극장판 애니': 'K02',
  '노래 율동': 'K02',
  '노래율동': 'K01',
  '놀이교실': 'K06',
  '다큐멘터리': 'K05',
  '독서동화': 'K07',
  '드라마': 'K07',
  '수학과학': 'K05',
  '시리즈': 'K08',
  '시사교양': 'K06',
  '애니': 'K08',
  '액션/모험': 'K08',
  '어린이방송': 'K09',
  '역사/문화': 'K08',
  '연예오락': 'K04',
  '예능': 'K04',
  '예술': 'K05',
  '외국어': 'K06',
  '유아애니': 'K08',
  '육아정보': 'K04',
  '창의학습': 'K04',
  '책': 'K03',
  '체육안전': 'K09',
  '코미디': 'K02',
  '한글': 'K05',
  '한자': 'K07'}}

In [316]:
ch_genre_dict['ch_int']['TV만화']

'K08'

### 장르 결합

In [29]:
### 부모, 자녀 키워드 앨범, 키워드별 각각 인원수 세기
k2 = pd.DataFrame(k.groupby(['album_id','pr_int']).count()).reset_index()
k3 = pd.DataFrame(k.groupby(['album_id','ch_int']).count()).reset_index()

In [30]:
### 키워드를 열로 옮기기
par_cnt = k2.pivot(
    index = 'album_id',
    columns = 'pr_int',
    values = 'profile_id'
)

chi_cnt = k3.pivot(
    index = 'album_id',
    columns = 'ch_int',
    values = 'profile_id'
)

In [31]:
### 없는값은 0으로 채우기
### 앨범 별 키워드 인원 
par_cnt = par_cnt.reset_index().fillna(0)
chi_cnt = chi_cnt.reset_index().fillna(0)

In [32]:
### 선택 인원수 별로 나누어줌
for i in range(8):
    par_cnt.iloc[:,i+1] = par_cnt.iloc[:,i+1]/kpcount[i]
    
for j in range(9):
    chi_cnt.iloc[:,j+1] = chi_cnt.iloc[:,j+1]/kccount[j]

In [33]:
### 다시 melt
par_melt = par_cnt.melt(
    id_vars = "album_id",
    value_vars = ['P01','P02','P03','P04','P05','P06','P07','P08'],
    value_name = 'viewpersel',
    var_name = "pr_int"
)

chi_melt = chi_cnt.melt(
    id_vars = "album_id",
    value_vars = ['K01','K02','K03','K04','K05','K06','K07','K08','K09'],
    value_name = 'viewpersel',
    var_name = "ch_int"
)

In [34]:
### 각 앨범별 최대값과 그에 해당하는 선택 키워드
par_max = par_melt.loc[par_melt.groupby(['album_id'])['viewpersel'].idxmax()]
chi_max = chi_melt.loc[chi_melt.groupby(['album_id'])['viewpersel'].idxmax()]

## 장르와 융합

In [35]:
### 중장르 뽑기
genre = meta_df[['album_id', 'genre_mid', 'sub_title']]

In [36]:
genre = genre.drop_duplicates('album_id')

In [37]:
genre

Unnamed: 0,album_id,genre_mid,sub_title
0,749,TV만화,꼬마버스 타요1
1,750,TV만화,꼬마버스 타요1
2,2131,TV만화,꼬마버스 타요1
3,2625,TV만화,꼬마버스 타요1
4,2594,TV만화,꼬마버스 타요1
...,...,...,...
42596,39872,놀이교실,로티프렌즈 미술놀이
42597,39873,놀이교실,로티프렌즈 미술놀이
42598,39874,놀이교실,로티프렌즈 미술놀이
42599,4779,책,4-5세


In [38]:
### 부모, 자녀 각 키워드랑 융합
par_with_genre_fu = pd.merge(genre, par_with_genre_max[['genre_mid','pr_int']], on = 'genre_mid')
chi_with_genre_fu = pd.merge(genre, chi_with_genre_max[['genre_mid','ch_int']], on = 'genre_mid')

In [39]:
chi_with_genre_fu

Unnamed: 0,album_id,genre_mid,sub_title,ch_int
0,749,TV만화,꼬마버스 타요1,K08
1,750,TV만화,꼬마버스 타요1,K08
2,2131,TV만화,꼬마버스 타요1,K08
3,2625,TV만화,꼬마버스 타요1,K08
4,2594,TV만화,꼬마버스 타요1,K08
...,...,...,...,...
39850,6302,한글,EBS 초등 만점왕 - 국어 2-1,K05
39851,6303,한글,EBS 초등 만점왕 - 국어 2-1,K05
39852,6309,한글,EBS 초등 만점왕 - 국어 2-1,K05
39853,6310,한글,EBS 초등 만점왕 - 국어 2-1,K05


In [40]:
### 부모, 자녀 각 키워드랑 융합
par_fu = pd.merge(genre, par_max[['album_id','pr_int']],left_on='album_id', right_on='album_id', how='right')
chi_fu = pd.merge(genre, chi_max[['album_id','ch_int']],left_on='album_id', right_on='album_id', how='right')

In [41]:
chi_fu

Unnamed: 0,album_id,genre_mid,sub_title,ch_int
0,0,외국어,디즈니 프린세스,K06
1,1,외국어,베베 뮤직스토리,K06
2,2,외국어,베베 뮤직스토리,K06
3,3,외국어,픽사,K06
4,4,TV만화,출동! 슈퍼윙스1,K08
...,...,...,...,...
20356,25912,독서동화,주니토니 NEW 동화뮤지컬,K01
20357,25913,노래율동,퓨처북 공룡 동요2,K01
20358,25914,노래율동,퓨처북 공룡 동요2,K01
20359,25915,노래율동,퓨처북 공룡 동요2,K01


In [43]:
meta_sub_title = meta_df.sub_title.value_counts().reset_index()
meta_sub_title.columns = ['sub_title', 'count']

In [44]:
meta_genre = meta_df.drop_duplicates('album_id')[['album_id', 'genre_mid']]

In [45]:
meta_genre

Unnamed: 0,album_id,genre_mid
0,749,TV만화
1,750,TV만화
2,2131,TV만화
3,2625,TV만화
4,2594,TV만화
...,...,...
42596,39872,놀이교실
42597,39873,놀이교실
42598,39874,놀이교실
42599,4779,책


In [48]:
meta_genre_with_keyowrd = pd.merge(meta_genre, chi_with_genre_max[['genre_mid','ch_int']], on = 'genre_mid', how = 'left')

In [49]:
meta_genre_with_keyowrd

Unnamed: 0,album_id,genre_mid,ch_int
0,749,TV만화,K08
1,750,TV만화,K08
2,2131,TV만화,K08
3,2625,TV만화,K08
4,2594,TV만화,K08
...,...,...,...
39870,39872,놀이교실,K06
39871,39873,놀이교실,K06
39872,39874,놀이교실,K06
39873,4779,책,K03


In [50]:
h = meta_df.drop_duplicates('album_id')[['album_id', 'sub_title']]

In [51]:
h

Unnamed: 0,album_id,sub_title
0,749,꼬마버스 타요1
1,750,꼬마버스 타요1
2,2131,꼬마버스 타요1
3,2625,꼬마버스 타요1
4,2594,꼬마버스 타요1
...,...,...
42596,39872,로티프렌즈 미술놀이
42597,39873,로티프렌즈 미술놀이
42598,39874,로티프렌즈 미술놀이
42599,4779,4-5세


In [53]:
new_chi_fu = pd.merge(h, chi_max[['album_id','ch_int']],left_on='album_id', right_on='album_id', how='left')

In [54]:
new_chi_fu

Unnamed: 0,album_id,sub_title,ch_int
0,749,꼬마버스 타요1,K08
1,750,꼬마버스 타요1,K08
2,2131,꼬마버스 타요1,K08
3,2625,꼬마버스 타요1,K08
4,2594,꼬마버스 타요1,K08
...,...,...,...
39870,39872,로티프렌즈 미술놀이,
39871,39873,로티프렌즈 미술놀이,
39872,39874,로티프렌즈 미술놀이,
39873,4779,4-5세,


In [55]:
ho = new_chi_fu[['sub_title', 'ch_int']].value_counts().reset_index().sort_values('sub_title')

In [56]:
meta_subtitle_key_dict = dict()

temp = 'a'
MAX = 0
for i in ho.itertuples():
    if temp == i[1]:
        MAX = max(MAX, i[3])
        if MAX == i[3]:
            meta_subtitle_key_dict[i[1]] = i[2]
    else:
        temp = i[1]
        meta_subtitle_key_dict[i[1]] = i[2]
        

In [57]:
meta_subtitle_key_dict

{'100분! 뽀요 인기 메들리': 'K09',
 '10월 세계 여러나라': 'K05',
 '11월 지구와 우주': 'K05',
 '12월 겨울': 'K06',
 '1월 생활도구': 'K04',
 '2월 자연과 동물': 'K05',
 '3세이하': 'K03',
 '3월 새로움이 많아요': 'K06',
 '4-5세': 'K03',
 '4남매쇼': 'K09',
 '4월 봄': 'K06',
 '5가지 뽀롱뽀롱 꿈 상담소': 'K09',
 '5분 종이접기 한반도의 공룡편': 'K08',
 '5월 몸짱 마음짱': 'K06',
 '6-7세': 'K06',
 '60분 뽀요 율동 메들리': 'K01',
 '60초툰': 'K04',
 '6월 우리동네': 'K07',
 '7월 여름과 건강': 'K06',
 '8-9세': 'K06',
 '8월 교통기관': 'K06',
 '9월 우리나라': 'K06',
 'A Day with Little Monkey': 'K06',
 'A New Baby Is Coming': 'K05',
 'A Picnic Day': 'K06',
 'A Super Halloween': 'K09',
 'A Wish For Whales': 'K05',
 'ABC Song': 'K06',
 'Alligator Eats Candies': 'K06',
 'Animals at the Zoo': 'K06',
 'A~Z까지 알파벳 배우기': 'K06',
 'Baby Bear`s New Friend': 'K06',
 'Bath Time Fun': 'K06',
 'Ben Bakes Cakes': 'K09',
 'Ben Is Too Big': 'K06',
 'Big Big Big Bread': 'K06',
 'Big Bob Small Sam': 'K06',
 'Brown Bear`s Birthday': 'K07',
 'Butterflies Fly': 'K07',
 'Cat and Rat Play Together': 'K06',
 'Collin Goes to a Toilet': 'K0

In [63]:
new_chi_try = new_chi_fu.fillna('nan')
new_chi_try

Unnamed: 0,album_id,sub_title,ch_int
0,749,꼬마버스 타요1,K08
1,750,꼬마버스 타요1,K08
2,2131,꼬마버스 타요1,K08
3,2625,꼬마버스 타요1,K08
4,2594,꼬마버스 타요1,K08
...,...,...,...
39870,39872,로티프렌즈 미술놀이,
39871,39873,로티프렌즈 미술놀이,
39872,39874,로티프렌즈 미술놀이,
39873,4779,4-5세,


In [60]:
new_chi_try.loc[new_chi_try['ch_int'] == 'nan']
# 현재 결측 19514

Unnamed: 0,album_id,sub_title,ch_int
26,26077,변신자동차 또봇2,
27,26078,변신자동차 또봇2,
28,26079,변신자동차 또봇2,
30,26080,변신자동차 또봇1,
31,13771,변신자동차 또봇1,
...,...,...,...
39870,39872,로티프렌즈 미술놀이,
39871,39873,로티프렌즈 미술놀이,
39872,39874,로티프렌즈 미술놀이,
39873,4779,4-5세,


In [64]:
cnt = 0
for i in new_chi_try.itertuples():
    if i[3] == 'nan':
        if i[2] in meta_subtitle_key_dict.keys():
            new_chi_try.loc[i[0], 'ch_int'] = meta_subtitle_key_dict[i[2]]
        else:
            cnt += 1

In [65]:
new_chi_try

Unnamed: 0,album_id,sub_title,ch_int
0,749,꼬마버스 타요1,K08
1,750,꼬마버스 타요1,K08
2,2131,꼬마버스 타요1,K08
3,2625,꼬마버스 타요1,K08
4,2594,꼬마버스 타요1,K08
...,...,...,...
39870,39872,로티프렌즈 미술놀이,
39871,39873,로티프렌즈 미술놀이,
39872,39874,로티프렌즈 미술놀이,
39873,4779,4-5세,K03


In [66]:
cnt

4747

In [357]:
meta_genre_with_keyowrd

Unnamed: 0,album_id,genre_mid,pr_int
0,749,TV만화,P04
1,750,TV만화,P04
2,2131,TV만화,P04
3,2625,TV만화,P04
4,2594,TV만화,P04
...,...,...,...
39850,6302,한글,P08
39851,6303,한글,P08
39852,6309,한글,P08
39853,6310,한글,P08


In [67]:
g_chi_try = pd.merge(new_chi_try, meta_genre_with_keyowrd[['album_id', 'genre_mid']], on = 'album_id', how = 'left')
g_chi_try

Unnamed: 0,album_id,sub_title,ch_int,genre_mid
0,749,꼬마버스 타요1,K08,TV만화
1,750,꼬마버스 타요1,K08,TV만화
2,2131,꼬마버스 타요1,K08,TV만화
3,2625,꼬마버스 타요1,K08,TV만화
4,2594,꼬마버스 타요1,K08,TV만화
...,...,...,...,...
39870,39872,로티프렌즈 미술놀이,,놀이교실
39871,39873,로티프렌즈 미술놀이,,놀이교실
39872,39874,로티프렌즈 미술놀이,,놀이교실
39873,4779,4-5세,K03,책


In [68]:
for i in g_chi_try[0:2].itertuples():
    print(i)

Pandas(Index=0, album_id=749, sub_title='꼬마버스 타요1', ch_int='K08', genre_mid='TV만화')
Pandas(Index=1, album_id=750, sub_title='꼬마버스 타요1', ch_int='K08', genre_mid='TV만화')


In [70]:
genre_cnt = 0
execption = 0
for i in g_chi_try.itertuples():
    if i[3] == 'nan':
        genre_cnt += 1
        if i[4] in ch_genre_dict['ch_int'].keys():
            g_chi_try.loc[i[0], 'ch_int'] = ch_genre_dict['ch_int'][i[4]]
        else:
            g_chi_try.loc[i[0], 'ch_int'] = 'K01'
            print(i)
            execption += 1

Pandas(Index=14296, album_id=32755, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14297, album_id=32756, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14298, album_id=32757, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14299, album_id=32758, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14300, album_id=32759, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14301, album_id=32760, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14302, album_id=32761, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14303, album_id=32762, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14304, album_id=32763, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14305, album_id=32764, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14306, album_id=32765, sub_title='교육상식', ch_int='nan', genre_mid='휴먼/감동')
Pandas(Index=14307, album_id=32766, sub_tit

In [71]:
ch_key_max = g_chi_try[['album_id', 'ch_int']]

In [80]:
ch_key_max['ch_int'].value_counts()

K08    7825
K06    7470
K09    4756
K07    4643
K05    4005
K02    3641
K01    3408
K04    3184
K03     943
Name: ch_int, dtype: int64

In [81]:
import pickle
with open('ch_key_for_all_item.pickle', 'wb') as f:
    pickle.dump(ch_key_max, f)

In [74]:
genre_cnt

4747

In [77]:
ch_genre_dict['ch_int'].keys()

dict_keys(['MCN', 'TV만화', '극장판 애니', '노래 율동', '노래율동', '놀이교실', '다큐멘터리', '독서동화', '드라마', '수학과학', '시리즈', '시사교양', '애니', '액션/모험', '어린이방송', '역사/문화', '연예오락', '예능', '예술', '외국어', '유아애니', '육아정보', '창의학습', '책', '체육안전', '코미디', '한글', '한자'])

In [410]:
meta_df.genre_mid.value_counts()
# 29개

놀이교실      11084
TV만화      10873
노래율동       7555
외국어        3342
애니         2110
예능         1719
책          1352
창의학습        799
액션/모험       701
유아애니        537
드라마         455
시리즈         390
독서동화        350
육아정보        308
시사교양        212
노래 율동       166
MCN          93
코미디          80
어린이방송        79
다큐멘터리        72
연예오락         62
수학과학         60
한글           60
역사/문화        38
한자           38
예술           21
휴먼/감동        20
극장판 애니       13
체육안전         13
Name: genre_mid, dtype: int64

In [75]:
execption

19

In [369]:
genre_cnt

4728

In [73]:
g_chi_try.loc[g_chi_try['ch_int'] == 'nan']

Unnamed: 0,album_id,sub_title,ch_int,genre_mid
