In [19]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

In [2]:
# Hyperparameters
class cfg:
    """Notebook-wide settings: reproducibility, split, sampling and device."""

    # Reproducibility / data split
    seed = 42
    test_size = 0.2

    # Tunable values (presumably recommendation cut-off and negatives per
    # positive sample -- TODO confirm against the cells that consume them)
    top_k = 25
    neg_ratio = 100

    # Use the indexed CUDA device when one is available, otherwise the CPU.
    gpu_idx = 0
    device = torch.device(f"cuda:{gpu_idx}" if torch.cuda.is_available() else "cpu")

In [3]:
# Fix random seeds
def seed_everything(random_seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        random_seed: Integer seed applied to every generator.

    Returns:
        None.
    """
    # Python and NumPy global generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch: CPU generator, current CUDA device, then all CUDA devices.
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG once, using the seed configured on cfg.
seed_everything(cfg.seed)

In [5]:
# Path configuration (all relative to the notebook's working directory)
data_path = './data'  # directory the CSV inputs are read from
saved_path = './code/saved'  # NOTE(review): not used in the visible cells -- presumably saved artifacts
output_path = './code/submission'  # NOTE(review): not used in the visible cells -- presumably submission output

In [6]:
def _read_table(file_name):
    """Read one UTF-8 encoded CSV located under ``data_path``."""
    return pd.read_csv(os.path.join(data_path, file_name), encoding='utf-8')


# Raw input tables, one DataFrame per CSV file.
history_df = _read_table('history_data.csv')
profile_df = _read_table('profile_data.csv')
meta_df = _read_table('meta_data.csv')
watch_e_df = _read_table('watch_e_data.csv')
search_df = _read_table('search_data.csv')
buy_df = _read_table('buy_data.csv')
meta_plus_df = _read_table('meta_data_plus.csv')

In [54]:
# Keep only the album key and its run time from the metadata table.
meta_df_new = meta_df.loc[:, ['album_id', 'run_time']]

In [66]:
# meta_df holds several rows per album (e.g. one per sub_title), so collapse
# identical (album_id, run_time) pairs, keeping the first occurrence.
meta_df_new = meta_df_new.drop_duplicates(keep='first')

In [67]:
# Display the (album_id, run_time) lookup table.
meta_df_new

Unnamed: 0,album_id,run_time
0,749,660
1,750,660
2,2131,660
3,2625,660
4,2594,660
...,...,...
42596,39872,443
42597,39873,477
42598,39874,466
42599,4779,293


In [63]:
# Count how many rows each album_id contributes to the lookup table.
meta_df_new['album_id'].value_counts()

8231     4
4083     4
4932     4
5314     4
3843     4
        ..
16963    1
32433    1
32434    1
32435    1
11629    1
Name: album_id, Length: 39875, dtype: int64

In [64]:
# Inspect every metadata row of one album that appears multiple times.
meta_df[meta_df['album_id'].eq(8231)]

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
13526,8231,알록달록 물고기,노래율동,키즈,노래율동,,한국,64,,,,,,,,
13527,8231,알록달록 물고기,만 3세를 위한 영상,키즈,노래율동,,한국,64,,,,,,,,
13528,8231,알록달록 물고기,만 4세를 위한 영상,키즈,노래율동,,한국,64,,,,,,,,
13529,8231,알록달록 물고기,트니트니 율동 바다스페셜,키즈,노래율동,,한국,64,,,,,,,,


In [68]:
# Attach run_time to every watch event; the left join keeps all watch rows.
# NOTE(review): assumes album_id is unique in meta_df_new -- an album with
# several distinct run_time values would duplicate its watch rows here.
meta_watch_merge = watch_e_df.merge(meta_df_new, how='left', on='album_id')

In [69]:
# Display the watch events with run_time joined on.
meta_watch_merge

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time
0,3,20220301115653,20220301115805,MKID049,15,46,46,1,46
1,3,20220301115653,20220301115954,MKID049,16,104,105,1,105
2,3,20220301115653,20220301120114,MKID049,17,76,76,1,76
3,3,20220301115653,20220301120226,MKID049,18,67,68,1,68
4,3,20220301115653,20220301120400,MKID049,19,90,90,0,90
...,...,...,...,...,...,...,...,...,...
892789,33032,20220427155091,20220427155653,MKID049,381,463,464,1,462
892790,33032,20220427155091,20220427155694,MKID049,381,462,463,1,462
892791,33032,20220427155839,20220427155826,MKID049,125,10,520,0,520
892792,33032,20220427155706,20220427155836,MKID049,125,6,521,0,520


In [73]:
# Order the events by profile, then by album.
meta_watch_merge = meta_watch_merge.sort_values(['profile_id', 'album_id'])

In [74]:
# Fraction of the album's run time that was watched in each event.
# A missing run_time produces NaN here (and a zero run_time a non-finite value).
meta_watch_merge['watch_ratio'] = meta_watch_merge['watch_time'].div(meta_watch_merge['run_time'])

In [78]:
# Events whose watch time exceeds the album's run time (ratio above 1).
meta_watch_merge1 = meta_watch_merge[meta_watch_merge['watch_ratio'].gt(1)]

In [77]:
# Events whose ratio is at most 1. Rows with a NaN ratio satisfy neither this
# comparison nor the "> 1" one, so they end up in neither subset.
meta_watch_merge2 = meta_watch_merge[meta_watch_merge['watch_ratio'].le(1)]

In [81]:
# Cap every ratio above 1 at exactly 1.0.
# assign() builds a new frame instead of writing into the filtered slice;
# the original in-place column write is the pattern pandas flags with
# SettingWithCopyWarning (hidden here by warnings.filterwarnings('ignore')).
meta_watch_merge1 = meta_watch_merge1.assign(watch_ratio=1.0)

In [87]:
# Display the events whose ratio exceeded 1 (watch_ratio now set to 1.0).
meta_watch_merge1

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time,watch_ratio
333,5,20220326133022,20220326133135,MKID049,15,47,47,1,46,1.0
30,5,20220306110942,20220306111509,MKID049,74,301,301,1,300,1.0
101,5,20220309185728,20220309191241,MKID049,130,173,173,1,172,1.0
128,5,20220310180411,20220311082033,MKID049,130,173,173,1,172,1.0
279,5,20220322200537,20220322202833,MKID049,130,173,173,1,172,1.0
...,...,...,...,...,...,...,...,...,...,...
892670,33032,20220426134684,20220426135076,MKID049,2200,125,127,1,99,1.0
892677,33032,20220426134684,20220426135271,MKID049,2297,360,362,1,121,1.0
892718,33032,20220427110951,20220427111624,MKID049,2573,724,725,1,720,1.0
892661,33032,20220426131390,20220426131303,MKID049,7105,360,362,1,321,1.0


In [88]:
# Display the events whose ratio was already at most 1.
meta_watch_merge2

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time,watch_ratio
0,3,20220301115653,20220301115805,MKID049,15,46,46,1,46,1.000000
1,3,20220301115653,20220301115954,MKID049,16,104,105,1,105,0.990476
2,3,20220301115653,20220301120114,MKID049,17,76,76,1,76,1.000000
3,3,20220301115653,20220301120226,MKID049,18,67,68,1,68,0.985294
4,3,20220301115653,20220301120400,MKID049,19,90,90,0,90,1.000000
...,...,...,...,...,...,...,...,...,...,...
892748,33032,20220427144943,20220427144974,MKID049,8838,302,303,1,303,0.996700
892655,33032,20220426115550,20220426130515,MKID049,8910,300,301,1,300,1.000000
892648,33032,20220426114717,20220426114966,MKID049,15663,9,1417,0,1419,0.006342
892770,33032,20220427150528,20220427150920,MKID049,18334,64,65,1,75,0.853333


In [91]:
# Stack the capped (> 1) events on top of the untouched (<= 1) events.
merge_tot = pd.concat((meta_watch_merge1, meta_watch_merge2), axis=0)

In [94]:
# Number of distinct albums that remain in the combined events.
merge_tot['album_id'].nunique()

20361

In [95]:
# View the combined events ordered by profile, album and ascending ratio.
merge_tot.sort_values(['profile_id', 'album_id', 'watch_ratio'])

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time,watch_ratio
0,3,20220301115653,20220301115805,MKID049,15,46,46,1,46,1.000000
1,3,20220301115653,20220301115954,MKID049,16,104,105,1,105,0.990476
2,3,20220301115653,20220301120114,MKID049,17,76,76,1,76,1.000000
3,3,20220301115653,20220301120226,MKID049,18,67,68,1,68,0.985294
4,3,20220301115653,20220301120400,MKID049,19,90,90,0,90,1.000000
...,...,...,...,...,...,...,...,...,...,...
892746,33032,20220427144924,20220427144918,MKID049,8838,303,304,1,303,1.000000
892655,33032,20220426115550,20220426130515,MKID049,8910,300,301,1,300,1.000000
892648,33032,20220426114717,20220426114966,MKID049,15663,9,1417,0,1419,0.006342
892771,33032,20220427150528,20220427150995,MKID049,18334,1,74,0,75,0.013333


In [100]:
# Sum the capped watch ratios per (profile_id, album_id) pair.
# The column is selected explicitly before aggregating: the previous
# `.sum('watch_ratio')` did not name a column -- the string was bound
# positionally to sum()'s first parameter (numeric_only).
tb1 = (
    merge_tot
    .groupby(['profile_id', 'album_id'])['watch_ratio']
    .sum()
    .reset_index()
)

In [102]:
# Pairs whose summed watch ratio stays below 0.2.
tb1[tb1['watch_ratio'].lt(0.2)]

Unnamed: 0,profile_id,album_id,watch_ratio
36,5,66,0.043584
66,5,96,0.000000
71,5,101,0.065217
108,5,138,0.018182
120,5,150,0.194805
...,...,...,...
347971,33032,4944,0.000000
347973,33032,5256,0.000000
347974,33032,6577,0.034319
347976,33032,7010,0.000000


In [116]:
# Rank the summed ratios from largest to smallest to expose extreme totals.
tb1['watch_ratio'].sort_values(ascending=False)

38990     1911.375000
38972      942.426966
38983      867.219780
38989      858.967391
38971      684.343137
             ...     
305132       0.000000
251281       0.000000
251285       0.000000
47380        0.000000
101333       0.000000
Name: watch_ratio, Length: 347982, dtype: float64

In [109]:
# Locate the (profile, album) pair behind the largest summed ratio.
tb1[tb1['watch_ratio'].gt(1900)]

Unnamed: 0,profile_id,album_id,watch_ratio
38990,2794,3502,1911.375


In [110]:
# Drill into the individual watch events behind that extreme pair.
merge_tot.query('profile_id == 2794 and album_id == 3502')

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time,watch_ratio
98959,2794,20220301000888,20220301001043,MKID049,3502,91,93,1,80,1.0
98960,2794,20220301000888,20220301001043,MKID049,3502,91,93,1,80,1.0
98961,2794,20220301000888,20220301001043,MKID049,3502,91,93,1,80,1.0
98962,2794,20220301000888,20220301001043,MKID049,3502,91,93,1,80,1.0
98979,2794,20220301000888,20220301002071,MKID049,3502,91,91,1,80,1.0
...,...,...,...,...,...,...,...,...,...,...
111390,2794,20220422131982,20220422134467,MKID049,3502,80,80,1,80,1.0
111391,2794,20220422131982,20220422134467,MKID049,3502,80,80,1,80,1.0
111404,2794,20220422131982,20220422135204,MKID049,3502,80,80,1,80,1.0
111405,2794,20220422131982,20220422135204,MKID049,3502,80,80,1,80,1.0


In [117]:
# Display the summed watch_ratio per (profile_id, album_id).
tb1

Unnamed: 0,profile_id,album_id,watch_ratio
0,3,15,1.000000
1,3,16,0.990476
2,3,17,1.000000
3,3,18,0.985294
4,3,19,1.000000
...,...,...,...
347977,33032,7105,5.404984
347978,33032,8838,1.996700
347979,33032,8910,1.000000
347980,33032,15663,0.006342


In [120]:
# Distinct (album_id, genre_mid) pairs taken from the metadata table.
album_genre_cols = ['album_id', 'genre_mid']
meta_genre_mid = meta_df.loc[:, album_genre_cols].drop_duplicates()

In [122]:
# Preview the event-level rows with each album's genre_mid attached.
merge_tot.merge(meta_genre_mid, how='left', on='album_id')

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play,run_time,watch_ratio,genre_mid
0,5,20220326133022,20220326133135,MKID049,15,47,47,1,46,1.000000,노래율동
1,5,20220306110942,20220306111509,MKID049,74,301,301,1,300,1.000000,TV만화
2,5,20220309185728,20220309191241,MKID049,130,173,173,1,172,1.000000,노래율동
3,5,20220310180411,20220311082033,MKID049,130,173,173,1,172,1.000000,노래율동
4,5,20220322200537,20220322202833,MKID049,130,173,173,1,172,1.000000,노래율동
...,...,...,...,...,...,...,...,...,...,...,...
892789,33032,20220427144943,20220427144974,MKID049,8838,302,303,1,303,0.996700,외국어
892790,33032,20220426115550,20220426130515,MKID049,8910,300,301,1,300,1.000000,TV만화
892791,33032,20220426114717,20220426114966,MKID049,15663,9,1417,0,1419,0.006342,외국어
892792,33032,20220427150528,20220427150920,MKID049,18334,64,65,1,75,0.853333,외국어


In [124]:
# Attach genre_mid to the per-(profile, album) totals; left join keeps all of tb1.
tb2 = tb1.merge(meta_genre_mid, how='left', on='album_id')

In [145]:
# Display the per-(profile, album) totals together with genre_mid.
tb2

Unnamed: 0,profile_id,album_id,watch_ratio,genre_mid
0,3,15,1.000000,노래율동
1,3,16,0.990476,노래율동
2,3,17,1.000000,노래율동
3,3,18,0.985294,노래율동
4,3,19,1.000000,노래율동
...,...,...,...,...
347977,33032,7105,5.404984,책
347978,33032,8838,1.996700,외국어
347979,33032,8910,1.000000,TV만화
347980,33032,15663,0.006342,외국어


In [128]:
# Number of distinct albums per profile.
# The album_id column is selected explicitly: the previous
# `.nunique('alnum_id')` passed a misspelled column name that was bound
# positionally to nunique()'s `dropna` flag rather than choosing a column.
tb3 = (
    tb2
    .groupby('profile_id')['album_id']
    .nunique()
    .reset_index()
)

In [141]:
# Name the per-profile distinct-album count column explicitly.
# Rebinding the result (instead of inplace=True) avoids mutating the frame
# produced by the previous cell.
tb3 = tb3.rename(columns={'album_id': 'album_id_cnt'})

In [143]:
# Add each profile's distinct-album count to the per-(profile, album) totals.
tb4 = tb1.merge(tb3, how='left', on='profile_id')

In [144]:
# View the totals ordered by profile and ascending summed ratio.
tb4.sort_values(['profile_id', 'watch_ratio'])

Unnamed: 0,profile_id,album_id,watch_ratio,album_id_cnt
13,3,28,0.975207,18
7,3,22,0.983471,18
9,3,24,0.983471,18
10,3,25,0.983471,18
11,3,26,0.983471,18
...,...,...,...,...
347927,33032,350,3.956944,66
347917,33032,65,4.966418,66
347928,33032,373,5.360000,66
347977,33032,7105,5.404984,66
