In [1]:
# 패키지 로드
import warnings
# NOTE(review): this silences every warning category for the whole session
# (pandas chained-assignment warnings included) — consider narrowing the filter.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

import pickle

In [2]:
# Path configuration
# data_path is the directory the raw CSV files are read from.
data_path = './data'
# NOTE(review): saved_path and output_path are not referenced in the visible
# part of this notebook — presumably used elsewhere; confirm.
saved_path = './code/saved'
output_path = './code/submission'

In [3]:
# Load data: read the six raw CSV tables found under data_path.
def _read_table(file_name):
    """Read one CSV file located under ``data_path`` as UTF-8 into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name), encoding='utf-8')


history_df = _read_table('history_data.csv')
search_df = _read_table('search_data.csv')
profile_df = _read_table('profile_data.csv')
meta_df = _read_table('meta_data.csv')
we_df = _read_table('watch_e_data.csv')
buy_df = _read_table('buy_data.csv')

## Build the user behavior data

In [4]:
### history selection: take a subset of the history columns
history_cols = ['profile_id', 'ss_id', 'album_id', 'payment', 'short_trailer']
hsel = history_df.loc[:, history_cols]

### hsel without duplicates: keep the first row per (profile_id, album_id, ss_id)
history_dedup_keys = ['profile_id', 'album_id', 'ss_id']
hsel2 = hsel.drop_duplicates(subset=history_dedup_keys)

In [5]:
# Display the de-duplicated history selection.
hsel2

Unnamed: 0,profile_id,ss_id,album_id,payment,short_trailer
0,3,20220301115653,15,,N
1,3,20220301115653,16,,N
2,3,20220301115653,17,,N
3,3,20220301115653,18,,N
4,3,20220301115653,19,,N
...,...,...,...,...,...
1005641,33032,20220427155091,381,,N
1005642,33032,20220427155091,375,,N
1005648,33032,20220427155839,125,,N
1005649,33032,20220427155706,125,,N


In [6]:
### watch_e selection: take a subset of the watch_e columns
watch_cols = ['profile_id', 'ss_id', 'album_id', 'watch_time']
wsel = we_df.loc[:, watch_cols]

### wsel without duplicates: keep the first row per (profile_id, album_id, ss_id)
watch_dedup_keys = ['profile_id', 'album_id', 'ss_id']
wsel2 = wsel.drop_duplicates(subset=watch_dedup_keys)

In [9]:
# Display the de-duplicated watch_e selection.
wsel2

Unnamed: 0,profile_id,ss_id,album_id,watch_time
0,3,20220301115653,15,46
1,3,20220301115653,16,104
2,3,20220301115653,17,76
3,3,20220301115653,18,67
4,3,20220301115653,19,90
...,...,...,...,...
892784,33032,20220427155091,381,462
892786,33032,20220427155091,125,6
892791,33032,20220427155839,125,10
892792,33032,20220427155706,125,6


In [7]:
### meta selection: album id together with its run time
msel = meta_df.loc[:, ['album_id', 'run_time']]

### msel without duplicates
# msel holds exactly the two key columns, so dropping fully duplicated rows
# is the same as de-duplicating on (album_id, run_time).
# NOTE(review): an album_id that appears with more than one distinct run_time
# keeps one row per run_time here — confirm that cannot happen in meta_data.
msel2 = msel.drop_duplicates()

In [8]:
### merge data
# Inner join: keep only history rows that have a watch record with the same
# profile_id, ss_id and album_id.
session_album_keys = ['profile_id', 'ss_id', 'album_id']
hw = hsel2.merge(wsel2, on=session_album_keys, how='inner')

# Left join: attach run_time to every row by album_id.
# NOTE(review): if msel2 contains several rows for one album_id, this join
# repeats the matching hw rows — confirm album_id is unique in msel2.
hwm = hw.merge(msel2, on='album_id', how='left')

In [9]:
### get rid of trailers: keep only rows whose short_trailer value is 'N'
is_not_trailer = hwm['short_trailer'].eq('N')
data = hwm[is_not_trailer]

In [10]:
### select the needed variables: every column except ss_id and short_trailer
# Index.difference returns the remaining labels in sorted order, which is why
# the columns of usedata come out alphabetically.
kept_columns = data.columns.difference(['ss_id', 'short_trailer'])
usedata = data.loc[:, kept_columns]

In [20]:
# Display the working frame (trailers removed, ss_id/short_trailer dropped).
usedata

Unnamed: 0,album_id,payment,profile_id,run_time,watch_time
0,15,,3,46,46
1,16,,3,105,104
2,17,,3,76,76
3,18,,3,68,67
4,19,,3,90,90
...,...,...,...,...,...
586962,125,,33032,520,6
586963,381,,33032,462,462
586964,375,,33032,659,658
586965,125,,33032,520,10


In [62]:
# Inspect every raw watch_e record of profile 33032 for album 125.
is_profile = we_df['profile_id'].eq(33032)
is_album = we_df['album_id'].eq(125)
we_df[is_profile & is_album]

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play
892632,33032,20220426104174,20220426104952,MKID049,125,4,521,0
892738,33032,20220427112116,20220427144109,MKID049,125,520,520,1
892739,33032,20220427112116,20220427144172,MKID049,125,520,520,1
892742,33032,20220427144391,20220427144395,MKID049,125,521,522,1
892756,33032,20220427145664,20220427145674,MKID049,125,0,521,0
892761,33032,20220427150199,20220427150145,MKID049,125,3,521,0
892786,33032,20220427155091,20220427155603,MKID049,125,6,520,0
892791,33032,20220427155839,20220427155826,MKID049,125,10,520,0
892792,33032,20220427155706,20220427155836,MKID049,125,6,521,0


In [60]:
# Rows where the recorded watch_time is larger than the album's run_time.
overrun_mask = usedata['watch_time'].gt(usedata['run_time'])
usedata[overrun_mask]

Unnamed: 0,album_id,payment,profile_id,run_time,watch_time
27,74,,5,300,301
91,130,,5,172,173
115,130,,5,172,173
222,130,,5,172,173
258,15,,5,46,47
...,...,...,...,...,...
586765,4109,,32978,95,96
586830,7105,,32998,321,347
586899,126,,33027,312,381
586914,7105,,33032,321,360


In [64]:
# watch_run_diff = watch_time - run_time for each row
# (positive values mean the watch_time exceeds the run_time).
# assign() builds a new frame instead of writing a column into `usedata` in
# place: `usedata` was derived from a boolean-filtered frame, and the global
# warnings filter at the top of the notebook would hide any chained-assignment
# warning. Rebinding the name keeps downstream cells unchanged.
usedata = usedata.assign(
    watch_run_diff=usedata['watch_time'] - usedata['run_time']
)
usedata['watch_run_diff']

0           0
1          -1
2           0
3          -1
4           0
         ... 
586962   -514
586963      0
586964     -1
586965   -510
586966   -514
Name: watch_run_diff, Length: 565395, dtype: int64

In [65]:
# Summary statistics of the watch_time - run_time difference.
usedata.loc[:, 'watch_run_diff'].describe()

count    565395.000000
mean       -134.834700
std         282.097764
min       -7157.000000
25%        -159.000000
50%          -2.000000
75%          -1.000000
max        2785.000000
Name: watch_run_diff, dtype: float64

In [76]:
# Largest watch_time - run_time difference in the data.
usedata['watch_run_diff'].max()

2785

In [77]:
# Show the row(s) holding the largest watch_time - run_time difference.
# The maximum is computed from the data instead of hard-coding a value copied
# from an earlier output, so the cell stays correct if the input data changes.
max_watch_run_diff = usedata['watch_run_diff'].max()
usedata.loc[usedata['watch_run_diff'] == max_watch_run_diff]

Unnamed: 0,album_id,payment,profile_id,run_time,watch_time,watch_run_diff
353563,1448,,15928,73,2858,2785


In [82]:
# Summary statistics of the difference, restricted to rows where
# watch_time exceeds run_time (difference > 0).
usedata.loc[usedata['watch_run_diff'].gt(0), 'watch_run_diff'].describe()

count    32093.000000
mean        34.840027
std         84.836038
min          1.000000
25%          1.000000
50%          5.000000
75%         29.000000
max       2785.000000
Name: watch_run_diff, dtype: float64

In [40]:
# One (album_id, run_time) row per album, taken from the album's first
# occurrence in usedata.
runtime = usedata[['album_id', 'run_time']].drop_duplicates(subset='album_id')

In [36]:
# Average watch_time per album, as a two-column frame
# (album_id, mean_watchtime).
mean_watch = (
    usedata.groupby('album_id')['watch_time']
    .mean()
    .reset_index(name='mean_watchtime')
)
mean_watch

Unnamed: 0,album_id,mean_watchtime
0,0,43.737393
1,1,39.521472
2,2,52.825175
3,3,76.064935
4,4,445.456044
...,...,...
19349,25876,328.000000
19350,25893,3.000000
19351,25894,95.000000
19352,25895,11.000000


In [59]:
# Summary statistics of the per-album mean watch time.
mean_watch['mean_watchtime'].describe()

count    19354.000000
mean       252.976011
std        219.158556
min          0.000000
25%         85.054180
50%        187.500000
75%        367.239271
max       2669.000000
Name: mean_watchtime, dtype: float64

In [46]:
# Total watch_time per album, as a two-column frame (album_id, sum_watchtime).
# NOTE(review): sum_watch is not used by any later cell in the visible part
# of this notebook.
sum_watch = (
    usedata.groupby('album_id')['watch_time']
    .sum()
    .reset_index(name='sum_watchtime')
)
sum_watch

Unnamed: 0,album_id,sum_watchtime
0,0,45968
1,1,6442
2,2,7554
3,3,11714
4,4,81073
...,...,...
19349,25876,328
19350,25893,3
19351,25894,95
19352,25895,11


In [53]:
# Attach each album's run_time to its mean watch time
# (inner join on album_id, the default join type).
album_watch = mean_watch.merge(runtime, on='album_id', how='inner')
album_watch

Unnamed: 0,album_id,mean_watchtime,run_time
0,0,43.737393,136
1,1,39.521472,63
2,2,52.825175,96
3,3,76.064935,125
4,4,445.456044,748
...,...,...,...
19349,25876,328.000000,329
19350,25893,3.000000,96
19351,25894,95.000000,96
19352,25895,11.000000,93


In [54]:
# play_rate: the album's mean watch time divided by its run_time.
# The value is above 1 whenever the mean watch time exceeds the run_time.
album_watch['play_rate'] = album_watch['mean_watchtime'].div(album_watch['run_time'])
album_watch

Unnamed: 0,album_id,mean_watchtime,run_time,play_rate
0,0,43.737393,136,0.321598
1,1,39.521472,63,0.627325
2,2,52.825175,96,0.550262
3,3,76.064935,125,0.608519
4,4,445.456044,748,0.595529
...,...,...,...,...
19349,25876,328.000000,329,0.996960
19350,25893,3.000000,96,0.031250
19351,25894,95.000000,96,0.989583
19352,25895,11.000000,93,0.118280


In [84]:
# Summary statistics of play_rate, shown as a two-column frame.
album_watch['play_rate'].describe().reset_index()

Unnamed: 0,index,play_rate
0,count,19354.0
1,mean,0.681673
2,std,0.292971
3,min,0.0
4,25%,0.513789
5,50%,0.7274
6,75%,0.915085
7,max,8.175


In [103]:
# Build a play_rate lookup covering every album_id present in meta_df.
all_albums = meta_df[['album_id']].drop_duplicates('album_id')
album_play_rate = all_albums.merge(
    album_watch[['album_id', 'play_rate']], on='album_id', how='left'
)
# Missing values (e.g. albums with no row in album_watch) are replaced by 0.
# to_dict() on the single-column frame yields {'play_rate': {album_id: rate}}.
play_rate_dict = album_play_rate.fillna(0).set_index('album_id').to_dict()

In [104]:
# Display the nested lookup: {'play_rate': {album_id: play_rate}}.
play_rate_dict

{'play_rate': {749: 0.5279325513196481,
  750: 0.5738260467268101,
  2131: 0.49761640798226164,
  2625: 0.5183712121212121,
  2594: 0.4211054994388328,
  2637: 0.6531347962382446,
  2636: 0.5828559568285595,
  748: 0.6020944741532976,
  1381: 0.6411421911421911,
  1380: 0.593801652892562,
  746: 0.5488142292490118,
  745: 0.5621581670362158,
  744: 0.5376984126984128,
  628: 0.5641327063740857,
  627: 0.6946255002858777,
  626: 0.5137716450216451,
  631: 0.6470291146761735,
  630: 0.5056116722783389,
  629: 0.525,
  6744: 0.509090909090909,
  7037: 0.6050619834710743,
  668: 0.6125199362041467,
  632: 0.65,
  817: 0.44125874125874126,
  816: 0.488984438984439,
  815: 0.48703703703703705,
  26077: 0.0,
  26078: 0.0,
  26079: 0.0,
  21481: 0.019286403085824494,
  26080: 0.0,
  13771: 0.0,
  26081: 0.0,
  19720: 0.015717092337917484,
  26082: 0.0,
  20690: 0.021589793915603533,
  26083: 0.0,
  20703: 0.16271018793273986,
  26084: 0.0,
  26068: 0.0,
  9826: 0.007931665649786455,
  25095: 0

In [105]:
### save pickle
# Serialize the lookup to play_rate_dict.pickle, resolved against the
# current working directory.
# NOTE(review): saved_path is defined earlier but not used here — confirm the
# working directory is the intended location.
with open('play_rate_dict.pickle', mode='wb') as pickle_file:
    pickle.dump(play_rate_dict, pickle_file)