# LGU+ 경진대회 - 베이스라인  
- [Neural Collaborative Filtering(NCF)](https://arxiv.org/pdf/1708.05031.pdf) 논문의 NeuMF를 참고하여 side-information을 결합한 모델을 PyTorch로 구현
- 구현된 모델의 검증 데이터셋과 리더보드의 성능을 확인

## 목차 
- 데이터 전처리 
    - 기본 설정
    - 데이터 불러오기 
    - 학습 및 검증 데이터 생성 
- NeuMF 구현    
    - 모델 구현 
    - 학습 및 추론 코드 구현
- 모델 학습 
    - 하이퍼파라미터 설정 & 최적화 기법 설정
    - 모델 학습 
    - 학습 과정 시각화 
- 제출 
    - 모든 유저에 대해 추천 결과 생성
    - 저장 

## 데이터 전처리
### 기본 설정

In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

import pickle

In [2]:
# Hyperparameters
class cfg:
    """Notebook-wide settings, read as plain class attributes (e.g. ``cfg.seed``)."""

    # Reproducibility
    seed = 42

    # Data split / sampling
    test_size = 0.2   # presumably the hold-out fraction for validation -- TODO confirm at the call site
    neg_ratio = 100   # presumably negatives sampled per positive -- TODO confirm at the call site

    # Recommendation list length -- presumably items recommended per user; TODO confirm
    top_k = 25

    # Device selection: automatic CUDA detection is switched off, so everything is pinned to the CPU.
    # `gpu_idx` is only meaningful if CUDA selection is turned back on.
    gpu_idx = 0
    device = "cpu"

In [3]:
# Fix all random seeds
def seed_everything(random_seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.

    Args:
        random_seed: Integer seed applied to every generator.
    """
    # Python and NumPy generators
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU, current CUDA device, all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
seed_everything(cfg.seed)

In [4]:
# Path settings: directories used by the notebook
data_path, saved_path, output_path = './data', './code/saved', './code/submission'

### 데이터 불러오기
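- history_data : 시청 시작 데이터
- profile_data : 프로필 정보 
- meta_data : 콘텐츠 일반 메타 정보
- watch_e_data : 시청 종료 데이터 (`we_df`, watch_time / total_time 포함)
- search_data : 검색 데이터
- buy_data : 구매 데이터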

In [5]:
# Load every CSV table from `data_path`; all files are read as UTF-8
history_df, search_df, profile_df, meta_df, we_df, buy_df = (
    pd.read_csv(os.path.join(data_path, csv_name), encoding='utf-8')
    for csv_name in (
        'history_data.csv',
        'search_data.csv',
        'profile_data.csv',
        'meta_data.csv',
        'watch_e_data.csv',
        'buy_data.csv',
    )
)

In [6]:
history_df

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer
0,3,20220301115653,20220301115719,MKID003,15,,Y,N
1,3,20220301115653,20220301115809,MKID003,16,,Y,N
2,3,20220301115653,20220301115958,MKID003,17,,Y,N
3,3,20220301115653,20220301120118,MKID003,18,,Y,N
4,3,20220301115653,20220301120229,MKID003,19,,Y,N
...,...,...,...,...,...,...,...,...
1005646,33032,20220427155091,20220427155668,MKID003,381,,Y,N
1005647,33032,20220427155091,20220427155680,MKID003,381,,Y,N
1005648,33032,20220427155839,20220427155810,MKID003,125,,Y,N
1005649,33032,20220427155706,20220427155838,MKID003,125,,Y,N


In [7]:
# Show the history rows whose short_trailer flag is "Y"
history_df[history_df['short_trailer']=="Y"]

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer
20,3,20220301115653,20220301123505,MKID003,33,,Y,Y
123,5,20220309185728,20220309195543,MKID003,138,,Y,Y
144,5,20220313130615,20220313131521,MKID003,150,,Y,Y
156,5,20220318212718,20220318212809,MKID003,159,,Y,Y
157,5,20220318212718,20220318212811,MKID003,160,,Y,Y
...,...,...,...,...,...,...,...,...
1005593,33032,20220427112116,20220427112335,MKID003,14662,,Y,Y
1005594,33032,20220427112116,20220427112369,MKID003,14663,,Y,Y
1005595,33032,20220427112116,20220427112384,MKID003,14663,,Y,Y
1005596,33032,20220427112116,20220427112394,MKID003,7010,,Y,Y


In [8]:
# Number of distinct short_trailer values seen for each (profile_id, album_id) pair
a = history_df.groupby(['profile_id','album_id'])['short_trailer'].nunique().reset_index()
# Pairs with more than one distinct value, i.e. the same album was logged with both flags
a[a['short_trailer']>1]

Unnamed: 0,profile_id,album_id,short_trailer
1702,101,1998,2
1792,103,124,2
1983,121,3001,2
1984,121,3002,2
2770,156,175,2
...,...,...,...
377461,32595,175,2
378493,32725,442,2
379188,32791,478,2
379199,32791,2491,2


In [9]:
# Inspect the history rows of a single pair: profile 32725, album 442
history_df.loc[(history_df['profile_id']==32725)&(history_df['album_id']==442), :]

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer
1001140,32725,20220306162148,20220306180949,MKID003,442,,Y,Y
1001164,32725,20220327173549,20220327173614,MKID003,442,,Y,N
1001176,32725,20220409131626,20220409134057,MKID003,442,,Y,N
1001178,32725,20220409131626,20220409134751,MKID003,442,,Y,N


In [10]:
# Metadata rows for album 442
meta_df[meta_df['album_id']==442]

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
40575,442,방아 찧는 호랑이,4-5세,키즈,책,,한국,390,,,,,,,,


In [11]:
# The same pair (profile 32725, album 442) in we_df, which is loaded from watch_e_data.csv
we_df.loc[(we_df['profile_id']==32725)&(we_df['album_id']==442), :]

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play
888619,32725,20220306162148,20220306180951,MKID049,442,1,391,2
888643,32725,20220327173549,20220327174231,MKID049,442,376,391,0
888655,32725,20220409131626,20220409134726,MKID049,442,389,391,1
888657,32725,20220409131626,20220409135421,MKID049,442,389,390,1


![history](./history.png)

![profile](./profile.png)

![meta](./meta.png)

![meta_plus](./meta_plus.png)

![watch_e](./watch_e.png)

![buy](./buy.png)

![search](./search.png)

In [12]:
### Behavior-pattern data, read from behavior.csv in the working directory
data = pd.read_csv("behavior.csv")

In [13]:
### Every row gets start = 1
data['start'] = 1

In [14]:
### Sign of the interaction: -1 where yes_watch == 0, +1 everywhere else
# A single .loc assignment replaces the chained form data['sign'][mask] = -1.
# The chained form writes through a temporary Series: it raises
# SettingWithCopyWarning and leaves `data` unchanged under pandas copy-on-write.
data['sign'] = 1
data.loc[data['yes_watch'] == 0, 'sign'] = -1

In [15]:
# Shift yes_watch down by 0.5 (1 -> 0.5, 0 -> -0.5).
# NOTE: not idempotent -- re-running this cell shifts the column again.
data['yes_watch'] = data['yes_watch'] - 0.5

In [16]:
data

Unnamed: 0,profile_id,album_id,yes_watch,payment,buy,search,start,sign
0,3,15,0.5,0.0,0.0,0.0,1,1
1,3,16,0.5,0.0,0.0,0.0,1,1
2,3,16,0.5,0.0,0.0,0.0,1,1
3,3,17,0.5,0.0,0.0,0.0,1,1
4,3,18,0.5,0.0,0.0,0.0,1,1
...,...,...,...,...,...,...,...,...
752742,33032,381,0.5,0.0,0.0,0.0,1,1
752743,33032,375,0.5,0.0,0.0,0.0,1,1
752744,33032,375,0.5,0.0,0.0,0.0,1,1
752745,33032,125,0.5,0.0,0.0,0.0,1,1


In [17]:
# Behavior score.
# 2.5 + yes_watch gives 3 when yes_watch is 0.5 and 2 when it is -0.5.
# (buy + search) * sign then moves that value up (sign = 1) or down (sign = -1);
# with buy/search in {0, 1} the raw value lies in 0..5 -- TODO confirm both columns are binary.
# The raw value is divided by 15, so the stored score is raw / 15 (e.g. 3 -> 0.2).
data['score'] = (2.5 + data['yes_watch'] + (data['buy'] + data['search'])*data['sign'])/15

#### 기준 

아래 값은 15로 나누기 전의 원점수이며, 실제 `score` 열에는 원점수 / 15가 저장된다(예: 3 → 0.2). 여기서 '종료'는 `yes_watch` 값을 뜻한다.

- 0 : 구매와 검색 둘다 했는데 종료가 0  
- 1 : 구매 또는 검색 둘중 하나가 1이지만 종료가 0  
- 2 : 구매 검색 종료 전부 0  
- 3 : 구매 검색 둘다 없지만 종료 1  
- 4 : 구매 또는 검색 둘중 하나가 1이고 종료 1  
- 5 : 구매 검색 종료 모두 1  

In [18]:
# Keep only the two id columns and the computed score
behavscore = data[['profile_id', 'album_id', 'score']]

In [19]:
behavscore

Unnamed: 0,profile_id,album_id,score
0,3,15,0.2
1,3,16,0.2
2,3,16,0.2
3,3,17,0.2
4,3,18,0.2
...,...,...,...
752742,33032,381,0.2
752743,33032,375,0.2
752744,33032,375,0.2
752745,33032,125,0.2


In [20]:
### Save the score table to behavscore.pickle
with open('behavscore.pickle', mode='wb') as score_file:
    pickle.dump(behavscore, score_file)

## 앨범 관심 시청별 최다 키워드

In [21]:
### Pull out the three (parent keyword, child keyword) column pairs so they can be
### gathered into one table for per-keyword counting
keyword1 = profile_df.loc[:, ['profile_id', 'pr_interest_keyword_cd_1', 'ch_interest_keyword_cd_1']]
keyword2 = profile_df.loc[:, ['profile_id', 'pr_interest_keyword_cd_2', 'ch_interest_keyword_cd_2']]
keyword3 = profile_df.loc[:, ['profile_id', 'pr_interest_keyword_cd_3', 'ch_interest_keyword_cd_3']]

In [22]:
### Give the three frames identical column names before they are combined
for _kw in (keyword1, keyword2, keyword3):
    _kw.columns = ["profile_id", "pr_int", "ch_int"]

In [23]:
### Stack the three frames row-wise, then drop every row that contains a missing value
keyword = pd.concat((keyword1, keyword2, keyword3)).dropna()

In [24]:
keyword

Unnamed: 0,profile_id,pr_int,ch_int
0,3,P02,K01
1,5,P07,K05
2,7,P05,K06
3,12,P03,K09
4,16,P03,K01
...,...,...,...
8305,33019,P01,K09
8307,33023,P07,K05
8308,33026,P08,K06
8309,33027,P06,K05


In [25]:
### Join the rows with sign == 1 to the keyword table on profile_id
# NOTE(review): the outer join keeps profiles that appear on only one side, so the
# result contains NaN album_id / keyword rows and album_id is shown as float in the
# cell output -- confirm that an outer (rather than inner) join is intended.
k = pd.merge(
    data.loc[data['sign'] == 1, ['profile_id', 'album_id']],
    keyword,
    on='profile_id',
    how='outer',
)

In [26]:
k

Unnamed: 0,profile_id,album_id,pr_int,ch_int
0,3,15.0,P02,K01
1,3,15.0,P04,K03
2,3,15.0,P07,K04
3,3,16.0,P02,K01
4,3,16.0,P04,K03
...,...,...,...,...
1435837,32982,,P08,K06
1435838,33016,,P01,K03
1435839,33016,,P03,K04
1435840,33016,,P08,K07


In [27]:
### Number of keyword-table rows per parent keyword (kpcount) and per child keyword (kccount)
kpcount, kccount = (
    keyword.groupby(key_col)['profile_id'].count()
    for key_col in ('pr_int', 'ch_int')
)

In [28]:
kpcount

pr_int
P01    2102
P02    3112
P03    2881
P04    1773
P05    1837
P06    3780
P07    3238
P08    1560
Name: profile_id, dtype: int64

In [29]:
### Row counts per (album, parent keyword) and per (album, child keyword)
# count() returns the non-null count of every remaining column for each group.
k2, k3 = (
    k.groupby(['album_id', key_col]).count().reset_index()
    for key_col in ('pr_int', 'ch_int')
)

In [30]:
### Reshape to wide form: one row per album, one column per keyword,
### cell value taken from the profile_id count column
par_cnt, chi_cnt = (
    long_counts.pivot(index='album_id', columns=key_col, values='profile_id')
    for long_counts, key_col in ((k2, 'pr_int'), (k3, 'ch_int'))
)

In [31]:
### Turn album_id back into a regular column and replace missing counts with 0
### (result: per-album counts for each keyword)
par_cnt, chi_cnt = (
    wide_counts.reset_index().fillna(0)
    for wide_counts in (par_cnt, chi_cnt)
)

In [32]:
par_cnt

pr_int,album_id,P01,P02,P03,P04,P05,P06,P07,P08
0,0.0,72.0,120.0,119.0,64.0,62.0,144.0,140.0,60.0
1,1.0,14.0,43.0,39.0,23.0,19.0,43.0,38.0,13.0
2,2.0,17.0,40.0,35.0,17.0,12.0,36.0,26.0,11.0
3,3.0,9.0,37.0,32.0,26.0,20.0,26.0,41.0,7.0
4,4.0,9.0,39.0,37.0,40.0,39.0,48.0,42.0,16.0
...,...,...,...,...,...,...,...,...,...
17379,25865.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
17380,25874.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
17381,25875.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
17382,25876.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [33]:
### Divide each keyword column by the number of rows that selected that keyword
# The division is aligned by keyword label, not by column position. The earlier
# positional form (kpcount[i] with an integer on a string-labelled Series) is
# deprecated in recent pandas and silently assumes that every keyword column is
# present and in the same order as the count Series.
# NOTE: not idempotent -- re-running this cell divides the columns again.
par_keys = par_cnt.columns.intersection(kpcount.index)
par_cnt[par_keys] = par_cnt[par_keys].div(kpcount[par_keys], axis='columns')

chi_keys = chi_cnt.columns.intersection(kccount.index)
chi_cnt[chi_keys] = chi_cnt[chi_keys].div(kccount[chi_keys], axis='columns')

In [34]:
### Back to long form: one row per (album, keyword) holding the normalised value
par_melt = pd.melt(
    par_cnt,
    id_vars="album_id",
    value_vars=['P01','P02','P03','P04','P05','P06','P07','P08'],
    var_name="pr_int",
    value_name='viewpersel',
)

chi_melt = pd.melt(
    chi_cnt,
    id_vars="album_id",
    value_vars=['K01','K02','K03','K04','K05','K06','K07','K08','K09'],
    var_name="ch_int",
    value_name='viewpersel',
)

In [35]:
### For every album keep the row with the largest viewpersel, i.e. its top keyword
# idxmax returns the first row label when several rows share the maximum.
par_max, chi_max = (
    long_table.loc[long_table.groupby(['album_id'])['viewpersel'].idxmax()]
    for long_table in (par_melt, chi_melt)
)

## 장르와 융합

In [36]:
### Extract the mid-level genre (genre_mid) together with album_id
genre = meta_df[['album_id', 'genre_mid']]

In [37]:
# One row per album_id: drop_duplicates keeps the first occurrence by default
g2 = genre.drop_duplicates(subset='album_id')

In [38]:
### Attach genre_mid to each album's top parent / child keyword
# Right join: every album in par_max / chi_max is kept, even without a genre row in g2.
par_fu = g2.merge(par_max[['album_id', 'pr_int']], on='album_id', how='right')
chi_fu = g2.merge(chi_max[['album_id', 'ch_int']], on='album_id', how='right')

In [39]:
import pickle

# Save the per-album parent-keyword table to pr_key_for_item.pickle
with open('pr_key_for_item.pickle', mode='wb') as pr_file:
    pickle.dump(par_fu, pr_file)

In [40]:
# Save the per-album child-keyword table to ch_key_for_item.pickle
with open('ch_key_for_item.pickle', mode='wb') as ch_file:
    pickle.dump(chi_fu, ch_file)

In [41]:
# Cross-tabulate genre_mid (rows) against the top parent keyword (columns)
# and shade the resulting counts with a colour gradient
(
    pd.crosstab(index=par_fu["genre_mid"], columns=par_fu["pr_int"])
    .style
    .background_gradient(cmap='summer_r')
)

pr_int,P01,P02,P03,P04,P05,P06,P07,P08
genre_mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MCN,20,1,8,4,0,1,0,2
TV만화,459,508,391,1008,1083,185,287,748
극장판 애니,1,0,0,0,2,1,1,0
노래 율동,1,6,18,7,43,11,0,12
노래율동,111,425,422,1716,1450,381,284,282
놀이교실,285,260,410,475,603,111,139,570
다큐멘터리,2,1,6,0,0,1,6,7
독서동화,6,21,27,35,18,11,20,27
드라마,1,12,0,9,9,3,0,21
수학과학,9,0,6,2,6,0,0,25


In [42]:
# Cross-tabulate genre_mid (rows) against the top child keyword (columns)
# and shade the resulting counts with a colour gradient
(
    pd.crosstab(index=chi_fu["genre_mid"], columns=chi_fu["ch_int"])
    .style
    .background_gradient(cmap='summer_r')
)

ch_int,K01,K02,K03,K04,K05,K06,K07,K08,K09
genre_mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MCN,1,15,0,4,1,0,12,1,2
TV만화,475,384,47,163,483,726,645,1204,542
극장판 애니,1,1,1,0,0,0,1,0,1
노래 율동,17,7,0,1,7,7,8,27,24
노래율동,845,460,62,70,439,713,576,1046,860
놀이교실,114,254,38,283,317,439,430,525,453
다큐멘터리,0,3,0,1,11,0,4,1,3
독서동화,3,22,5,6,8,19,52,29,21
드라마,0,13,0,3,3,1,21,7,7
수학과학,0,0,0,2,44,0,0,0,2
