# LGU+ 경진대회 - 베이스라인  
- [Neural Collaborative Filtering(NCF)](https://arxiv.org/pdf/1708.05031.pdf) 논문의 NeuMF를 참고하여 side-information을 결합한 모델을 PyTorch로 구현
- 구현된 모델의 검증 데이터셋과 리더보드의 성능을 확인

## 목차 
- 데이터 전처리 
    - 기본 설정
    - 데이터 불러오기 
    - 학습 및 검증 데이터 생성 
- NeuMF 구현    
    - 모델 구현 
    - 학습 및 추론 코드 구현
- 모델 학습 
    - 하이퍼파라미터 설정 & 최적화 기법 설정
    - 모델 학습 
    - 학습 과정 시각화 
- 제출 
    - 모든 유저에 대해 추천 결과 생성
    - 저장 

## 데이터 전처리
### 기본 설정

In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

import pickle

In [2]:
# Hyperparameters
class cfg:
    """Namespace of notebook-wide settings, read as plain class attributes."""

    # Seed handed to seed_everything().
    seed = 42

    # Device selection. The notebook is pinned to the CPU; the CUDA variant is
    # kept below for reference and is the only consumer of `gpu_idx` in view.
    gpu_idx = 0
    # device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")
    device = "cpu"

    # Split / sampling / ranking sizes (their consumers are not in this part
    # of the notebook).
    test_size = 0.2
    neg_ratio = 100
    top_k = 25

In [3]:
# Fix the random seeds
def seed_everything(random_seed):
    """Seed the Python, NumPy and PyTorch random generators.

    Also sets the cuDNN ``deterministic`` flag and turns ``benchmark`` off.

    Args:
        random_seed: Seed value passed unchanged to every generator.
    """
    # Standard-library and NumPy generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU, current CUDA device, then all CUDA devices.
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # cuDNN behaviour flags.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed every random generator with the value configured in cfg.
seed_everything(cfg.seed)

In [4]:
# Path settings
# Directory the raw CSV files are read from.
data_path = './data'
# NOTE(review): the next two paths are not used in the visible cells;
# presumably they hold saved artifacts and submission files — confirm.
saved_path = './code/saved'
output_path = './code/submission'

### 데이터 불러오기
- history_data : 시청 시작 데이터
- profile_data : 프로필 정보 
- meta_data : 콘텐츠 일반 메타 정보
- search_data : 검색 데이터
- watch_e_data : 시청 시간(watch_time) 데이터
- buy_data : 구매 데이터

In [5]:
# Load the data
def _read_table(file_name):
    """Read one UTF-8 encoded CSV file located under `data_path`."""
    return pd.read_csv(os.path.join(data_path, file_name), encoding='utf-8')


# Same files, same order as before; only the repeated read call is factored out.
history_df = _read_table('history_data.csv')
search_df = _read_table('search_data.csv')
profile_df = _read_table('profile_data.csv')
meta_df = _read_table('meta_data.csv')
we_df = _read_table('watch_e_data.csv')
buy_df = _read_table('buy_data.csv')

![history](./history.png)

![profile](./profile.png)

![meta](./meta.png)

![meta_plus](./meta_plus.png)

![watch_e](./watch_e.png)

![buy](./buy.png)

![search](./search.png)

## Make user's behavior data

In [6]:
### history selection
# Keep the profile/session/album keys plus the payment and short_trailer columns.
hsel = history_df.loc[
    :, ['profile_id', 'ss_id', 'album_id', 'payment', 'short_trailer']
]

### history selection without duplicates
# One row per (profile_id, album_id, ss_id); the first occurrence is kept.
hsel2 = hsel.drop_duplicates(
    subset=['profile_id', 'album_id', 'ss_id'], keep='first'
)

In [7]:
# Preview the de-duplicated history selection.
hsel2

Unnamed: 0,profile_id,ss_id,album_id,payment,short_trailer
0,3,20220301115653,15,,N
1,3,20220301115653,16,,N
2,3,20220301115653,17,,N
3,3,20220301115653,18,,N
4,3,20220301115653,19,,N
...,...,...,...,...,...
1005641,33032,20220427155091,381,,N
1005642,33032,20220427155091,375,,N
1005648,33032,20220427155839,125,,N
1005649,33032,20220427155706,125,,N


In [8]:
### watch_e selection
# Keep the profile/session/album keys plus the watch_time column.
wsel = we_df.loc[:, ['profile_id', 'ss_id', 'album_id', 'watch_time']]

### watch_e selection without duplicates
# One row per (profile_id, album_id, ss_id); the first occurrence is kept.
wsel2 = wsel.drop_duplicates(
    subset=['profile_id', 'album_id', 'ss_id'], keep='first'
)

In [9]:
# Preview the de-duplicated watch_e selection.
wsel2

Unnamed: 0,profile_id,ss_id,album_id,watch_time
0,3,20220301115653,15,46
1,3,20220301115653,16,104
2,3,20220301115653,17,76
3,3,20220301115653,18,67
4,3,20220301115653,19,90
...,...,...,...,...
892784,33032,20220427155091,381,462
892786,33032,20220427155091,125,6
892791,33032,20220427155839,125,10
892792,33032,20220427155706,125,6


In [11]:
### meta selection
# Only the album key and its run_time are needed from the meta table.
msel = meta_df.loc[:, ['album_id', 'run_time']]

### meta selection without duplicates
# One row per distinct (album_id, run_time) pair; the first occurrence is kept.
msel2 = msel.drop_duplicates(subset=['album_id', 'run_time'], keep='first')

In [12]:
### merge data
# Inner join: keep only (profile_id, ss_id, album_id) rows present in both the
# history selection and the watch_e selection.
hw = hsel2.merge(wsel2, on=['profile_id', 'ss_id', 'album_id'], how='inner')

# Left join: attach run_time to every row by album_id.
hwm = hw.merge(msel2, on='album_id', how='left')

In [14]:
### drop trailer rows
# Keep only rows whose short_trailer value is 'N'.
data = hwm.loc[hwm['short_trailer'].eq('N')]

In [15]:
### select the needed variables
# Every column except ss_id and short_trailer (Index.difference returns the
# remaining column labels in sorted order).
usedata = data[data.columns.difference(['ss_id', 'short_trailer'])]

In [16]:
# Inspect all rows of one (album_id, profile_id) pair.
usedata.loc[usedata['album_id'].eq(4083) & usedata['profile_id'].eq(32216)]

Unnamed: 0,album_id,payment,profile_id,run_time,watch_time
575368,4083,,32216,74,74
575379,4083,,32216,74,74
575401,4083,,32216,74,74
575436,4083,,32216,74,74
575458,4083,,32216,74,74
575474,4083,,32216,74,74
575496,4083,,32216,74,74
575526,4083,,32216,74,74
575553,4083,,32216,74,74
575591,4083,,32216,74,74


In [17]:
### cumsum
def cum_sum(group):
    """Return a copy of *group* with a ``cum_time`` running total of ``watch_time``.

    The rows are ordered by ``profile_id`` and ``album_id`` (both descending)
    before the cumulative sum is taken; the totals are then aligned back to the
    rows of *group* by index label.

    The input frame is not modified: pandas does not support functions that
    mutate the object handed to them by ``groupby(...).apply``.

    Args:
        group: DataFrame with ``profile_id``, ``album_id`` and ``watch_time``
            columns.

    Returns:
        A new DataFrame equal to *group* plus the ``cum_time`` column.
    """
    running_total = group.sort_values(
        by=['profile_id', 'album_id'], ascending=False)['watch_time'].cumsum()
    # assign() aligns on the index, exactly like the column assignment it replaces.
    return group.assign(cum_time=running_total)

In [19]:
### making cum_sum
# Run cum_sum on every (profile_id, album_id) group of usedata, which adds the
# `cum_time` column.
# NOTE(review): a Python-level apply per group is slow on large frames;
# usedata.groupby([...])['watch_time'].cumsum() looks equivalent — confirm
# before swapping it in.
# NOTE(review): whether the group keys are prepended to the result index
# depends on the pandas version (`group_keys`) — verify on upgrade.
cum_data = usedata.groupby(["profile_id","album_id"]).apply(cum_sum)

In [30]:
# Count the rows whose `watch` ratio exceeds 50.
# Vectorized count: sorting does not change a count, and Python's builtin
# sum() would iterate the Series one element at a time.
# This needs the `watch` column built in the "### create watch" cell, so run
# that cell first.
(cum_data['watch'] > 50).sum()

589

In [21]:
### create watch
# Cumulative watch time relative to the content's run_time (can exceed 1).
cum_data["watch"] = cum_data["cum_time"].div(cum_data["run_time"])

In [31]:
### create y/n watch
# 1 when the cumulative watch ratio is at least 0.5, otherwise 0 (a NaN ratio
# compares False and therefore yields 0).
# Written as one direct column assignment: the chained form
# `cum_data["yes_watch"][mask] = 1` assigns into a temporary Series and is a
# silent no-op when pandas Copy-on-Write is active.
cum_data["yes_watch"] = (cum_data["watch"] >= 0.5).astype("int64")

In [32]:
### slicing cumsum data
# Sort descending so that, for each (profile_id, album_id) pair, the row with
# the largest `watch` comes first, then keep only that first row per pair.
new = (
    cum_data.loc[:, ["profile_id", "album_id", "watch"]]
    .sort_values(by=["profile_id", "album_id", "watch"], ascending=False)
    .drop_duplicates(subset=["profile_id", "album_id"], keep="first")
)

In [33]:
# Show the row(s) holding the largest `watch` ratio.
cum_data.loc[cum_data['watch'].eq(cum_data['watch'].max())]

Unnamed: 0,album_id,payment,profile_id,run_time,watch_time,cum_time,watch,yes_watch
68221,850,,2794,98,203,10863,110.846939,1


In [34]:
### sorting value
# Order the per-pair table by profile_id, then album_id, ascending.
wdata = new.sort_values(by=["profile_id", "album_id"], ascending=True)

In [35]:
# Preview the sorted per-(profile_id, album_id) watch table.
wdata

Unnamed: 0,profile_id,album_id,watch
0,3,15,1.000000
1,3,16,0.990476
2,3,17,1.000000
3,3,18,0.985294
4,3,19,1.000000
...,...,...,...
586923,33032,3559,1.000000
586926,33032,4246,0.254950
586911,33032,6693,0.998498
586959,33032,7105,2.479751


In [36]:
### save pickle
# Serialize the watch table to 'watching2.pickle' in the working directory.
with open('watching2.pickle', mode='wb') as pickle_file:
    pickle.dump(wdata, pickle_file)

### make behavior data

In [None]:
### select columns
# Keys, the watched flag and the payment value from the cumulative table.
b = cum_data.loc[:, ['profile_id', 'album_id', 'yes_watch', 'payment']]

In [None]:
### make the needed values
# assign() builds each indicator frame in one step instead of adding a column
# to a column-selected frame afterwards (the SettingWithCopy pattern that the
# global warnings filter was hiding).
# NOTE(review): buy_df / search_df may repeat a (profile_id, album_id) pair;
# repeated pairs would multiply rows in the later left merges — confirm
# whether a drop_duplicates() is wanted here.

# buy: every listed (profile_id, album_id) gets buy = 1
bdf = buy_df[['profile_id', 'album_id']].assign(buy=1)

# search: every listed (profile_id, album_id) gets search = 1
sdf = search_df[['profile_id', 'album_id']].assign(search=1)

In [None]:
# Left joins keep every row of `b`; pairs without a buy/search record get NaN
# in the corresponding indicator column.
m = b.merge(bdf, on=['profile_id', 'album_id'], how='left')
m2 = m.merge(sdf, on=['profile_id', 'album_id'], how='left')

In [None]:
# Rows with a positive payment count as bought. A single .loc assignment
# replaces the chained `m2['buy'][mask] = 1`, which assigns into a temporary
# Series and is a silent no-op when pandas Copy-on-Write is active.
m2.loc[m2['payment'] > 0, 'buy'] = 1

In [None]:
# Replace every remaining NaN with 0 (e.g. pairs without a buy/search match).
mfinal = m2.fillna(0)
# Actually drop `payment`: the bare `columns.difference(...)` expression used
# before only computed the remaining labels and discarded them. Same selection
# pattern as for `usedata`; the kept columns come out in sorted label order.
mfinal = mfinal.loc[:, mfinal.columns.difference(['payment'])]

In [None]:
# Write the behavior table to CSV without the index column.
mfinal.to_csv(path_or_buf="behavior.csv", index=False)

In [None]:
### save pickle
# Serialize the behavior table to 'behavior2.pickle' in the working directory.
with open('behavior2.pickle', mode='wb') as pickle_file:
    pickle.dump(mfinal, pickle_file)