In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

ModuleNotFoundError: No module named 'plotnine'

In [2]:
!nvidia-smi

Mon Nov 28 12:14:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 8000     On   | 00000000:1A:00.0 Off |                  Off |
| 33%   37C    P2    60W / 260W |   1130MiB / 48601MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000     On   | 00000000:3D:00.0 Off |                  Off |
| 33%   46C    P2   105W / 260W |  21134MiB / 48601MiB |      0%      Default |
|       

In [2]:
# 하이퍼파라미터 
class cfg: 
    gpu_idx = 2
    device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")
    top_k = 25 #############
    seed = 42
    neg_ratio = 100 #######
    test_size = 0.2

In [3]:
# 하이퍼 파라미터 설정 
cfg.batch_size = 256
cfg.emb_dim = 256
cfg.layer_dim = 512
cfg.dropout = 0.5
cfg.epochs = 10
cfg.learning_rate = 0.0025
cfg.reg_lambda = 0
cfg.check_epoch = 1

In [4]:
# 시드 고정 
def seed_everything(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
seed_everything(cfg.seed)

In [5]:
# 경로 설정
data_path = './data'
saved_path = './code/saved'
output_path = './code/submission'

In [6]:
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')
profile_df = pd.read_csv(os.path.join(data_path, 'profile_data.csv'), encoding='utf-8')
meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')
watch_e_df = pd.read_csv(os.path.join(data_path, 'watch_e_data.csv'), encoding='utf-8')
search_df =  pd.read_csv(os.path.join(data_path, 'search_data.csv'), encoding='utf-8')
buy_df = pd.read_csv(os.path.join(data_path, 'buy_data.csv'), encoding='utf-8')
meta_plus_df = pd.read_csv(os.path.join(data_path, 'meta_data_plus.csv'), encoding='utf-8')

## make 'watch' variable

In [7]:
import pickle

with open('watching2.pickle', 'rb') as f:
    watching2 = pickle.load(f)

with open('tag_for_all_id.pickle', 'rb') as f:
    tag = pickle.load(f)
    
with open('clusterset.pickle', 'rb') as f:
    cluster_df = pickle.load(f)

In [11]:
watch_dict = dict()
for index, profile, album, watch in watching2.itertuples():
    watch_dict[(profile, album)] = watch
    
# print(watch_dict)

In [12]:
# 데이터 전처리 (중복제거) 
# 참고 : drop_duplicates의 subset을 무엇으로 구성하냐에 따라서 제거되는 항목들이 다름 
# ex) 'profile_id', 'album_id' : 중복된 시청이력 모두 제거 / 'profile_id', 'album_id', 'log_time' : 같은 시간에 시청한 이력만 제거 
data = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)
data['rating'] = 1

cfg.n_users = data.profile_id.max()+1
cfg.n_items = data.album_id.max()+1

In [12]:
cfg.n_items

25917

In [13]:
# 학습 및 검증 데이터 분리
train, valid = train_test_split(
    data, test_size=cfg.test_size, random_state=cfg.seed,
)
print('학습 데이터 크기:', train.shape)
print('검증 데이터 크기:', valid.shape)

학습 데이터 크기: (719401, 4)
검증 데이터 크기: (179851, 4)


In [14]:
# Matrix 형태로 변환 
train = train.to_numpy()
matrix = sparse.lil_matrix((cfg.n_users, cfg.n_items))  
for (p, _, i, r) in tqdm(train):
    matrix[p, i] = r
    
train = sparse.csr_matrix(matrix)
train = train.toarray()
print("train 형태: \n", train)

  0%|          | 0/719401 [00:00<?, ?it/s]

train 형태: 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
train.shape

(33033, 25917)

In [15]:
profile_df = profile_df.set_index('profile_id')

le = LabelEncoder()
profile_df['sex'] = le.fit_transform(profile_df['sex'])
profile_df['pr_interest_keyword_cd_1'] = le.fit_transform(profile_df['pr_interest_keyword_cd_1'])
profile_df['pr_interest_keyword_cd_2'] = le.fit_transform(profile_df['pr_interest_keyword_cd_2'])
profile_df['pr_interest_keyword_cd_3'] = le.fit_transform(profile_df['pr_interest_keyword_cd_3'])
profile_df['ch_interest_keyword_cd_1'] = le.fit_transform(profile_df['ch_interest_keyword_cd_1'])
profile_df['ch_interest_keyword_cd_2'] = le.fit_transform(profile_df['ch_interest_keyword_cd_2'])
profile_df['ch_interest_keyword_cd_3'] = le.fit_transform(profile_df['ch_interest_keyword_cd_3'])

In [16]:
# 유저 특징 정보 추출 
user_features = profile_df[['age', 'sex',
                            'pr_interest_keyword_cd_1',
                            'pr_interest_keyword_cd_2',
                            'pr_interest_keyword_cd_3',
                            'ch_interest_keyword_cd_1',
                            'ch_interest_keyword_cd_2',
                            'ch_interest_keyword_cd_3']].to_dict()

In [17]:
cfg.n_pr_interest_keyword_1 = profile_df['pr_interest_keyword_cd_1'].nunique()
cfg.n_ch_interest_keyword_1 = profile_df['ch_interest_keyword_cd_1'].nunique()
cfg.n_pr_interest_keyword_2 = profile_df['pr_interest_keyword_cd_2'].nunique()
cfg.n_ch_interest_keyword_2 = profile_df['ch_interest_keyword_cd_2'].nunique()
cfg.n_pr_interest_keyword_3 = profile_df['pr_interest_keyword_cd_3'].nunique()
cfg.n_ch_interest_keyword_3 = profile_df['ch_interest_keyword_cd_3'].nunique()

In [18]:
meta_df = meta_df.set_index('album_id')

In [19]:
meta_df['genre_large'] = le.fit_transform(meta_df['genre_large'])
meta_df['genre_mid'] = le.fit_transform(meta_df['genre_mid'])
meta_df['genre_small'] = le.fit_transform(meta_df['genre_small'])
meta_df['cast_1'] = le.fit_transform(meta_df['cast_1'])
meta_df['cast_2'] = le.fit_transform(meta_df['cast_2'])
meta_df['cast_3'] = le.fit_transform(meta_df['cast_3'])
meta_df['cast_4'] = le.fit_transform(meta_df['cast_4'])
meta_df['cast_5'] = le.fit_transform(meta_df['cast_5'])
meta_df['cast_6'] = le.fit_transform(meta_df['cast_6'])
meta_df['cast_7'] = le.fit_transform(meta_df['cast_7'])

In [20]:
item_features = meta_df[['genre_large', 'genre_mid', 'genre_small',
                         'cast_1', 'cast_2', 'cast_3', 
                         'cast_4', 'cast_5', 'cast_6', 'cast_7']].to_dict()

In [22]:
le = LabelEncoder()
tag['tag'] = le.fit_transform(tag['tag'])

In [29]:
tag_dict = tag.set_index('album_id').to_dict()
tag_dict

{'tag': {749: 27,
  750: 27,
  2131: 27,
  2625: 27,
  2594: 27,
  2637: 27,
  2636: 27,
  748: 27,
  1381: 27,
  1380: 27,
  746: 27,
  745: 27,
  744: 27,
  628: 27,
  627: 27,
  626: 27,
  631: 27,
  630: 27,
  629: 27,
  6744: 27,
  7037: 27,
  668: 27,
  632: 27,
  817: 27,
  816: 27,
  815: 27,
  26077: 14,
  26078: 14,
  26079: 14,
  21481: 14,
  26080: 14,
  13771: 14,
  26081: 14,
  19720: 14,
  26082: 14,
  20690: 14,
  26083: 14,
  20703: 14,
  26084: 14,
  26068: 14,
  9826: 14,
  25095: 14,
  3881: 4,
  10487: 21,
  9460: 19,
  9459: 21,
  9458: 21,
  10513: 21,
  6733: 21,
  6732: 3,
  6731: 21,
  10512: 21,
  9808: 19,
  10515: 21,
  10514: 19,
  25328: 14,
  26085: 14,
  7856: 3,
  4503: 3,
  9814: 19,
  9809: 4,
  26086: 14,
  25322: 14,
  25311: 14,
  25317: 14,
  25318: 14,
  25319: 14,
  26087: 14,
  25312: 14,
  26088: 14,
  25326: 14,
  10605: 3,
  7884: 3,
  10325: 3,
  2088: 14,
  2087: 14,
  2086: 14,
  1981: 14,
  1455: 14,
  3886: 3,
  4506: 0,
  4505: 3,
  4

In [24]:
cluster_dict = cluster_df.set_index('album_id').to_dict()

In [26]:
# 추출한 특징 정보의 속성을 저장 
cfg.n_genre_small = meta_df['genre_small'].nunique()
cfg.n_genre_mid = meta_df['genre_mid'].nunique()
cfg.n_genre_large = meta_df['genre_large'].nunique()
cfg.n_cast_1 = meta_df['cast_1'].nunique()
cfg.n_cast_2 = meta_df['cast_2'].nunique()
cfg.n_cast_3 = meta_df['cast_3'].nunique()
cfg.n_cast_4 = meta_df['cast_4'].nunique()
cfg.n_cast_5 = meta_df['cast_5'].nunique()
cfg.n_cast_6 = meta_df['cast_6'].nunique()
cfg.n_cast_7 = meta_df['cast_7'].nunique()
cfg.tag = tag['tag'].nunique()
cfg.cluster5 = cluster_df['clus5'].nunique()
cfg.cluster8 = cluster_df['clus8'].nunique()
cfg.cluster12 = cluster_df['clus12'].nunique()

cfg.n_continuous_feats = 2 # 연속형 feature는 나이 + watch 변수 추가

In [24]:
class NeuMF(nn.Module):
    """Neural Matrix Factorization Model
        참고 문헌 : https://arxiv.org/abs/1708.05031

    예시 :
        model = NeuMF(cfg) 
        output = model.forward(user_ids, item_ids, [feat0, feat1]) 
    """
    def __init__(self, cfg):
        """ 
        Args:
            cfg : config 파일로 네트워크 생성에 필요한 정보들을 담고 있음 
        """
        super(NeuMF, self).__init__()
        self.n_users = cfg.n_users
        self.n_items = cfg.n_items
        self.emb_dim = cfg.emb_dim
        self.layer_dim = cfg.layer_dim
#         self.layer_dim2 = cfg.layer_dim2
        self.n_continuous_feats = cfg.n_continuous_feats
        self.n_genre_mid = cfg.n_genre_mid
#         self.n_genre_large = cfg.n_genre_large
        self.n_pr_interest_1 = cfg.n_pr_interest_keyword_1    
        self.n_ch_interest_1 = cfg.n_ch_interest_keyword_1 
#         self.n_pr_interest_2 = cfg.n_pr_interest_keyword_2    
#         self.n_ch_interest_2 = cfg.n_ch_interest_keyword_2 
#         self.n_pr_interest_3 = cfg.n_pr_interest_keyword_3    
#         self.n_ch_interest_3 = cfg.n_ch_interest_keyword_3 
        self.dropout = cfg.dropout
        self.build_graph()

    def build_graph(self):
        """Neural Matrix Factorization Model 생성
            구현된 모습은 위의 그림을 참고 
        """
        self.user_embedding_mf = nn.Embedding(num_embeddings=self.n_users, embedding_dim=self.emb_dim)  #256
        self.item_embedding_mf = nn.Embedding(num_embeddings=self.n_items, embedding_dim=self.emb_dim)
        
        self.user_embedding_mlp = nn.Embedding(num_embeddings=self.n_users, embedding_dim=self.emb_dim)
        self.item_embedding_mlp = nn.Embedding(num_embeddings=self.n_items, embedding_dim=self.emb_dim)
        
        
        self.sex_embedding = nn.Embedding(num_embeddings=2, embedding_dim=1)
        self.genre_mid_embeddig = nn.Embedding(num_embeddings=self.n_genre_mid, embedding_dim=self.n_genre_mid//2)
#         self.genre_large_embeddig = nn.Embedding(num_embeddings=self.n_genre_large, embedding_dim=self.n_genre_large//2)
        
        
        self.pr_interest_1_embedding = nn.Embedding(num_embeddings=self.n_pr_interest_1, embedding_dim=self.n_pr_interest_1//2)
        self.ch_interest_1_embedding = nn.Embedding(num_embeddings=self.n_ch_interest_1, embedding_dim=self.n_ch_interest_1//2)
#         self.pr_interest_2_embedding = nn.Embedding(num_embeddings=self.n_pr_interest_2, embedding_dim=self.n_pr_interest_2//2)
#         self.ch_interest_2_embedding = nn.Embedding(num_embeddings=self.n_ch_interest_2, embedding_dim=self.n_ch_interest_2//2)
#         self.pr_interest_3_embedding = nn.Embedding(num_embeddings=self.n_pr_interest_3, embedding_dim=self.n_pr_interest_3//2)
#         self.ch_interest_3_embedding = nn.Embedding(num_embeddings=self.n_ch_interest_3, embedding_dim=self.n_ch_interest_3//2)
        
        
        
        self.mlp_layers = nn.Sequential(
            nn.Linear(2*self.emb_dim +self.n_genre_mid//2 + self.n_pr_interest_1//2 + self.n_ch_interest_1//2+
#                       self.n_pr_interest_2//2 + self.n_ch_interest_2//2+self.n_pr_interest_3//2 + self.n_ch_interest_3//2+
                      self.n_continuous_feats +1, self.layer_dim),
            nn.BatchNorm1d(self.layer_dim),
            nn.ReLU(),
            nn.Dropout(p=self.dropout), 
            nn.Linear(self.layer_dim, self.layer_dim//2),
            nn.BatchNorm1d(self.layer_dim//2),
            nn.ReLU(), 
            nn.Dropout(p=self.dropout),
#             nn.Linear(self.layer_dim2, self.layer_dim2//2), 
#             nn.ReLU(), 
#             nn.Dropout(p=self.dropout),
            
        )
        self.affine_output = nn.Linear(self.layer_dim//2 + self.emb_dim, 1)
        self.apply(self._init_weights)
        

    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            normal_(module.weight.data, 0, 0.01)
            if module.bias is not None:
                module.bias.data.fill_(0.0)
    
    def forward(self, user_indices, item_indices, feats):
        """ 
        Args:
            user_indices : 유저의 인덱스 정보 
                ex) tensor([ 3100,  3100,  ..., 14195, 14195])
            item_indices : 아이템의 인덱스 정보
                ex) tensor([   50,    65,   ..., 14960, 11527])
            feats : 특징 정보 
        Returns: 
            output : 유저-아이템 쌍에 대한 추천 결과 
                ex) tensor([  9.4966,  22.0261, ..., -19.3535, -23.0212])
        """
        user_embedding_mf = self.user_embedding_mf(user_indices)
#         print(user_embedding_mf.shape)
        item_embedding_mf = self.item_embedding_mf(item_indices)
#         print(item_embedding_mf.shape)
        mf_output = torch.mul(user_embedding_mf, item_embedding_mf)  # element wise
        
        user_embedding_mlp = self.user_embedding_mlp(user_indices)
        item_embedding_mlp = self.item_embedding_mlp(item_indices)
        genre_mid_embedding_mlp = self.genre_mid_embeddig(feats[5])
#         genre_large_embedding_mlp = self.genre_large_embeddig(feats[4])
        pr_interest_1_embedding_mlp = self.pr_interest_1_embedding(feats[3])
        ch_interest_1_embedding_mlp = self.ch_interest_1_embedding(feats[4])
#         pr_interest_2_embedding_mlp = self.pr_interest_2_embedding(feats[2])
#         ch_interest_2_embedding_mlp = self.ch_interest_2_embedding(feats[3])
#         pr_interest_3_embedding_mlp = self.pr_interest_3_embedding(feats[2])
#         ch_interest_3_embedding_mlp = self.ch_interest_3_embedding(feats[3])
        
        sex_embedding_mlp = self.sex_embedding(feats[2])
        input_feature = torch.cat((user_embedding_mlp, item_embedding_mlp,genre_mid_embedding_mlp,
                                   pr_interest_1_embedding_mlp,ch_interest_1_embedding_mlp,
#                                    pr_interest_2_embedding_mlp,ch_interest_2_embedding_mlp,
#                                    pr_interest_3_embedding_mlp,ch_interest_3_embedding_mlp,
                                   sex_embedding_mlp,feats[0].unsqueeze(1),feats[1].unsqueeze(1)), -1)
        mlp_output = self.mlp_layers(input_feature)
        
        output = torch.cat([mlp_output, mf_output], dim=-1)
        output = self.affine_output(output).squeeze(-1)
        return output

In [33]:
def make_UIdataset(train, neg_ratio):
    
    UIdataset = {}
    for user_id, items_by_user in enumerate(train):
        UIdataset[user_id] = []
        # positive 샘플 계산 
        pos_item_ids = np.where(items_by_user > 0.5)[0]
        num_pos_samples = len(pos_item_ids)

        # negative 샘플 계산 (random negative sampling) 
        num_neg_samples = neg_ratio * num_pos_samples
        neg_items = np.where(items_by_user < 0.5)[0]
        neg_item_ids = np.random.choice(neg_items, min(num_neg_samples, len(neg_items)), replace=False)
        UIdataset[user_id].append(np.concatenate([pos_item_ids, neg_item_ids]))
        
        # feature 'age' 
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]): 
            features.append(user_features['age'][user_id])
        UIdataset[user_id].append(np.array(features))
        
        # feature 'watch' 
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]):
            if (user_id, item_id) in watch_dict.keys():
                features.append(watch_dict[(user_id, item_id)])
            else:
                features.append(0)
        UIdataset[user_id].append(np.array(features))
        
        # feature 'sex' 
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]): 
            features.append(user_features['sex'][user_id])
        UIdataset[user_id].append(np.array(features))
        

        
        # feature 'genre_mid'
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]): 
            features.append(item_features['genre_mid'][item_id])
        UIdataset[user_id].append(np.array(features))
        
        # feature 'tag'
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]): 
            features.append(tag_dict['tag'][item_id])
        UIdataset[user_id].append(np.array(features))
        
        # feature 'cluster8'
        features = []
        for item_id in np.concatenate([pos_item_ids, neg_item_ids]): 
            features.append(cluster_dict['clus8'][item_id])
        UIdataset[user_id].append(np.array(features))
        
        
        # label 저장  
        pos_labels = np.ones(len(pos_item_ids))
        neg_labels = np.zeros(len(neg_item_ids))
        UIdataset[user_id].append(np.concatenate([pos_labels, neg_labels]))

    return UIdataset


In [36]:
UIdataset = make_UIdataset(train, neg_ratio=cfg.neg_ratio)

In [37]:
UIdataset[3]
# 1. 나이
# 2. watching2
# 3. 성별
# 4. genre
# 5. tag
# 6. cluster8

[array([   16,    17,    18, ..., 18650, 21729,  3569]),
 array([5, 5, 5, ..., 5, 5, 5]),
 array([0.99047619, 1.        , 0.98529412, ..., 0.        , 0.        ,
        0.        ]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([4, 4, 4, ..., 3, 5, 5]),
 array([22, 22, 22, ...,  5,  2,  2]),
 array([4, 4, 4, ..., 4, 1, 1]),
 array([1., 1., 1., ..., 0., 0., 0.])]

In [38]:
def make_batchdata(user_indices, batch_idx, batch_size):
    
    batch_user_indices = user_indices[batch_idx*batch_size : (batch_idx+1)*batch_size]
    batch_user_ids = []
    batch_item_ids = []
    batch_feat0 = []
    batch_feat1 = []
    batch_feat2 = []
    batch_feat3 = []
    batch_feat4 = []
    batch_feat5 = []
    batch_labels = []
    
    for user_id in batch_user_indices:
        
        item_ids = UIdataset[user_id][0] # 시청 아이템 index
        
        feat0 = UIdataset[user_id][1] # 나이
        feat1 = UIdataset[user_id][2] # 시청시간
        feat2 = UIdataset[user_id][3] # 성별
        
        feat3 = UIdataset[user_id][4] # 장르
        feat4 = UIdataset[user_id][5] # 태그
        feat5 = UIdataset[user_id][6] # cluster8
        
        labels = UIdataset[user_id][7] # 평점
        
        user_ids = np.full(len(item_ids), user_id)
        batch_user_ids.extend(user_ids.tolist())
        batch_item_ids.extend(item_ids.tolist())
        
        batch_feat0.extend(feat0.tolist())
        batch_feat1.extend(feat1.tolist())
        batch_feat2.extend(feat2.tolist())
        batch_feat3.extend(feat3.tolist())
        batch_feat4.extend(feat4.tolist())
        batch_feat5.extend(feat5.tolist())
        
        batch_labels.extend(labels.tolist())
        
    return batch_user_ids, batch_item_ids, batch_feat0, batch_feat1, batch_feat2, batch_feat3, batch_feat4, batch_feat5, batch_labels

def update_avg(curr_avg, val, idx):
    """ 현재 epoch 까지의 평균 값을 계산 
    """
    return (curr_avg * idx + val) / (idx + 1)

In [39]:
def train_epoch(cfg, model, optimizer, criterion): 
    model.train()
    curr_loss_avg = 0.0

    user_indices = np.arange(cfg.n_users)
    np.random.RandomState(cfg.epoch).shuffle(user_indices)
    batch_num = int(len(user_indices) / cfg.batch_size) + 1
    bar = tqdm(range(batch_num), leave=False)
    
    for step, batch_idx in enumerate(bar):
        user_ids, item_ids, feat0, feat1, feat2, feat3, feat4, feat5, labels = make_batchdata(user_indices, batch_idx, cfg.batch_size)
        
        # 배치 사용자 단위로 학습
        user_ids = torch.LongTensor(user_ids).to(cfg.device)
        item_ids = torch.LongTensor(item_ids).to(cfg.device)
        
        feat0 = torch.FloatTensor(feat0).to(cfg.device) # 나이: 연속형 -> FloatTensor
        feat1 = torch.FloatTensor(feat1).to(cfg.device) # 시청시간: 연속형 -> FloatTensor
        
        # Long Tensor
        feat2 = torch.LongTensor(feat2).to(cfg.device) # 성별
        
        feat3 = torch.LongTensor(feat3).to(cfg.device) # genre
        feat4 = torch.LongTensor(feat4).to(cfg.device) # tag
        feat5 = torch.LongTensor(feat5).to(cfg.device) # cluster8
        
        labels = torch.FloatTensor(labels).to(cfg.device)
        labels = labels.view(-1, 1)

        # grad 초기화
        optimizer.zero_grad()

        # 모델 forward
        output = model.forward(user_ids, item_ids, [feat0, feat1, feat2, feat3, feat4, feat5])
        output = output.view(-1, 1)

        loss = criterion(output, labels)

        # 역전파
        loss.backward()

        # 최적화
        optimizer.step()    
        if torch.isnan(loss):
            print('Loss NAN. Train finish.')
            break
        curr_loss_avg = update_avg(curr_loss_avg, loss, step)
        
        msg = f"epoch: {cfg.epoch}, "
        msg += f"loss: {curr_loss_avg.item():.5f}, "
        msg += f"lr: {optimizer.param_groups[0]['lr']:.6f}"
        bar.set_description(msg)
    rets = {'losses': np.around(curr_loss_avg.item(), 5)}
    return rets

In [40]:
def valid_epoch(cfg, model, data, mode='valid'):
    pred_list = []
    model.eval()
    
    query_user_ids = data['profile_id'].unique() # 추론할 모든 user array 집합
    
    full_item_ids = np.array([c for c in range(cfg.n_items)]) # 추론할 모든 item array 집합
    full_item_ids_feat3 = [item_features['genre_mid'][c] for c in full_item_ids]
    full_item_ids_feat4 = [tag_dict['tag'][c] for c in full_item_ids]
    full_item_ids_feat5 = [cluster_dict['clus8'][c] for c in full_item_ids] # cluster8
    
    for user_id in tqdm(query_user_ids):
        with torch.no_grad():
            user_ids = np.full(cfg.n_items, user_id)
            
            user_ids = torch.LongTensor(user_ids).to(cfg.device)
            item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
            
            # 사용자 feature
            feat0 = np.full(cfg.n_items, user_features['age'][user_id]) # age
            feat0 = torch.FloatTensor(feat0).to(cfg.device)
            
            # feature1 'watch'
            user_item_watch=[]
            for item_id in full_item_ids:
                if (user_id, item_id) in watch_dict.keys():
                    user_item_watch.append(watch_dict[(user_id, item_id)])
                else:
                    user_item_watch.append(0)
            
            feat1 = np.array(user_item_watch)
            feat1 = torch.FloatTensor(feat1).to(cfg.device)
            
            feat2 = np.full(cfg.n_items, user_features['sex'][user_id]) # sex
            feat2 = torch.LongTensor(feat2).to(cfg.device)
            
            # 아이템 feature
            feat3 = torch.LongTensor(full_item_ids_feat3).to(cfg.device)
            feat4 = torch.LongTensor(full_item_ids_feat4).to(cfg.device)
            feat5 = torch.LongTensor(full_item_ids_feat5).to(cfg.device)
            
            eval_output = model.forward(user_ids, item_ids, [feat0, feat1, feat2, feat3, feat4, feat5]).detach().cpu().numpy()
            pred_u_score = eval_output.reshape(-1)   
        
        pred_u_idx = np.argsort(pred_u_score)[::-1]
        pred_u = full_item_ids[pred_u_idx]
        pred_list.append(list(pred_u[:cfg.top_k]))
        
    pred = pd.DataFrame()
    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list
    
    # 모델 성능 확인 
    if mode == 'valid':
        rets = evaluation(data, pred)
        return rets, pred
    return pred

In [29]:
def recallk(actual, predicted, k = 25):
    set_actual = set(actual)
    recall_k = len(set_actual & set(predicted[:k])) / min(k, len(set_actual))
    return recall_k

def unique(sequence):
    # preserves order
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

def ndcgk(actual, predicted, k = 25):
    set_actual = set(actual)
    idcg = sum([1.0 / np.log(i + 2) for i in range(min(k, len(set_actual)))]) 
    dcg = 0.0
    unique_predicted = unique(predicted[:k])
    for i, r in enumerate(unique_predicted):
        if r in set_actual:
            dcg += 1.0 / np.log(i + 2)
    ndcg_k = dcg / idcg
    return ndcg_k

def evaluation(gt, pred):
    
    gt = gt.groupby('profile_id')['album_id'].unique().to_frame().reset_index()
    gt.columns = ['profile_id', 'actual_list']

    evaluated_data = pd.merge(pred, gt, how = 'left', on = 'profile_id')

    evaluated_data['Recall@25'] = evaluated_data.apply(lambda x: recallk(x.actual_list, x.predicted_list), axis=1)
    evaluated_data['NDCG@25'] = evaluated_data.apply(lambda x: ndcgk(x.actual_list, x.predicted_list), axis=1)

    recall = evaluated_data['Recall@25'].mean()
    ndcg = evaluated_data['NDCG@25'] .mean()
    coverage = (evaluated_data['predicted_list'].apply(lambda x: x[:cfg.top_k]).explode().nunique())/meta_df.index.nunique()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall, 
            "ndcg" :ndcg, 
            "coverage" :coverage, 
            "score" :score}
    return rets

In [30]:
# model 생성 및 optimizer, loss 함수 설정 
model = NeuMF(cfg).to(cfg.device)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.reg_lambda)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.91)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [31]:
total_logs = defaultdict(list)
best_scores  = 0
for epoch in range(cfg.epochs+1):
    cfg.epoch = epoch
    train_results = train_epoch(cfg, model, optimizer, criterion)
    
    scheduler.step()
    # cfg.check_epoch 번의 epoch 마다 성능 확인 
    if epoch % cfg.check_epoch == 0: 
        valid_results, _ = valid_epoch(cfg, model, valid)

        logs = {
            'Train Loss': train_results['losses'],
            f'Valid Recall@{cfg.top_k}': valid_results['recall'],
            f'Valid NDCG@{cfg.top_k}': valid_results['ndcg'],
            'Valid Coverage': valid_results['coverage'],
            'Valid Score': valid_results['score'],
            }

        # 검증 성능 확인 
        for key, value in logs.items():
            total_logs[key].append(value)

        if epoch == 0:
            print("Epoch", end=",")
            print(",".join(logs.keys()))

        print(f"{epoch:02d}  ", end="")
        print("  ".join([f"{v:0.6f}" for v in logs.values()]))
        
        # 가장 성능이 좋은 가중치 파일을 저장 
        if best_scores <= valid_results['score']: 
            best_scores = valid_results['score']
            torch.save(model.state_dict(), os.path.join(saved_path, 'model(best_scores)_newexp15.pth'))  # 

  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

Epoch,Train Loss,Valid Recall@25,Valid NDCG@25,Valid Coverage,Valid Score
00  13258.392580  0.591660  0.499085  0.248627  0.568516


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

01  4079.033690  0.615045  0.509248  0.262671  0.588596


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

02  3276.695560  0.631408  0.507447  0.258658  0.600418


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

03  2464.078120  0.649035  0.508765  0.261417  0.613967


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

04  1714.737430  0.645896  0.495131  0.273254  0.608205


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

05  1080.913090  0.649040  0.496890  0.294871  0.611003


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

06  675.606690  0.639794  0.482320  0.302545  0.600425


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

07  444.111420  0.643601  0.485914  0.310771  0.604179


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

08  304.613650  0.637845  0.480143  0.307812  0.598420


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/7299 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
model.load_state_dict(torch.load(os.path.join(saved_path, 'model(best_scores)_newexp15.pth')))

<All keys matched successfully>

In [33]:
submission_path = os.path.join(data_path, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission = valid_epoch(cfg, model, submission, mode='test')

  0%|          | 0/8311 [00:00<?, ?it/s]

In [34]:
submission.to_csv(os.path.join(output_path, 'submission_15.csv'), index = False)