In [1]:
import pandas
import tensorflow

# Print the installed pandas and TensorFlow versions.
# NOTE(review): in the cells visible here TensorFlow is only used for this
# version print; the model cells use PyTorch — confirm the import is needed.
print(pandas.__version__)
print(tensorflow.__version__)

1.3.3
2.6.0


In [2]:
import datetime as dt
from pathlib import Path
import os
import time
from datetime import datetime
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Install a global filter that ignores every warning category from here on.
warnings.filterwarnings('ignore')

In [3]:
# Dataset directory under the current user's home directory.
# Path.home() is used instead of os.getenv('HOME') + '...' because getenv
# returns None when HOME is unset, which makes the string concatenation
# raise TypeError before any data is read.
data_path = Path.home() / 'aiffel' / 'yoochoose' / 'data'
# Ratings file consumed by load_data().
train_path = data_path / 'ratings.dat'

def load_data(data_path: Path, nrows=None):
    """Read a header-less, ``::``-delimited ratings file into a DataFrame.

    Args:
        data_path: Path of the file to read.
        nrows: Optional maximum number of rows to read; ``None`` reads all.

    Returns:
        DataFrame with columns ``['UserId', 'ItemId', 'Rating', 'Time']``
        taken from the first four fields of each line. The first three are
        requested as int32; ``Time`` keeps whatever dtype pandas infers.
    """
    data = pd.read_csv(
        data_path,
        sep='::',
        # A multi-character separator is only supported by the python parser.
        # Requesting it explicitly avoids the implicit fallback from the C
        # engine and the ParserWarning that comes with it.
        engine='python',
        header=None,
        usecols=[0, 1, 2, 3],
        dtype={0: np.int32, 1: np.int32, 2: np.int32},
        nrows=nrows,
    )
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

# Load the complete ratings file (no row limit).
data = load_data(train_path, nrows=None)
# Order rows by user id and, within each user, by timestamp.
data = data.sort_values(by=['UserId', 'Time'])
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [22]:
# When building a recommender system, the first things to check are the
# number of users (sessions) and the number of items.
data['UserId'].nunique(), data['ItemId'].nunique()

(6040, 3706)

In [24]:
# Number of rows (rated items) per UserId.
UserId_length = data.groupby('UserId').size()
UserId_length

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6040, dtype: int64

In [25]:
# Median and mean number of rows per user.
UserId_length.median(), UserId_length.mean()

(96.0, 165.5975165562914)

In [26]:
# Smallest and largest number of rows for any single user.
UserId_length.min(), UserId_length.max()

(20, 2314)

In [27]:
# 99.9th percentile of the per-user row counts.
UserId_length.quantile(0.999)

1343.181000000005

## 데이터 전처리

In [20]:
# Interpret the integer Time column as seconds since the Unix epoch and
# replace it with pandas datetimes.
data['Time'] = pd.to_datetime(data['Time'], unit='s')

In [21]:
# Dataset summary: distinct users and items, how often each rating value
# occurs, and the earliest/latest timestamp.
print(f"Number of unique users: {data['UserId'].nunique()}")
print(f"Number of unique movies: {data['ItemId'].nunique()}")
print(f"Rating distribution:\n{data['Rating'].value_counts()}")
print(f"Time range: {data['Time'].min()} to {data['Time'].max()}")

Number of unique users: 6040
Number of unique movies: 3706
Rating distribution:
4.0    348971
3.0    261197
5.0    226310
2.0    107557
1.0     56174
Name: Rating, dtype: int64
Time range: 2000-04-25 23:05:32 to 2003-02-28 17:49:50


## 미니 배치의 구성

In [5]:
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    """One fixed-length item sequence per user, for next-item prediction.

    Each sample is built from a single user's ``ItemId`` list (in the row
    order of ``data``): the list is cut to its last ``max_session_length``
    items, or left-padded with 0 when shorter, and then split into an
    ``(inputs, targets)`` pair where ``targets`` is ``inputs`` shifted one
    step forward.

    Args:
        data: DataFrame with at least ``'UserId'`` and ``'ItemId'`` columns.
            Rows are not re-sorted here, so the frame must already be in the
            order the sequences should follow.
        max_session_length: Window length before the shift; ``inputs`` and
            ``targets`` each hold ``max_session_length - 1`` items.

    Raises:
        ValueError: If ``max_session_length`` is smaller than 2. A value of 0
            would make the ``[-0:]`` slice keep the whole (variable-length)
            session, and 1 would produce empty inputs and targets.
    """

    def __init__(self, data, max_session_length=10):
        if max_session_length < 2:
            raise ValueError(
                f"max_session_length must be >= 2, got {max_session_length}"
            )
        self.data = data
        self.max_session_length = max_session_length
        # Series indexed by UserId whose values are that user's ItemId list.
        self.user_sessions = data.groupby('UserId')['ItemId'].apply(list)

    def __len__(self):
        """Number of users, i.e. number of samples."""
        return len(self.user_sessions)

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` int64 arrays for the idx-th user."""
        session = self.user_sessions.iloc[idx]
        # Keep only the most recent items; a shorter session is kept whole.
        window = session[-self.max_session_length:]
        # Left-pad with 0 so every sample has the same length.
        padding = [0] * (self.max_session_length - len(window))
        sequence = padding + window
        # Explicit int64 so batches collate to LongTensors on every platform.
        inputs = np.array(sequence[:-1], dtype=np.int64)
        targets = np.array(sequence[1:], dtype=np.int64)
        return inputs, targets
    
# One sample per user, using the default max_session_length of 10.
dataset = MovieLensDataset(data)
# Serve the samples in shuffled mini-batches of 64.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

## 모델 구성

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

class GRU4Rec(nn.Module):
    """Embedding -> GRU -> linear scorer that predicts the next item.

    Args:
        num_items: Size of the embedding table (largest item index + 1).
        hidden_size: Width of both the item embeddings and the GRU state.
        output_size: Number of logits produced per sequence.
        num_layers: Number of stacked GRU layers.
        padding_idx: Optional item index treated as padding. Its embedding is
            initialised to zeros and receives no gradient. ``None`` (default)
            keeps every index trainable.
    """

    def __init__(self, num_items, hidden_size, output_size, num_layers=1,
                 padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(num_items, hidden_size,
                                      padding_idx=padding_idx)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Score all items from a batch of item-index sequences.

        Args:
            x: Integer tensor of item indices, batch dimension first.

        Returns:
            Tensor of shape (batch, output_size) with one logit per item,
            computed from the GRU output at the last time step only.
        """
        x = self.embedding(x)
        output, _ = self.gru(x)
        # Only the final time step is used for the next-item prediction.
        output = self.fc(output[:, -1, :])
        return output

# Size the embedding table as max ItemId + 1 so that every raw ItemId, as
# well as index 0, is a valid embedding index.
num_items = data['ItemId'].max() + 1
# The same value is used as output_size: one logit per possible item index.
model = GRU4Rec(num_items, hidden_size=100, output_size=num_items, num_layers=1)

## 모델 학습

In [8]:
def train(model, dataloader, num_epochs=10, lr=0.001):
    """Train ``model`` for next-item prediction with Adam and cross-entropy.

    For every batch the model scores all items from ``inputs`` and is
    penalised against the last element of each ``targets`` row. The mean
    batch loss is printed once per epoch.

    Args:
        model: Module mapping an integer (batch, seq) tensor to
            (batch, num_classes) logits.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        num_epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        None.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # Switch to training mode explicitly: a model that was previously put
        # in eval mode (e.g. by an evaluation pass) would otherwise be
        # optimised with dropout/batch-norm layers still in inference mode.
        model.train()
        total_loss = 0.0
        num_batches = 0
        for inputs, targets in dataloader:
            inputs, targets = inputs.long(), targets.long()
            optimizer.zero_grad()
            outputs = model(inputs)  # [batch_size, num_classes]

            # Only the last item of each target row is the label
            # (next-item prediction).
            last_targets = targets[:, -1]

            loss = criterion(outputs, last_targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Count batches instead of calling len(dataloader) so an empty loader
        # reports nan rather than raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}")

# Fit the model for 10 epochs with Adam at a learning rate of 0.001.
train(model, dataloader, num_epochs=10, lr=0.001)

Epoch 1/10, Loss: 8.012888858192845
Epoch 2/10, Loss: 7.0707367294713075
Epoch 3/10, Loss: 6.732755520469264
Epoch 4/10, Loss: 6.359347604450426
Epoch 5/10, Loss: 5.926783953214946
Epoch 6/10, Loss: 5.455410319880436
Epoch 7/10, Loss: 4.963454929151033
Epoch 8/10, Loss: 4.461613263581929
Epoch 9/10, Loss: 3.967296206323724
Epoch 10/10, Loss: 3.4854824693579425


## 모델 테스트

In [10]:
def evaluate(model, dataloader, k=10):
    """Print Recall@k and MRR@k for next-item prediction.

    For every sample the label is the last element of its ``targets`` row.
    A sample counts as a hit when the label is among the ``k`` highest-scored
    items; its reciprocal rank is ``1 / position`` within that top-k list
    (0 when it is absent). Both sums are averaged over the number of samples
    actually iterated.

    Args:
        model: Module mapping an integer (batch, seq) tensor to
            (batch, num_classes) logits. It is switched to eval mode.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        k: Cut-off of the ranked list; clamped to the number of classes.

    Returns:
        None. The metrics are printed.
    """
    model.eval()
    hits, reciprocal_rank_sum, num_samples = 0.0, 0.0, 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.long(), targets.long()
            outputs = model(inputs)

            # Label = last item of each target row, as a column so it
            # broadcasts against the (batch, k) prediction matrix.
            labels = targets[:, -1].view(-1, 1)

            # topk raises if k exceeds the number of classes, so clamp it.
            top_k = min(k, outputs.size(1))
            _, predicted = outputs.topk(top_k, dim=1)

            matches = predicted == labels
            hits += matches.float().sum().item()

            # Column index of each match is its 0-based rank in the list.
            rank = matches.nonzero(as_tuple=False)[:, 1] + 1
            reciprocal_rank_sum += (1.0 / rank.float()).sum().item()

            # Count what was really evaluated instead of relying on
            # len(dataloader.dataset), which is wrong with drop_last or a
            # sampler and unavailable for plain iterables.
            num_samples += labels.size(0)

    recall = hits / num_samples if num_samples else 0.0
    mrr = reciprocal_rank_sum / num_samples if num_samples else 0.0
    print(f"Recall: {recall}, MRR: {mrr}")

# NOTE(review): this is the same dataloader that train() was fitted on above,
# so the printed Recall/MRR describe fit to the training data, not held-out
# performance.
evaluate(model, dataloader)

Recall: 0.9485099337748344, MRR: 0.7492384664270262
