### **[데이터 로드]**

In [None]:
import pandas
import tensorflow
import datetime as dt
from pathlib import Path
from datetime import datetime
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot 
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from tensorflow.keras.layers import Input, Dense, Dropout, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.utils import to_categorical

In [2]:
data_path = Path(r"C:\Users\user\Desktop\MODULABS\recommender\DAY3\rec_data")
movielens_path = data_path / 'ratings.dat'

def load_data(data_path: Path, nrows=None):
    data = pd.read_csv(data_path, sep='::', header=None, usecols=[0, 1, 2, 3], dtype={0: np.int32, 1: np.int32, 2: np.int32}, nrows=nrows)
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

data = load_data(movielens_path, None)
data.sort_values(['UserId', 'Time'], inplace=True)  # data를 id와 시간 순서로 정렬
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [3]:
data['Time'] = pd.to_datetime(data['Time'], unit='s')
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [4]:
data['UserId'].nunique(), data['ItemId'].nunique()

(6040, 3706)

In [5]:
session_length = data.groupby('UserId').size()
session_length

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6040, dtype: int64

In [6]:
session_length.median(), session_length.mean()

(96.0, 165.5975165562914)

In [7]:
session_length.min(), session_length.max()

(20, 2314)

In [8]:
session_length.quantile(0.80)

253.0

In [9]:
oldest, latest = data['Time'].min(), data['Time'].max()
print(oldest) 
print(latest)

2000-04-25 23:05:32
2003-02-28 17:49:50


### **[강제 세션 구성]**

Movielens dataset에는 session이 없고 ratings만 존재하므로, ratings = click 으로 보고 세션 구성.  
웹 분석 기준상 30분간 활동이 없으면 새로운 세션으로 인식하므로, 30분 기준으로 세션 분리.  
rating 개수의 편차가 크므로 최근 한 달 데이터만 필터.  

In [10]:
month_ago = latest - dt.timedelta(180)       
data = data[data['Time'] > month_ago]     
data

Unnamed: 0,UserId,ItemId,Rating,Time
5200,36,2269,5,2002-12-22 08:05:50
5256,36,1701,4,2002-12-22 08:05:50
5184,36,2694,3,2002-12-22 08:08:14
5364,36,3786,4,2002-12-22 08:08:41
5262,36,2369,4,2002-12-22 08:09:24
...,...,...,...,...
984733,5950,1262,4,2003-02-27 18:17:38
986778,5956,469,3,2002-09-30 03:54:57
992702,5996,168,3,2002-09-03 13:12:26
992459,5996,339,4,2002-10-07 13:24:39


In [11]:
def make_sessions(data: pd.DataFrame, gap_minutes: int, min_len: int):

    gap = pd.Timedelta(minutes = gap_minutes)
    previous_time = data.groupby("UserId")["Time"].shift()
    new_session = (data["Time"] - previous_time > gap) | (data["UserId"] != data["UserId"].shift())
    data["SessionNo"] = new_session.groupby(data["UserId"]).cumsum()
    data["SessionId"] = data["UserId"].astype(str) + "_" + data["SessionNo"].astype(str)

    # clenase short session = 2
    session_len = data.groupby("SessionId").size()
    keep = session_len[session_len >= min_len].index
    data = data[data["SessionId"].isin(keep)].copy()
    
    return data

In [12]:
def split_by_date(data: pd.DataFrame, val_days: int, test_days: int):
    
    final_time = data['Time'].max()
    session_last_time = data.groupby("SessionId")["Time"].max()

    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(test_days + val_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(test_days)].index
    session_in_val = session_last_time[
        (session_last_time < final_time - dt.timedelta(test_days)) &
        (session_last_time >= final_time - dt.timedelta(test_days + val_days))
    ].index

    train = data[data['SessionId'].isin(session_in_train)]
    test  = data[data['SessionId'].isin(session_in_test)]
    valid = data[data['SessionId'].isin(session_in_val)]

    return train, valid, test

In [13]:
# 세션 생성
data_session = make_sessions(data, gap_minutes=1440, min_len=2)
data_session

Unnamed: 0,UserId,ItemId,Rating,Time,SessionNo,SessionId
5200,36,2269,5,2002-12-22 08:05:50,1,36_1
5256,36,1701,4,2002-12-22 08:05:50,1,36_1
5184,36,2694,3,2002-12-22 08:08:14,1,36_1
5364,36,3786,4,2002-12-22 08:08:41,1,36_1
5262,36,2369,4,2002-12-22 08:09:24,1,36_1
...,...,...,...,...,...,...
984731,5950,3948,4,2003-02-27 18:13:57,1,5950_1
984682,5950,3578,4,2003-02-27 18:14:30,1,5950_1
984475,5950,3793,3,2003-02-27 18:15:10,1,5950_1
984660,5950,3555,2,2003-02-27 18:15:37,1,5950_1


In [14]:
# train, val, test 분할
train, val, test = split_by_date(data_session, val_days=6, test_days=6)

In [15]:
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["SessionId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [16]:
stats_info(train, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 7687
	 Sessions: 556
	 Items: 2191
	 First Time : 2002-09-01 19:41:39
	 Last Time : 2003-02-16 01:27:14

* valid Set Stats Info
	 Events: 226
	 Sessions: 22
	 Items: 205
	 First Time : 2003-02-16 21:23:05
	 Last Time : 2003-02-22 06:41:03

* test Set Stats Info
	 Events: 319
	 Sessions: 22
	 Items: 297
	 First Time : 2003-02-20 17:58:16
	 Last Time : 2003-02-28 17:49:50



[TRY 1]
- gap min = 30
- cut off = 30
- val, test = 1

* train Set Stats Info
	 Events: 1381
	 Sessions: 92
	 Items: 941
	 First Time : 2003-01-29 19:36:56
	 Last Time : 2003-02-26 16:18:03

* valid Set Stats Info
	 Events: 10
	 Sessions: 4
	 Items: 10
	 First Time : 2003-02-27 04:30:55
	 Last Time : 2003-02-27 14:22:07

* test Set Stats Info
	 Events: 45
	 Sessions: 3
	 Items: 45
	 First Time : 2003-02-27 17:44:58
	 Last Time : 2003-02-28 17:49:50



[TRY 2]
- gap min = 1440 
- cut off = 180
- val, test = 6

* train Set Stats Info
	 Events: 7687
	 Sessions: 556
	 Items: 2191
	 First Time : 2002-09-01 19:41:39
	 Last Time : 2003-02-16 01:27:14

* valid Set Stats Info
	 Events: 226
	 Sessions: 22
	 Items: 205
	 First Time : 2003-02-16 21:23:05
	 Last Time : 2003-02-22 06:41:03

* test Set Stats Info
	 Events: 319
	 Sessions: 22
	 Items: 297
	 First Time : 2003-02-20 17:58:16
	 Last Time : 2003-02-28 17:49:50

In [17]:
# train set에 없는 아이템이 val, test기간에 생길 수 있으므로 train data를 기준으로 인덱싱합니다.
id2idx = {item_id : index for index, item_id in enumerate(train['ItemId'].unique())}

def indexing(data, id2idx):
    data['item_idx'] = data['ItemId'].map(lambda x: id2idx.get(x, -1))  # id2idx에 없는 아이템은 모르는 값(-1) 처리 해줍니다.
    return data

train = indexing(train, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [18]:
val  = val[val["item_idx"] != -1]
test = test[test["item_idx"] != -1]

In [19]:
def drop_short_sessions(df, min_len=2):
    sizes = df.groupby("SessionId").size()
    keep = sizes[sizes >= min_len].index
    return df[df["SessionId"].isin(keep)].copy()

train = drop_short_sessions(train, min_len=2)
val   = drop_short_sessions(val,   min_len=2)
test  = drop_short_sessions(test,  min_len=2)

In [20]:
stats_info(train, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 7687
	 Sessions: 556
	 Items: 2191
	 First Time : 2002-09-01 19:41:39
	 Last Time : 2003-02-16 01:27:14

* valid Set Stats Info
	 Events: 206
	 Sessions: 19
	 Items: 186
	 First Time : 2003-02-16 21:23:05
	 Last Time : 2003-02-22 06:41:03

* test Set Stats Info
	 Events: 269
	 Sessions: 18
	 Items: 251
	 First Time : 2003-02-20 17:58:16
	 Last Time : 2003-02-28 17:49:08



In [21]:
save_path = data_path / 'ratings_processed'
save_path.mkdir(parents=True, exist_ok=True)

train.to_pickle(save_path / 'train.pkl')
val.to_pickle(save_path / 'valid.pkl')
test.to_pickle(save_path / 'test.pkl')

### **[GRU 파이프라인]**

- Session Dataset

In [22]:
class SessionDataset:
    def __init__(self, data: pd.DataFrame):
        
        self.df = data.sort_values(["SessionId","Time"]).reset_index(drop=True) # SessionId, Time 기준 정렬
        self.click_offsets = self._get_click_offsets()
        self.session_idx = np.arange(self.df["SessionId"].nunique())            # SessionId indexing

    def _get_click_offsets(self):
        
        sizes = self.df.groupby("SessionId").size().values  
        offsets = np.zeros(len(sizes) + 1, dtype=np.int32)
        offsets[1:] = sizes.cumsum()
        return offsets

- Session DataLoader

In [23]:
class SessionDataLoader:
    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        start, end, mask, last_session, finished = self._initialize()
        while not finished:
            min_len = (end - start).min() - 1
            if min_len < 0:  # 안전 가드
                min_len = 0
            for i in range(min_len):
                inp = self.dataset.df["item_idx"].values[start + i]
                tgt = self.dataset.df["item_idx"].values[start + i + 1]
                yield inp, tgt, mask
            start, end, mask, last_session, finished = self._update_status(start, end, min_len, last_session, finished)

    def _initialize(self):
        eff_bs = min(self.batch_size, self.dataset.session_idx.size)
        first_iters = np.arange(eff_bs)
        last_session = eff_bs - 1
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]
        end   = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]
        mask = np.array([], dtype=np.int64)
        finished = (eff_bs == 0)
        return start, end, mask, last_session, finished

    def _update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):
        start += min_len
        mask = np.arange(len(start))[(end - start) == 1]  # 종료될 세션
        for i, idx in enumerate(mask, start=1):
            new_session = last_session + i
            if new_session > self.dataset.session_idx[-1]:
                finished = True
                break
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]
            end[idx]   = self.dataset.click_offsets[self.dataset.session_idx[new_session] + 1]
        last_session += len(mask)
        return start, end, mask, last_session, finished

### **[모델 구조]**

- Evaluation metric

In [24]:
def mrr_k(pred_sorted_idx_row: np.ndarray, truth: int, k: int) -> float:
    hit = np.where(pred_sorted_idx_row[:k] == truth)[0]
    return 1.0 / (hit[0] + 1) if len(hit) > 0 else 0.0

def recall_k(pred_sorted_idx_row: np.ndarray, truth: int, k: int) -> int:
    return int(truth in pred_sorted_idx_row[:k])

- Model Architecture

In [25]:
def create_model(num_items: int, batch_size: int, hsz=50, drop_rate=0.1, lr=1e-3):
    inputs = Input(batch_shape=(batch_size, 1, num_items))
    gru, _ = GRU(hsz, stateful=True, return_state=True, name="GRU")(inputs)
    x = Dropout(drop_rate)(gru)
    logits = Dense(num_items, activation="softmax")(x)
    model = Model(inputs, logits)
    model.compile(loss=categorical_crossentropy, optimizer=Adam(lr), metrics=["accuracy"])
    return model

def reset_hidden_states(model: tensorflow.keras.Model, mask_idx: np.ndarray):
    gru = model.get_layer(name="GRU")
    h = gru.states[0]                      
    h_np = h.numpy()
    for idx in mask_idx:
        h_np[idx, :] = 0
    gru.reset_states(states=h_np)

In [26]:
class Args:
    def __init__(self, tr, val, test, id2idx, batch_size=512, hsz=50, drop_rate=0.1, lr=1e-3, epochs=3, k=20):
        self.tr = tr
        self.val = val
        self.test = test
        self.num_items = len(id2idx)  
        self.batch_size = batch_size
        self.hsz = hsz
        self.drop_rate = drop_rate
        self.lr = lr
        self.epochs = epochs
        self.k = k

def train_model(model, args: Args):
    train_dataset = SessionDataset(args.tr)
    train_loader = SessionDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        pbar = tqdm(train_loader, desc=f"Train Epoch {epoch}", mininterval=1)
        for feat, target, mask in pbar:
            reset_hidden_states(model, mask)

            x = to_categorical(feat, num_classes=args.num_items)
            x = np.expand_dims(x, axis=1)  # (B, 1, num_items)
            y = to_categorical(target, num_classes=args.num_items)

            loss, acc = model.train_on_batch(x, y)
            pbar.set_postfix(loss=float(loss), acc=float(acc))

        r, m = get_metrics(args.val, model, args, args.k)
        print(f"[Val] Recall@{args.k}: {r:.4f} | MRR@{args.k}: {m:.4f}")

def get_metrics(data: pd.DataFrame, model, args: Args, k: int):
    ds = SessionDataset(data)
    loader = SessionDataLoader(ds, batch_size=args.batch_size)
    recalls, mrrs = [], []

    for feat, target, mask in tqdm(loader, desc="Evaluation", mininterval=1):
        reset_hidden_states(model, mask)

        x = to_categorical(feat, num_classes=args.num_items)
        x = np.expand_dims(x, axis=1)
        pred = model.predict(x, batch_size=args.batch_size, verbose=0)

        # 내림차순 정렬된 인덱스 (넘파이로)
        pred_sorted = tensorflow.argsort(pred, axis=-1, direction="DESCENDING").numpy()

        for i in range(len(feat)):
            recalls.append(recall_k(pred_sorted[i], int(target[i]), k))
            mrrs.append(mrr_k(pred_sorted[i], int(target[i]), k))
    return float(np.mean(recalls)), float(np.mean(mrrs))

def test_model(model, args: Args):
    r, m = get_metrics(args.test, model, args, args.k)
    print(f"[Test] Recall@{args.k}: {r:.4f} | MRR@{args.k}: {m:.4f}")

### **[학습 및 평가]**

In [28]:
args = Args(train, val, test, id2idx, batch_size=16, hsz=50, drop_rate=0.1, lr=1e-3, epochs=3, k=20)
model = create_model(num_items=args.num_items, batch_size=args.batch_size, hsz=args.hsz, drop_rate=args.drop_rate, lr=args.lr)

train_model(model, args)
test_model(model, args)

Train Epoch 1: 411it [00:04, 102.32it/s, acc=0, loss=7.58]     
Evaluation: 2it [00:00, 15.59it/s]


[Val] Recall@20: 0.0938 | MRR@20: 0.0084


Train Epoch 2: 411it [00:03, 118.06it/s, acc=0, loss=7.37]     
Evaluation: 2it [00:00, 54.05it/s]


[Val] Recall@20: 0.0625 | MRR@20: 0.0050


Train Epoch 3: 411it [00:03, 120.07it/s, acc=0.0625, loss=7.07]
Evaluation: 2it [00:00, 55.55it/s]


[Val] Recall@20: 0.0000 | MRR@20: 0.0000


Evaluation: 1it [00:00, 50.00it/s]

[Test] Recall@20: 0.0625 | MRR@20: 0.0063



