In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

import math
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split

In [6]:
# 파일 경로
user_word_df = '/content/drive/MyDrive/user_word_df.csv'

# CSV 파일을 DataFrame으로 읽기
user_word_df = pd.read_csv(user_word_df)
user_word_df.drop(columns=user_word_df.columns[0], axis=1, inplace=True)

# DataFrame 확인
print(user_word_df)

    Unnamed: 0  1  2    3  4  5  6  7  8  9  ...  4870  4871  4872  4873  \
0        사회기사1  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
1        사회기사2  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
2        사회기사3  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
3        사회기사4  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
4        사회기사5  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
..         ... .. ..  ... .. .. .. .. .. ..  ...   ...   ...   ...   ...   
335    국제기사336  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
336   스포츠기사337  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
337   스포츠기사338  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
338    사회기사339  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   
339   스포츠기사340  0  0  0.0  0  0  0  0  0  0  ...     0     0     0     0   

     4874  4875  4876  4877  4878  4879  
0       0     0     0     0     0     0  
1  

In [8]:
# 단어 추천 방법 2 : Matrix Factorization
# 행렬을 두 개의 저차원 행렬로 분해하여 각각 사용자와 단어의 잠재적 특성을 나타냄
# 잠재적 특성을 기반으로 사용자와 단어 사이의 관계를 예측

# 행렬 분해 모델 클래스 정의
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, latent_dim, dropout_rate=0.8, l2=0.01):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.dropout = nn.Dropout(dropout_rate)
        self.l2 = l2

        nn.init.normal_(self.user_embedding.weight, mean=0.0, std=0.01)
        nn.init.normal_(self.item_embedding.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_indices, item_indices):
        user_latent = self.dropout(self.user_embedding(user_indices))
        item_latent = self.dropout(self.item_embedding(item_indices))
        user_bias = self.user_bias(user_indices).squeeze()
        item_bias = self.item_bias(item_indices).squeeze()

        prediction = torch.sum(user_latent * item_latent, dim=1) + user_bias + item_bias
        return prediction

    def loss(self, prediction, target):
        mse_loss = F.mse_loss(prediction, target.float())
        l2_loss = sum(torch.norm(param) for param in self.parameters())
        total_loss = mse_loss + self.l2 * l2_loss
        return total_loss

class UserWordDataset(Dataset):
    def __init__(self, user_word_matrix):
        self.user_word_matrix = user_word_matrix.values
        self.num_users, self.num_items = user_word_matrix.shape

    def __len__(self):
        return self.num_users * self.num_items

    def __getitem__(self, idx):
        user_id = idx // self.num_items
        item_id = idx % self.num_items
        rating = self.user_word_matrix[user_id, item_id]
        return user_id, item_id, torch.tensor(rating, dtype=torch.float)

In [9]:
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = UserWordDataset(user_word_df)

df = user_word_df

def split_data(dataset, batch_size=64):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = UserWordDataset(train_df)
    test_dataset = UserWordDataset(test_df)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

train_loader, test_loader = split_data(dataset)

# 베이지안 최적화를 위한 목적 함수 정의
def objective(latent_dim, lr, dropout_rate, l2):
    print(f"하이퍼파라미터: latent_dim={latent_dim}, lr={lr}, dropout_rate={dropout_rate}, l2={l2}")
    num_users, num_items = df.shape
    model = MatrixFactorization(num_users, num_items, int(latent_dim), dropout_rate, l2).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # 학습 과정
    for epoch in range(5):  # 에포크 수 조정 가능
        epoch_losses = []  # 에포크별 평균 loss를 계산하기 위해 사용
        for user_ids, item_ids, ratings in train_loader:
            user_ids, item_ids, ratings = user_ids.to(device), item_ids.to(device), ratings.to(device)
            optimizer.zero_grad()
            outputs = model(user_ids, item_ids)
            loss = model.loss(outputs, ratings)
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        epoch_avg_loss = np.mean(epoch_losses)
        print(f"에포크 {epoch+1}, 평균 Loss: {epoch_avg_loss}")

    # 검증 과정 - 여기서는 간단하게 훈련 데이터로 대체
    model.eval()
    losses = []
    with torch.no_grad():
        for user_ids, item_ids, ratings in test_loader:
            user_ids, item_ids, ratings = user_ids.to(device), item_ids.to(device), ratings.to(device)
            outputs = model(user_ids, item_ids)
            loss = model.loss(outputs, ratings)
            losses.append(loss.item())

    avg_loss = np.mean(losses)
    print(f"검증 Loss: {avg_loss}")
    return -avg_loss  # 최소화 대신 최대화를 위해 음수 반환

# 베이지안 최적화 실행
optimizer = BayesianOptimization(
    f=objective,
    pbounds={
        "latent_dim": (50, 200),
        "lr": (1e-4, 1e-2),
        "dropout_rate": (0.1, 0.5),
        "l2": (1e-5, 1e-3),
    },
    random_state=1,
)
optimizer.maximize(init_points=2, n_iter=5)

# 최적의 하이퍼파라미터 출력
print(optimizer.max)

|   iter    |  target   | dropou... |    l2     | latent... |    lr     |
-------------------------------------------------------------------------
하이퍼파라미터: latent_dim=50.017156222601734, lr=0.003093092469055214, dropout_rate=0.2668088018810296, l2=0.0007231212485077365
에포크 1, 평균 Loss: 2.3692015162096665
에포크 2, 평균 Loss: 2.3253165266306226
에포크 3, 평균 Loss: 2.3213448965502597
에포크 4, 평균 Loss: 2.319120397033755
에포크 5, 평균 Loss: 2.318329994701408
검증 Loss: 2.25722716134864
| [0m1        [0m | [0m-2.257   [0m | [0m0.2668   [0m | [0m0.0007231[0m | [0m50.02    [0m | [0m0.003093 [0m |
하이퍼파라미터: latent_dim=77.93903170665064, lr=0.003521051197726173, dropout_rate=0.15870235632684523, l2=0.00010141520882110982
에포크 1, 평균 Loss: 2.3865921436333184
에포크 2, 평균 Loss: 2.3736514520429646
에포크 3, 평균 Loss: 2.375516441701849
에포크 4, 평균 Loss: 2.374292550673093
에포크 5, 평균 Loss: 2.3748604029656746
검증 Loss: 2.2864408362066593
| [0m2        [0m | [0m-2.286   [0m | [0m0.1587   [0m | [0m0.0001014[0m | 

In [10]:
# 최적 하이퍼파라미터 사용
best_params = optimizer.max['params']
print("최적의 하이퍼파라미터:", best_params)

latent_dim = int(best_params['latent_dim'])
lr = best_params['lr']
dropout_rate = best_params['dropout_rate']
l2 = best_params['l2']

# 최적 하이퍼파라미터로 모델 초기화
num_users, num_items = df.shape
model = MatrixFactorization(num_users, num_items, latent_dim, dropout_rate, l2).to(device)

# 옵티마이저 설정
optimizer = optim.Adam(model.parameters(), lr=lr)

# 학습 과정
num_epochs = 10  # 최종 모델 학습을 위해 에포크 수 조정
for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for user_ids, item_ids, ratings in train_loader:
        user_ids, item_ids, ratings = user_ids.to(device), item_ids.to(device), ratings.to(device)
        optimizer.zero_grad()
        outputs = model(user_ids, item_ids)
        loss = model.loss(outputs, ratings)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    epoch_avg_loss = np.mean(epoch_losses)
    print(f"에포크 {epoch+1}, 평균 Loss: {epoch_avg_loss}")

model_path = 'matrix_factorization_model.pth'
torch.save(model.state_dict(), model_path)
print("모델이 '{}'에 저장되었습니다.".format(model_path))

# 테스트 과정
model.eval()
test_losses = []
with torch.no_grad():
    for user_ids, item_ids, ratings in test_loader:
        user_ids, item_ids, ratings = user_ids.to(device), item_ids.to(device), ratings.to(device)
        outputs = model(user_ids, item_ids)
        loss = model.loss(outputs, ratings)
        test_losses.append(loss.item())

avg_test_loss = np.mean(test_losses)
print(f"테스트 데이터에 대한 평균 Loss: {avg_test_loss}")

최적의 하이퍼파라미터: {'dropout_rate': 0.5, 'l2': 0.001, 'latent_dim': 67.67865540758136, 'lr': 0.0001}
에포크 1, 평균 Loss: 3.6533404152728606
에포크 2, 평균 Loss: 2.1768567823615697
에포크 3, 평균 Loss: 2.084598619954662
에포크 4, 평균 Loss: 2.0804560221354227
에포크 5, 평균 Loss: 2.079886250848095
에포크 6, 평균 Loss: 2.079520409291547
에포크 7, 평균 Loss: 2.079174592707621
에포크 8, 평균 Loss: 2.0788387768948486
에포크 9, 평균 Loss: 2.0785204534451442
에포크 10, 평균 Loss: 2.0781882823426714
모델이 'matrix_factorization_model.pth'에 저장되었습니다.
테스트 데이터에 대한 평균 Loss: 2.078276413276598


In [11]:
model_info = {
    'state_dict': model.state_dict(),
    'hyperparams': {
        'latent_dim': latent_dim,
        'lr': lr,
        'dropout_rate': dropout_rate,
        'l2': l2
    },
    'num_users': num_users,
    'num_items': num_items
}

# 정보를 파일로 저장
model_path = 'matrix_factorization_model_with_hyperparams.pth'
torch.save(model_info, model_path)
print("모델과 하이퍼파라미터가 '{}'에 저장되었습니다.".format(model_path))

모델과 하이퍼파라미터가 'matrix_factorization_model_with_hyperparams.pth'에 저장되었습니다.
