In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

In [2]:
# Read the three raw CSV tables: user profiles, items, and the user-item
# preference records, in that order.
user_df, item_df, interaction_df = map(
    pd.read_csv,
    ("dataset/user.csv", "dataset/item.csv", "dataset/preferences.csv"),
)

In [3]:
# Load the tokenizer and the encoder from the same "klue/bert-base" checkpoint
# so that the vocabulary and the model weights match.
tokenizer, model = (
    AutoTokenizer.from_pretrained("klue/bert-base"),
    AutoModel.from_pretrained("klue/bert-base"),
)

In [4]:
def embed_texts(texts, batch_size=64):
    """Embed texts in batches with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded, truncated to 128 tokens), run through the
    model without gradient tracking, and the hidden state at sequence
    position 0 (the [CLS] slot for BERT-style tokenizers) is kept as the
    embedding of that text.

    Args:
        texts: Sliceable sequence of strings (e.g. a list) to embed.
        batch_size: Number of texts processed per forward pass.

    Returns:
        A list with one embedding (a list of floats) per input text, in input
        order. An empty input yields an empty list.
    """
    # torch.cat raises on an empty list of tensors, so answer the empty case
    # up front instead of failing after the loop.
    if len(texts) == 0:
        return []

    # Feed each batch on whatever device the model currently lives on, so the
    # function keeps working after the model has been moved to a GPU.
    device = model.device

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding Texts"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep only the position-0 vector and move it to CPU so results from
        # all batches can be concatenated regardless of the compute device.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())
    return torch.cat(embeddings, dim=0).tolist()

In [5]:
def add_column_names_to_values(df):
    """Render every row of ``df`` as a single ``"column:value"`` sentence.

    For each row, the pairs ``<column name>:<cell value>`` are produced in
    column order and joined with single spaces.

    Args:
        df: DataFrame whose rows should be serialised.

    Returns:
        The result of the row-wise ``DataFrame.apply``: one string per row,
        indexed like ``df``.
    """

    def _describe_row(row):
        # Build "name:value" for each column, preserving column order.
        labelled_values = []
        for column_name in df.columns:
            labelled_values.append(f"{column_name}:{row[column_name]}")
        return " ".join(labelled_values)

    return df.apply(_describe_row, axis=1)

In [None]:
# Give every user row a synthetic, position-based ID: "사용자_0", "사용자_1", ...
user_df["사용자 ID"] = [f"사용자_{idx}" for idx in range(len(user_df))]

# Profile columns that are serialised into one "column:value" sentence per user.
user_text_columns = [
    "성별",
    "키",
    "몸무게",
    "거주지역",
    "선호하는 장소",
    "트래킹 난이도",
    "위도",
    "경도",
]
user_sentences = add_column_names_to_values(user_df[user_text_columns])
user_texts = user_sentences.tolist()

# Despite the name, embed_texts() returns nested Python lists, not a tensor.
user_features_tensor = embed_texts(user_texts)

Embedding Texts:  62%|██████████████▏        | 968/1563 [13:46<09:01,  1.10it/s]

In [None]:
# Give every item row a synthetic, position-based ID: "산책로_0", "산책로_1", ...
item_df["산책로 ID"] = [f"산책로_{idx}" for idx in range(len(item_df))]

# Item columns that are serialised into one "column:value" sentence per item.
item_text_columns = [
    "행정구역명",
    "코스 난이도",
    "코스 경관 카테고리",
    "소요시간",
    "주소",
    "위도",
    "경도",
]
item_sentences = add_column_names_to_values(item_df[item_text_columns])
item_texts = item_sentences.tolist()

# Despite the name, embed_texts() returns nested Python lists, not a tensor.
item_features_tensor = embed_texts(item_texts)

In [None]:
# Map each distinct user / item ID to a dense integer index, numbered in order
# of first appearance.
user_id_map = {
    user_id: position
    for position, user_id in enumerate(user_df["사용자 ID"].unique())
}
item_id_map = {
    item_id: position
    for position, item_id in enumerate(item_df["산책로 ID"].unique())
}

# Translate the interaction table's string IDs into those integer indices.
# NOTE(review): the interaction columns are spelled with an underscore
# ("사용자_ID", "산책로_ID") while the ID columns created above use a space;
# presumably the values share the same "<prefix>_<n>" format - confirm, because
# Series.map leaves NaN for any ID that is missing from the dict.
interaction_df["user_numeric_id"] = interaction_df["사용자_ID"].map(user_id_map)
interaction_df["item_numeric_id"] = interaction_df["산책로_ID"].map(item_id_map)

In [None]:
# Fixed seed so that both splits below are reproducible.
random_seed = 42

# First hold out 20% of all interactions as the test set ...
train_df, test_df = train_test_split(
    interaction_df,
    random_state=random_seed,
    test_size=0.2,
)
# ... then carve 20% of the remaining rows off as the validation set, which
# leaves roughly 64% / 16% / 20% for train / val / test.
train_df, val_df = train_test_split(
    train_df,
    random_state=random_seed,
    test_size=0.2,
)

In [None]:
# Bundle everything produced above into one JSON-serialisable dict.
preprocessed_data = {
    # The embeddings are stored under the names the earlier cells actually
    # define (user_features_tensor / item_features_tensor); the previous
    # references to `user_features` / `item_features` were undefined and
    # raised NameError on a fresh kernel. embed_texts() already returns nested
    # Python lists, so both values can be dumped as-is.
    "user_features": user_features_tensor,
    "item_features": item_features_tensor,  # check item_features the same way
    # Each split is stored as a list of row dicts.
    "train": train_df.to_dict(orient="records"),
    "val": val_df.to_dict(orient="records"),
    "test": test_df.to_dict(orient="records"),
    # String ID -> integer index lookups used to build the numeric ID columns.
    "user_id_map": user_id_map,
    "item_id_map": item_id_map,
}

# ensure_ascii=False keeps the Korean keys and values human-readable in the file.
with open("dataset/preprocessed_data.json", "w", encoding="utf-8") as f:
    json.dump(preprocessed_data, f, indent=4, ensure_ascii=False)

print("Preprocessed data saved successfully as JSON!")