<h3>라이브러리</h3>

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np, pandas as pd, torch, pickle
from collections import OrderedDict
import torch.nn.functional as F
from google.colab import drive
import os, csv, math
import gc
import re

In [None]:
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

-------------------------------------------

In [None]:
# --- Fixed mapping: q label <-> question id ---
QLABEL_TO_QID = {
    # omitted
}
# Reverse lookup, built by swapping the keys and values of QLABEL_TO_QID
QID_TO_QLABEL = {v:k for k,v in QLABEL_TO_QID.items()}

# --- Utilities ---
def normalize_qid_from_colname(colname: str):
    """Turn a column name into a normalized question id.

    The first number in the name (optionally followed by ``-<number>``, e.g.
    ``12-3``) is extracted and prefixed with "문항". Returns ``None`` when the
    name contains no digits. The argument is passed through ``str()`` first,
    so non-string column names are accepted.
    """
    found = re.search(r'(\d+(?:-\d+)?)', str(colname))
    if found is None:
        return None
    number = found.group(1)
    return f"문항{number}"

def mean_vec(series: pd.Series) -> np.ndarray:
    """Return the element-wise mean of the vectors held in *series* as float32.

    The series values are stacked along a new first axis and averaged over
    that axis, so every element must have the same shape.
    """
    stacked = np.stack(series.values)
    averaged = np.mean(stacked, axis=0)
    return averaged.astype(np.float32)

def vec_to_str(v: np.ndarray) -> str:
    """Serialize a vector as ``[x1,x2,...]`` with six decimals per element."""
    formatted = [format(float(component), ".6f") for component in v]
    body = ",".join(formatted)
    return "[" + body + "]"

def uniq_sorted_join(vals, sep="|"):
    """Join the distinct non-missing values of *vals*, sorted as strings.

    Values are converted with ``str()`` before de-duplication and sorting.
    Returns ``None`` when nothing is left after dropping missing values.
    """
    distinct = {str(item) for item in vals if pd.notna(item)}
    if not distinct:
        return None
    return sep.join(sorted(distinct))

# --- Load pickles ---
# Paths of the pickle files to load (left blank here)
PKL_PATHS = [
    "",
    "",
]

def load_one_pickle_flat(pkl_path: str, order="col_first") -> pd.DataFrame:
    """Load one embedding pickle and flatten it to one row per (uid, column).

    The pickle holds one record, or a list of records. Each record is a
    mapping with the keys ``"id"``, ``"columns"`` and ``"embeddings"``.

    Args:
        pkl_path: Path of the pickle file to read.
        order: ``"col_first"`` when the embedding rows are grouped by column
            (every id for the first column, then every id for the second,
            ...). Any other value treats the rows as grouped by id.

    Returns:
        DataFrame with the columns ``uid`` (string), ``colname`` and
        ``embedding`` (one 1-D float32 array per row). When the pickle holds
        an empty list, an empty frame with those columns is returned.

    Raises:
        ValueError: If a record's embeddings do not contain exactly
            ``len(id) * len(columns)`` vectors.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only call this on files from a trusted source.
    with open(pkl_path, "rb") as f:
        rec = pickle.load(f)
    items = rec if isinstance(rec, list) else [rec]
    if not items:
        # pd.concat refuses an empty list, so hand back an empty frame with
        # the same columns a populated result would have.
        return pd.DataFrame(columns=["uid", "colname", "embedding"])

    frames = []
    for it in items:
        ids = np.asarray(it["id"])
        cols = list(it["columns"])
        emb = it["embeddings"]
        if isinstance(emb, torch.Tensor):
            # detach() first: Tensor.numpy() raises for tensors that still
            # require grad.
            emb = emb.detach().cpu().numpy()
        emb = np.asarray(emb)

        n_ids, n_cols = len(ids), len(cols)
        # The last axis is the embedding dimension, both for flat
        # (rows, dim) input and for input that is already 3-D.
        dim = emb.shape[-1]
        if emb.size != n_ids * n_cols * dim:
            raise ValueError(
                f"embeddings of shape {emb.shape} do not match "
                f"{n_ids} ids x {n_cols} columns in {pkl_path!r}"
            )

        if order == "col_first":
            # Rows are (column, id)-ordered; swap to (id, column).
            emb_3d = emb.reshape(n_cols, n_ids, dim).transpose(1, 0, 2)
        else:
            emb_3d = emb.reshape(n_ids, n_cols, dim)

        # Row k of the flattened array belongs to id k // n_cols and
        # column k % n_cols, which is what repeat/tile produce.
        uid_vec = np.repeat(ids, n_cols).astype(str)
        col_vec = np.tile(cols, n_ids)
        embs_2d = emb_3d.reshape(n_ids * n_cols, dim).astype(np.float32)
        frames.append(
            pd.DataFrame(
                {"uid": uid_vec, "colname": col_vec, "embedding": list(embs_2d)}
            )
        )
    return pd.concat(frames, ignore_index=True)

# Load every pickle into one long table: one row per (uid, colname) with its embedding
long_df = pd.concat([load_one_pickle_flat(p) for p in PKL_PATHS], ignore_index=True)
# Embedding dimension, read from the first row
EMB_DIM = len(long_df.iloc[0]["embedding"])
# Normalized question id derived from the column name (None when the name has no number)
long_df["qid_norm"] = long_df["colname"].apply(normalize_qid_from_colname)

# --- Category mapping ---
# Each entry is read below through its "qids", "main" and "sub" keys
CAT_MAP = [
    # omitted
]
# Expand the mapping to one row per question id
cat_rows = []
for row in CAT_MAP:
    for q in row["qids"]:
        cat_rows.append({"qid_norm": q, "main": row["main"], "sub": row["sub"]})
cat_df = pd.DataFrame(cat_rows).drop_duplicates()

# --- Attach main/sub categories and the q label ---
# Left join keeps items without a category; they are dropped by the dropna below
item_df = long_df.merge(cat_df, on="qid_norm", how="left")
item_df["q"] = item_df["qid_norm"].map(QID_TO_QLABEL)

# --- Mean embedding per user x sub category ---
# Rows lacking a sub category or a q label are excluded before grouping
uid_sub_avg = (
    item_df.dropna(subset=["sub","q"])
           .groupby(["uid","main","sub"], as_index=False)
           .agg(
               sub_embedding=("embedding", mean_vec),
               qids_used=("q", uniq_sorted_join)
           )
)

# --- Serialize the vectors and save ---
uid_sub_avg["dim"] = EMB_DIM
uid_sub_avg["sub_vec"] = uid_sub_avg["sub_embedding"].apply(vec_to_str)
# Keep only the string form of the vector in the output
final_df = uid_sub_avg[["uid","main","sub","qids_used","dim","sub_vec"]]

# Output CSV path (left blank here)
OUT_FILE = ""
final_df.to_csv(OUT_FILE, index=False, encoding="utf-8")

print("✅ 저장 완료:", OUT_FILE)
print(final_df.head(10).to_string(index=False))


In [None]:
# --- 1) Paths / settings ---
INPUT_PATH = ""  # CSV to split (left blank here)
OUT_DIR    = ""  # directory that receives the part files (left blank here)
N = 5  # number of output parts

# Header the input CSV is expected to have
EXPECTED_HEADER = ["uid","main","sub","qids_used","dim","sub_vec"]

# --- 2) First pass: count rows per uid ---

# Insertion order is kept, so `uids` below lists the uids in first-seen order;
# the part boundaries computed later are indices into that order.
uid_counts = OrderedDict()
total_rows = 0

with open(INPUT_PATH, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    # A zero-byte file has no header row. Report it as a RuntimeError, like
    # the "no data rows" case below, instead of leaking a bare StopIteration.
    header = next(reader, None)
    if header is None:
        raise RuntimeError("입력 파일이 비어 있습니다(헤더 없음).")
    if header != EXPECTED_HEADER:
        # Only a warning: processing continues with the header that was found.
        print("헤더가 예상과 다릅니다.")
        print(" - 파일 헤더:", header)
        print(" - 예상 헤더:", EXPECTED_HEADER)

    # Raises ValueError when the file has no "uid" column.
    uid_idx = header.index("uid")
    for row in reader:
        uid = row[uid_idx]
        uid_counts[uid] = uid_counts.get(uid, 0) + 1
        total_rows += 1

if total_rows == 0:
    raise RuntimeError("데이터 행이 없습니다(헤더만 존재).")

uids = list(uid_counts.keys())
counts = [uid_counts[u] for u in uids]
total_uids = len(uids)
# Even share of rows per part; used only in the message below.
target = math.ceil(total_rows / N)

print(f"총 행수: {total_rows}, 고유 uid 수: {total_uids}, 목표 파트당 행수 ≈ {target}")

# --- 3) Compute the part boundaries on uid edges (greedy) ---
# boundaries[k] is the index into `uids` of the last uid assigned to part k.
boundaries = [] 
acc = 0    # rows already assigned to earlier parts
start = 0  # index of the first uid of the part being built
for part_idx in range(1, N):
    # Number of parts still to be built after this one
    parts_left = N - part_idx
    # Ideal row count for this part: the unassigned rows split evenly over
    # this part and the parts after it
    part_target = (total_rows - acc) / (parts_left + 1)

    cur_sum = 0
    end = start - 1
    # At least one uid has to go into the part
    i = start
    while i < total_uids:
        # Forced stop: the uids left are exactly enough to give one to each
        # later part (keeps at least one uid per part)
        remaining_uids = total_uids - i
        if remaining_uids == parts_left:
            # Close this part before the current uid so every remaining part
            # still gets at least one
            break

        cur_sum += counts[i]
        end = i
        i += 1

        if cur_sum >= part_target:
            break

    if end < start:
        # Safety net: put at least one uid into the part.
        # NOTE(review): when there are fewer uids than N, `start` can already
        # equal total_uids here, so `end` points past the last uid; the slice
        # sum below is then 0 and no uid maps to this part.
        end = start

    boundaries.append(end)
    # Prepare for the next part
    acc += sum(counts[start:end+1])
    start = end + 1

# The last part runs to the end
boundaries.append(total_uids - 1)

# uid -> part mapping
uid_to_part = {}
p = 0
for i, uid in enumerate(uids):
    # Advance to the first part whose last-uid index is not before i
    while i > boundaries[p]:
        p += 1
    uid_to_part[uid] = p  # 0..N-1

# --- 4) Second pass: write the part files (header kept, row order kept, uid groups kept whole) ---
base = os.path.splitext(os.path.basename(INPUT_PATH))[0]
# One output path per part: <input name>_part<i>of<N>.csv inside OUT_DIR
out_paths = [os.path.join(OUT_DIR, f"{base}_part{i}of{N}.csv") for i in range(1, N+1)]

writers = []
files = []
try:
    # Open every part file up front and write the header to each;
    # the handles are closed in the finally block below
    for i, path in enumerate(out_paths):
        # utf-8-sig puts a BOM at the start of each output file
        f = open(path, "w", encoding="utf-8-sig", newline="") 
        w = csv.writer(f)
        w.writerow(header)
        writers.append(w)
        files.append(f)

    # Data rows written to each part
    part_row_counts = [0]*N

    with open(INPUT_PATH, "r", encoding="utf-8", newline="") as src:
        reader = csv.reader(src)
        # Skip the header row; the header from the first pass is reused
        _ = next(reader)  
        uid_idx = header.index("uid")

        for row in reader:
            uid = row[uid_idx]
            # A uid missing from the first-pass map falls back to the last part
            part = uid_to_part.get(uid, N-1)
            writers[part].writerow(row)
            part_row_counts[part] += 1
finally:
    # Close whatever was opened, also when an error interrupted the loop
    for f in files:
        f.close()

# --- 5) Summary and verification output ---
for i, path in enumerate(out_paths, start=1):
    print(f"✅ Saved: {os.path.basename(path)} (데이터 행: {part_row_counts[i-1]})")

# Check that the rows written across all parts add up to the rows counted in the first pass
print(f"\n원본 총 행수: {total_rows}")
print(f"파트 합계    : {sum(part_row_counts)}")
if sum(part_row_counts) == total_rows:
    print("검증 성공: 총 합계가 원본과 일치합니다.")
else:
    print("검증 실패: 합계가 일치하지 않습니다. 파일을 확인하세요.")

# Number of distinct uids assigned to each part
part_uid_counts = [0]*N
# NOTE(review): `seen` is not used anywhere in the visible code
seen = set()
for u in uids:
    part_uid_counts[uid_to_part[u]] += 1
print("파트별 uid 그룹 수:", part_uid_counts)