In [2]:
# ===== 1) 충돌 제거 & 필요한 버전 설치 =====
%pip -q uninstall -y pinecone pinecone-client langchain-pinecone >/dev/null
%pip -q install --no-cache-dir "pinecone[asyncio]>=6,<8" "langchain-openai>=0.1.7" "langchain>=0.2.14" "openai>=1.30.0" tiktoken >/dev/null

# ===== 2) 환경변수(Colab userdata 사용 시) =====
import os, glob
from pathlib import Path
try:
    from google.colab import userdata
    os.environ["OPENAI_API_KEY"]   = userdata.get("FINAL_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY","")
    os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY") or os.getenv("PINECONE_API_KEY","")
    # LangSmith (선택)
    for k in ["LANGSMITH_TRACING","LANGSMITH_ENDPOINT","LANGSMITH_API_KEY","LANGSMITH_PROJECT"]:
        v = userdata.get(k) or os.getenv(k)
        if v: os.environ[k] = v
except Exception:
    pass

assert os.environ.get("OPENAI_API_KEY"),   "❌ OPENAI_API_KEY가 없습니다."
assert os.environ.get("PINECONE_API_KEY"), "❌ PINECONE_API_KEY가 없습니다."

# ===== 3) 로컬 이름 충돌 방지 (./pinecone.py 등) =====
_conflicts = [p for p in glob.glob("./pinecone*") if Path(p).exists()]
if _conflicts:
    raise RuntimeError(f"❌ 작업 폴더 이름 충돌: {_conflicts}  (이름 변경 후 재실행)")

# ===== 4) 임포트 =====
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
print("OK: imports")


[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


AssertionError: ❌ OPENAI_API_KEY가 없습니다.

In [59]:
# ===== 설정 =====
INDEX_NAME   = "transcripts-test"
NAMESPACE    = "transcripts"
CLOUD, REGION = "aws", "us-east-1"
METRIC       = "cosine"

OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"  # 필요 시 "text-embedding-3-large"
OPENAI_LLM_MODEL       = "gpt-4o-mini"             # 또는 "gpt-4o"

MODEL_DIMS = {"text-embedding-3-small": 1536, "text-embedding-3-large": 3072}
EMBED_DIM  = MODEL_DIMS[OPENAI_EMBEDDING_MODEL]

# ===== 텍스트 전처리 & 청크 =====
import re, uuid

def clean_text(t: str) -> str:
    t = re.sub(r"[ \t]+\n", "\n", t)
    t = re.sub(r"\n{3,}", "\n\n", t).strip()
    t = t.replace("–", "-").replace("—", "-")
    return t

def split_sections(t: str):
    parts = re.split(r"\n\s*(\d\)\s*[^\n]+)\s*\n", t)
    docs = []
    if len(parts) > 1:
        for i in range(1, len(parts), 2):
            header = parts[i].strip()
            body   = parts[i+1].strip()
            section_name = re.sub(r"^\d\)\s*", "", header).strip()
            docs.append({"section": section_name, "text": body})
    else:
        docs.append({"section": "General", "text": t})
    return docs

def chunk_by_paragraph(t, max_chars=1600, overlap_chars=200):
    paras = [p.strip() for p in re.split(r"\n{2,}", t) if p.strip()]
    chunks, cur = [], ""
    for p in paras:
        if len(cur) + len(p) + 2 <= max_chars:
            cur = (cur + "\n\n" + p).strip()
        else:
            if cur: chunks.append(cur); cur = p
    if cur: chunks.append(cur)
    if overlap_chars and len(chunks) > 1:
        for k in range(1, len(chunks)):
            prefix = chunks[k-1][-overlap_chars:]
            chunks[k] = (prefix + "\n" + chunks[k]).strip()
    return chunks

# ===== occasion 자동 태깅 =====
def infer_occasion(text: str):
    t = text.lower()
    tags = set()
    if re.search(r"\b(office|work|commute|promotion|slacks|shirt|tweed|mary jane)\b", t):
        tags.add("work")
    if re.search(r"\b(festival|concert)\b", t):
        tags.add("festival")
    if re.search(r"\b(travel|airport)\b", t):
        tags.add("travel")
    if re.search(r"\b(casual|weekend|hobo bag)\b", t):
        tags.add("casual")
    return list(tags or {"general"})

In [60]:
# ===== Pinecone 초기화 & 인덱스 생성/접속 =====
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
if INDEX_NAME not in {i.name for i in pc.list_indexes()}:
    pc.create_index(
        name=INDEX_NAME, dimension=EMBED_DIM, metric=METRIC,
        spec=ServerlessSpec(cloud=CLOUD, region=REGION),
    )
    print(f"✅ Created index: {INDEX_NAME}")
index = pc.Index(INDEX_NAME)

# ===== 임베딩 인스턴스 =====
emb = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL)
print("OK: Pinecone & Embeddings ready")

✅ Created index: transcripts-test
OK: Pinecone & Embeddings ready


In [None]:
from pathlib import Path

def ingest_files(file_paths, *, namespace=NAMESPACE, add_snippet=True):
    total_vectors, doc_map = 0, {}
    for f in file_paths:
        path = Path(f)
        assert path.exists(), f"❌ 파일 없음: {path.resolve()}"

        text = clean_text(path.read_text(encoding="utf-8"))
        docs = split_sections(text)

        # 레코드 구성(스니펫/occasion 메타 포함)
        records = []
        for d in docs:
            for ch in chunk_by_paragraph(d["text"], max_chars=1600, overlap_chars=200):
                md = {
                    "section": d["section"],
                    "season": "summer",
                    "exposure": "non_revealing",
                    "source": str(path),
                    "occasion": infer_occasion(ch)
                }
                if add_snippet:
                    md["snippet"] = ch[:220].replace("\n", " ")
                records.append({"text": f"Section: {d['section']}\n\n{ch}", "metadata": md})

        # 업서트
        if not records:
            continue
        doc_id = str(uuid.uuid4())
        texts  = [r["text"] for r in records]
        metas  = [r["metadata"] for r in records]
        ids    = [f"{doc_id}-{i}" for i in range(len(records))]

        vectors = []
        BATCH = 64
        for s in range(0, len(texts), BATCH):
            embs = emb.embed_documents(texts[s:s+BATCH])
            for j, vec in enumerate(embs):
                k = s + j
                vectors.append({
                    "id": ids[k],
                    "values": vec,
                    "metadata": {**metas[k], "doc_id": doc_id, "chunk_id": k, "lang": "en"},
                })

        index.upsert(vectors=vectors, namespace=namespace)
        total_vectors += len(vectors)
        doc_map[doc_id] = str(path.name)
        print(f"✅ Upserted {len(vectors)} vectors from '{path.name}' (ns='{namespace}')")
    print(f"🎯 Done. total_vectors={total_vectors}, files={len(file_paths)}, namespace='{namespace}'")
    return {"total_vectors": total_vectors, "doc_map": doc_map}

In [None]:
result = ingest_files(["transcript_sample.txt", "transcript_sample2.txt"])
result

✅ Upserted 8 vectors from 'transcript_sample.txt' (ns='transcripts')
✅ Upserted 8 vectors from 'transcript_sample2.txt' (ns='transcripts')
🎯 Done. total_vectors=16, files=2, namespace='transcripts'


{'total_vectors': 16,
 'doc_map': {'763da065-c7aa-4369-92cc-15df778fad3b': 'transcript_sample.txt',
  '59bd4f03-ae4b-441d-9a6b-766abeaed1e4': 'transcript_sample2.txt'}}

In [64]:
def pinecone_search(query_text, k=5, season="summer", occasion=None, section=None, namespace=NAMESPACE):
    qvec = emb.embed_query(query_text)
    _filter = {}
    if season:   _filter["season"] = {"$eq": season}
    if occasion: _filter["occasion"] = {"$eq": occasion}
    if section:  _filter["section"] = {"$eq": section}

    res = index.query(
        vector=qvec, top_k=k, include_values=False, include_metadata=True,
        namespace=namespace, filter=_filter or None
    )
    matches = res.get("matches", []) or []
    for i, m in enumerate(matches, 1):
        md = m.get("metadata", {}) or {}
        print(f"\n[{i}] score={m.get('score'):.4f}  section={md.get('section')}  occasion={md.get('occasion')} source={md.get('source')}")
        if "snippet" in md:
            print("snippet:", md["snippet"])
    return res

def build_context_from_matches(res, max_chunks=3, max_chars=800):
    chunks = []
    for m in (res.get("matches") or [])[:max_chunks]:
        md = m.get("metadata", {}) or {}
        body = md.get("snippet","")
        title = md.get("section","")
        chunks.append(f"[{title}] {body[:max_chars]}")
    return "\n\n".join(chunks)

# 예시: 여름 '출근룩'만 보기
res = pinecone_search("30대 여자 여름 출근룩", k=5, occasion="work")
docs_str = build_context_from_matches(res, max_chunks=3)
print("\n--- context preview ---\n", docs_str)



[1] score=0.3157  section=General  occasion=['work'] source=transcript_sample2.txt
snippet: legant vibe. Those big shoulder “work bags” can weigh the look down. You don’t need new clothes or items-if you mind just a few **key points**, you’ll look like the best-dressed guest after the bride. How about that? Sum

[2] score=0.3157  section=General  occasion=['work'] source=transcript_sample2.txt
snippet: legant vibe. Those big shoulder “work bags” can weigh the look down. You don’t need new clothes or items-if you mind just a few **key points**, you’ll look like the best-dressed guest after the bride. How about that? Sum

[3] score=0.2985  section=Pure & Clean  occasion=['work', 'casual', 'festival'] source=transcript_sample.txt
snippet: ace fabric bag. Don’t stuff it-part of the charm is just carrying it as is. Same outfit, but change the shoes and bag and it’s totally different-honestly, you can even get a hip look out of this base. Thinking summer fes

[4] score=0.2985  section=Pure 

In [65]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

# docs_str 가 미리 정의되어 있어야 합니다. (검색 결과 컨텍스트 문자열)
assert 'docs_str' in globals() and isinstance(docs_str, str) and len(docs_str) > 0, "docs_str 가 비어있거나 없습니다."

prompt = PromptTemplate.from_template("""
[ROLE]
You are a fashion coordinator and product curator. You must follow the rules and output format below, and ground your answers in {context} whenever possible.

[INPUTS]
context: {context}
user query: {user_query}

[TASKS]
Select internally 10 product keywords from the context that best fit the user query, each as "keyword + 1–2 key attributes".
Combine the selected keywords to internally design 5 style matches (looks) that fit the query. Each look should have a coherent set of top/bottom/outer/shoes/bag–accessories and be season- and occasion-appropriate.
Pick the best 3 looks among the 5 and return ONLY a JSON array that follows the output format.

[REFERENCE — CONSULT (do not copy verbatim; adapt to the query)]
Example product keywords
- Linen jacket (light tone, short-sleeve/tweed options)
- Slacks / cotton pants (straight fit, white/beige tones)
- Skirt (long H-line / A-line mini)
- Blouse & summer knit (neckline/ruffle details)
- Dress (black or color-point, textured fabric)
- Shoes (flats, loafers, slim-toe heels/sandals)
- Belt (≈2 cm slim belt, accessory point)
- Tote bag (small structured handbag, light point color)
- Scarf (point item for black dress/blouse looks)
- Jewelry (earrings, neat pieces with some weight)

Example style matches (gist)
- Casual neat: blouse/black inner + denim/cotton pants + light linen/tweed jacket + loafers/flats + slim belt/tote
- Feminine classic: minimal-detail blouse/summer knit + long H-line or A-line mini + short-sleeve jacket/knit + slim-toe heels/sandals + earrings/scarf/tote
- Chic refined: high-neck knit/draped top + black straight slacks + collarless jacket/suit set + slim-toe shoes + mini tote/jewelry
- Modern minimal: solid top (white/black) + white slacks/cream cotton + beige jacket + loafers/mules + scarf/leather tote
- Point-focused: black dress/blouse + H-line skirt/dress + (outer optional) + heeled sandals + scarf/statement earrings

[SCORING & SELECTION]
- Fitness: alignment with season/occasion/exposure/tone in the user query.
- Practicality: substitutable items, styling difficulty, movement/photo situations.
- Harmony: color/material/silhouette balance; avoid excessive details.
- Diversity: best-3 should represent distinct concepts.
- Grounding: prefer combinations/rules mentioned in the context.

[CONSTRAINTS]
- No hallucination: prioritize keywords/looks grounded in the context. If context lacks specifics, backfill with safe, general choices; avoid exaggerated claims or brand mentions.
- Terminology normalization: colors (white/black/beige/light gray), materials (linen/cotton/tweed/knit/leather), fits (straight/wide/H/A) must be consistent.
- No repetition: avoid identical compositions/sentences across the best-3.
- Language: English only (all fields and reasons).
- Do NOT use Markdown, code fences, or comments. Output must be plain JSON only.

[OUTPUT FORMAT — JSON ONLY]
Return a JSON array using ONLY the keys below (exactly 3 objects). Keep key order; use double quotes for all keys/strings; no trailing commas.
[
  {{
    "look_name": "string",
    "catg_tops": "string",
    "top_color": "string",
    "top_material": "string",
    "catg_bottoms": "string",
    "bottom_color": "string",
    "bottom_material": "string",
    "reason": ["string", "string"]
  }},
  {{ ... second look ... }},
  {{ ... third look ... }}
]

[PROCESS NOTES]
- The 10 keywords and 5-look design steps are internal only and must NOT be printed.
- Ensure the best-3 have distinct concepts; include at least one context-based justification in each reason (e.g., "Light-tone linen jacket offers neatness and airflow for summer office/guest looks").
- If omitting outerwear is reasonable, state the condition in the reason (e.g., heat vs. indoor A/C).

[FINAL INSTRUCTION]
Follow the rules above and return ONLY the "JSON array (best-3)". Do not include Markdown, code blocks, pre/post text, or any extra content.
""")

llm = ChatOpenAI(model=OPENAI_LLM_MODEL, temperature=0)
output_parser = JsonOutputParser()
chain = prompt | llm | output_parser

question = "summer office work outfit"
answer = chain.invoke({"user_query": question, "context": docs_str})
print(answer)

[{'look_name': 'Chic Minimalist', 'catg_tops': 'blouse', 'top_color': 'white', 'top_material': 'cotton', 'catg_bottoms': 'slacks', 'bottom_color': 'beige', 'bottom_material': 'cotton', 'reason': ['The white cotton blouse provides a fresh, clean look suitable for summer office wear.', 'Beige slacks maintain a light tone, ensuring comfort and elegance.']}, {'look_name': 'Feminine Classic', 'catg_tops': 'summer knit', 'top_color': 'light gray', 'top_material': 'knit', 'catg_bottoms': 'A-line skirt', 'bottom_color': 'white', 'bottom_material': 'cotton', 'reason': ['Light gray summer knit adds a soft touch, perfect for warm weather.', 'The white A-line skirt enhances femininity while keeping the outfit airy.']}, {'look_name': 'Modern Professional', 'catg_tops': 'high-neck top', 'top_color': 'black', 'top_material': 'knit', 'catg_bottoms': 'straight slacks', 'bottom_color': 'light gray', 'bottom_material': 'cotton', 'reason': ['A black high-neck top offers sophistication and a polished appea