In [6]:
import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray)->float:
  dot = np.dot(vec_a, vec_b)
  norm_a = np.linalg.norm(vec_a)
  norm_b = np.linalg.norm(vec_b)
  return dot / (norm_a*norm_b)

# openai embedding

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
from openai import OpenAI
openai_client = OpenAI()

In [19]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
  response = openai_client.embeddings.create(
    input = text, 
    model=model
  )
  return response.data[0].embedding

In [12]:
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
len(emb_vector.embedding)

1536

In [13]:
# 임베딩모델 생성
from langchain_upstage import UpstageEmbeddings
import os
embeddings = UpstageEmbeddings(
  api_key=os.getenv('UPSTAGE_API_KEY'),
  model='solar-embedding-1-large'
)

In [14]:
#업스테이지 모델로 임베딩함수
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    if is_query:
        vec = embeddings.embed_query(text)
    else:
        vec = embeddings.embed_documents([text])[0]  # 리스트에서 첫 번째 요소만 추출
    return np.array(vec)

In [15]:
# pip install -qU langchain-core langchain-upstage
import os

from langchain_upstage import UpstageEmbeddings
 
embeddings = UpstageEmbeddings(
    api_key= os.getenv('UPSTAGE_API_KEY'),
    model="embedding-query"
)
 
doc_result = embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)
 
query_result = embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.0164034403860569, 0.017108500003814697, -0.007777245249599218, 0.0241303239017725, 0.0033562302123755217, -0.006834766827523708, -0.014655179344117641, -0.010676625184714794, -0.015640825033187866, 0.006741238292306662, 0.01836034283041954, 0.00505053298547864, 0.007234061136841774, 0.01180616021156311, 0.02769879251718521, 0.021900031715631485, -0.021453972905874252, -0.0012671297881752253, -0.0029803181532770395, -0.016777554526925087, -0.024159101769328117, -0.00909383688122034, -0.011216212064027786, -0.005913871806114912, -0.010396040044724941, 0.016273939982056618, 0.0027626845985651016, -0.01197163388133049, 0.005136867053806782, 0.02151152864098549, 0.0056260921992361546, 0.014086814597249031, 0.002041436964645982, -0.009367227554321289, 0.010719791986048222, -0.012245024554431438, -0.0035684676840901375, 0.023713042959570885, -0.012453665025532246, 0.012870945036411285, -0.011201823130249977, -0.02001507580280304, -0.014058036729693413, 0.00010937875049421564, -0.016504162

In [16]:
len(query_result)

4096

# openai embedding 모델로 임베딩
get_openai_embedding에서 return 맨 뒤에 .embedding

In [1]:
texts = ['king', 'queen', 'slave', '왕']

In [20]:
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}

In [21]:
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.590601530239691)

# 업스테이 임베딩 모델로 임베딩

In [22]:
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [23]:
cosine_similarity(upstage_embeddings['queen'], upstage_embeddings['king'])

np.float64(0.6446692598284114)

# 왕의 비교

In [24]:
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5041175911748997)

In [25]:
cosine_similarity(upstage_embeddings['왕'], upstage_embeddings['king'])

np.float64(0.6962810723499372)

# 젬마2 모델로 임베딩 비교

# 올라마 임베딩
! pip install sentence-transformers

In [None]:
# pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->senten

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-upstage 0.7.3 requires tokenizers<0.21.0,>=0.20.0, but you have tokenizers 0.22.0 which is incompatible.


In [2]:
from langchain_community.chat_models import ChatOllama
llm_ollama = ChatOllama(model='gemma2')
response = llm_ollama.invoke('안녕? 네 소개를 2줄로 작성해')
response.content

  llm_ollama = ChatOllama(model='gemma2')


'안녕하세요! 저는 구글에서 훈련된 대형 언어 모델입니다.\n\n텍스트 생성, 번역, 요약 등 다양한 작업을 수행하며 사람들의 질문에 답하고 도움을 드리기 위해 노력합니다. 😊'

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-multilingual-gemma2')
gemma_embedding = model.encode(texts)
gemma_embedding

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [32]:
import transformers, tokenizers
print(transformers.__version__)
print(tokenizers.__version__)


4.56.1
0.20.3


In [None]:
# pip install --upgrade tokenizers

Note: you may need to restart the kernel to use updated packages.
