In [1]:
import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray)->float:
  dot = np.dot(vec_a, vec_b)
  norm_a = np.linalg.norm(vec_a)
  norm_b = np.linalg.norm(vec_b)
  return dot / (norm_a*norm_b)

# openai embedding

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from openai import OpenAI
openai_client = OpenAI()

In [4]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
  response = openai_client.embeddings.create(
    input = text, 
    model=model
  )
  return response.data[0].embedding

In [6]:
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
len(emb_vector)

1536

In [7]:
# 임베딩모델 생성
from langchain_upstage import UpstageEmbeddings
import os
embeddings = UpstageEmbeddings(
  api_key=os.getenv('UPSTAGE_API_KEY'),
  model='solar-embedding-1-large'
)

In [8]:
#업스테이지 모델로 임베딩함수
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    if is_query:
        vec = embeddings.embed_query(text)
    else:
        vec = embeddings.embed_documents([text])[0]  # 리스트에서 첫 번째 요소만 추출
    return np.array(vec)

In [9]:
# pip install -qU langchain-core langchain-upstage
import os

from langchain_upstage import UpstageEmbeddings
 
embeddings = UpstageEmbeddings(
    api_key= os.getenv('UPSTAGE_API_KEY'),
    model="embedding-query"
)
 
doc_result = embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)
 
query_result = embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.016357421875, 0.0171356201171875, -0.0077972412109375, 0.024078369140625, 0.00342559814453125, -0.00682830810546875, -0.01465606689453125, -0.01067352294921875, -0.015625, 0.00673675537109375, 0.0183258056640625, 0.00507354736328125, 0.007244110107421875, 0.01177978515625, 0.02777099609375, 0.02191162109375, -0.0213623046875, -0.0012531280517578125, -0.0030117034912109375, -0.0167388916015625, -0.02410888671875, -0.0090789794921875, -0.01122283935546875, -0.005924224853515625, -0.01036834716796875, 0.0162811279296875, 0.002777099609375, -0.01197052001953125, 0.00518798828125, 0.02154541015625, 0.0056610107421875, 0.01412200927734375, 0.0020542144775390625, -0.0094146728515625, 0.01070404052734375, -0.01226806640625, -0.003635406494140625, 0.02374267578125, -0.012420654296875, 0.01287078857421875, -0.0112152099609375, -0.02001953125, -0.01410675048828125, 8.320808410644531e-05, -0.016510009765625, 0.012786865234375, 0.00691986083984375, -0.0008373260498046875, 0.004840850830078125, 

In [10]:
len(query_result)

4096

#openai embedding 모델로 임베딩

In [11]:
texts = ['king', 'queen', 'slave', '왕']

In [12]:
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}

In [13]:
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.590601530239691)

#업스테이 임베딩 모델로 임베딩

In [14]:
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [15]:
cosine_similarity(upstage_embeddings['queen'], upstage_embeddings['king'])

np.float64(0.6446770680612558)

# 왕의 비교

In [16]:
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5039512482944454)

In [17]:
cosine_similarity(upstage_embeddings['왕'], upstage_embeddings['king'])

np.float64(0.6960682511235363)

젬마2 모델로 임베딩 비교

# 올라마 임베딩 > huggingface 의 임베딩 모델

In [18]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sente

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-upstage 0.7.3 requires tokenizers<0.21.0,>=0.20.0, but you have tokenizers 0.22.0 which is incompatible.


In [19]:
from langchain_community.chat_models import ChatOllama
llm_ollama = ChatOllama(model='gemma2')
response = llm_ollama.invoke('안녕? 네 소개를 2줄로 작성해')
response.content

  llm_ollama = ChatOllama(model='gemma2')


'안녕하세요! 저는 구글 AI에서 훈련된 대화형 AI입니다.\n\n다양한 질문에 답변하고, 이야기 나누고, 창의적인 작업을 도와드릴 수 있습니다.'

# OS 계열의 임베딩 모델 사용
허깅페이스의 임베딩 모델 사용 - transformers 라이브러리, GPU 기반 pytorch
토치쿠다 기반의 가상환경에서 실행 

In [1]:
from sentence_transformers import SentenceTransformer
import torch
#MODEL = 'BAAI/bge-multilingual-gemma2'
MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

  from tqdm.autonotebook import tqdm, trange
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


'cpu'

In [2]:
model = SentenceTransformer(MODEL, device=device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [3]:
texts = ['king', 'queen', 'slave', '왕']

In [4]:
model.max_seq_length=256
gemma_embedding = model.encode(texts, batch_size=64, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
gemma_embedding

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]


array([[-0.00158092,  0.10178691, -0.01695298, ..., -0.05791414,
        -0.04557932, -0.04427629],
       [ 0.06440255, -0.00555204,  0.02317658, ..., -0.05347716,
         0.00656337, -0.04465976],
       [-0.049297  ,  0.07381599, -0.00779097, ..., -0.08466545,
        -0.04895947, -0.06428003],
       [ 0.00818062,  0.07960667, -0.02314878, ..., -0.04931473,
        -0.04339726, -0.00256805]], shape=(4, 384), dtype=float32)