# Embeddings


### 向量相似度计算

In [3]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

from openai import OpenAI   # openai=1.10.0
client = OpenAI()

def cos_sim(a, b):
    """Cosine similarity between two vectors -- larger means more similar.

    Args:
        a: First vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the ratio is undefined, so ``0.0`` is returned instead of ``nan``.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        # Guard against 0/0, which would yield nan plus a RuntimeWarning.
        return 0.0
    return dot(a, b) / denom
def l2(a, b):
    """Euclidean (L2) distance between two vectors -- smaller means more similar.

    Both arguments are converted with ``np.asarray`` before subtracting, so
    plain Python lists are accepted.
    """
    vec_a, vec_b = np.asarray(a), np.asarray(b)
    return norm(vec_a - vec_b)

def get_embeddings(texts, model="text-embedding-ada-002"):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: The input passed as ``input`` to ``client.embeddings.create``
            (in this notebook, a list of strings).
        model: Name of the embedding model to request.

    Returns:
        A list with the ``embedding`` of every item in the response's
        ``data``. An empty list/tuple of texts yields ``[]`` without making
        a request.
    """
    # Nothing to embed: skip the network round trip (the endpoint is expected
    # to reject an empty input array) and return an empty result.
    if isinstance(texts, (list, tuple)) and not texts:
        return []
    response = client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in response.data]

# Smoke test: embed a single sample sentence and print the first 10
# components of the returned vector.
test_arr = ["我是一段测试文本"]
vs = get_embeddings(test_arr)[0]  # one input text, so keep only the first embedding
print(vs[:10])
# print(len(vs))  # 1536 (vector length noted by the original author)


# Compare one query against several news headlines using both metrics.
query = "国际争端"
documents = [
    "联合国就苏丹达尔富尔地区大规模暴力事件发出警告",
    "土耳其、芬兰、瑞典与北约代表将继续就瑞典“入约”问题进行谈判",
    "日本岐阜市陆上自卫队射击场内发生枪击事件 3人受伤",
    "我国首次在空间站开展舱外辐射生物学暴露实验",
    "国家游泳中心(水立方) : 恢复游泳、嬉水乐园等水上项目运营",
]

query_vec = get_embeddings([query])[0]
doc_vecs = get_embeddings(documents)

# For each metric: print its label, the query compared with itself as a
# baseline, then the query compared with every document in order.
# NOTE: the "Cosine distance." section prints cos_sim values, i.e. similarities
# (larger = closer); the label text is kept exactly as in the recorded output.
for title, metric in (
    ("Cosine distance.", cos_sim),
    ("\nEuclidean distance:", l2),
):
    print(title)
    print(metric(query_vec, query_vec))
    for vec in doc_vecs:
        print(metric(query_vec, vec))



[-0.02138707786798477, -0.010255279019474983, -0.001874403329566121, -0.008866332471370697, -0.017125843092799187, 0.0009591146954335272, -0.017247207462787628, 0.002894200151786208, -0.027590138837695122, -0.018312515690922737]
Cosine distance.
1.0000000000000002
0.8220653393211526
0.8294946584302412
0.7978978814131463
0.7933096993158312
0.7660214052216155

Euclidean distance:
0.0
0.5965478203204638
0.5839611825268289
0.6357705796658597
0.6429467960889444
0.684073960202615


In [1]:
!pip install chromadb

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


### 基于相似度聚类

In [7]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from sklearn.cluster import KMeans, DBSCAN
import numpy as np

# Short utterances to cluster by embedding similarity.
texts = [
    "这个多少钱",
    "啥价",
    "给我报个价",
    "我要红色的",
    "不要了",
    "算了",
    "来红的吧",
    "作罢",
    "价格介绍一下",
    "红的这个给我吧"
]
model = OpenAIEmbeddings(model='text-embedding-ada-002')
# Embed every text through one batched call instead of one API request per
# text; the result is a list of vectors in the same order as `texts`.
X = model.embed_documents(texts)

# Alternative: KMeans with a fixed number of clusters.
# clusters = KMeans(n_clusters=3, random_state=42, n_init="auto").fit(X)
# DBSCAN needs no preset cluster count; it assigns label -1 to samples it
# treats as noise (eps is the neighbourhood radius, min_samples the density
# threshold).
clusters = DBSCAN(eps=0.55, min_samples=2).fit(X)
for label, text in zip(clusters.labels_, texts):
    print("{}\t{}".format(label, text))

0	这个多少钱
0	啥价
0	给我报个价
1	我要红色的
-1	不要了
-1	算了
1	来红的吧
-1	作罢
0	价格介绍一下
1	红的这个给我吧


In [5]:
!pip install -U langchain-openai

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting langchain-openai
  Downloading https://mirrors.aliyun.com/pypi/packages/81/63/012be16114559243aabcc9ec570366df84591dc9f8f3c2349a398e9b3626/langchain_openai-0.0.8-py3-none-any.whl (32 kB)
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.0.8
