In [3]:
import unicodedata

from langchain.vectorstores import FAISS
from langchain_community.docstore.document import Document
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding

Steps:
1. Create a list of city documents.
2. Initialize the embedding model.
3. Create the vector database.
4. Query the vector database.

In [36]:
# Step 1: create documents
# Each record is (city name, country, city id). The city name is stored as the
# document's page_content; country and city id are attached as metadata.
city_records = [
    ('北京', 'China', '101010100'),
    ('上海', 'China', '101020100'),
    ('成都', 'China', '101270101'),
    ('Washington', 'US', 'ws'),
    ('Tokyo', 'Japan', 'tk'),
]

docs = []
for city, country, city_id in city_records:
    metadata = {'country': country, 'city_id': city_id}
    docs.append(Document(page_content=city, metadata=metadata))

In [6]:
# Step 2: Initialize embedding model
# Remember to change the location of the model to your local path.
# NOTE(review): the directory name suggests a reranker checkpoint rather than a
# sentence-embedding model, and loading it prints "No sentence-transformers
# model found ... Creating a new one with mean pooling" plus a warning about
# newly initialized pooler weights. Presumably a dedicated embedding model
# would retrieve better -- confirm before changing the path.
embedding_model_path = 'D:/workspace/model/bge-reranker-v2-m3'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_path,
    model_kwargs={'device': 'cpu'},
)

No sentence-transformers model found with name D:/workspace/model/bge-reranker-v2-m3. Creating a new one with mean pooling.
Some weights of XLMRobertaModel were not initialized from the model checkpoint at D:/workspace/model/bge-reranker-v2-m3 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 3: Create vector database
# Build the FAISS store from the city documents using the embedding model above.
vector_db = FAISS.from_documents(documents=docs, embedding=embeddings)

In [50]:
# Sanity check: embed a single city name and print the length of its vector.
r = embeddings.embed_documents(['成都'])
first_vector = r[0]
print(len(first_vector))

# Step 4 input: natural-language queries, each of which should resolve to one
# of the cities stored in the vector database.
sentences = [
    '日本的首都是哪里',
    '美国的首都是哪个城市？',
    '"窗含西岭千秋雪，门泊东吴万里船"是描写哪个城市？',
    '”晓看红湿处，花重锦官城“描写哪个城市？',
    '明朝的京城',
]

1024


In [55]:
def alignment(str1, space, align='left'):
    """Pad ``str1`` with spaces so it occupies ``space`` display columns.

    Width is measured in monospace columns, not code points: ASCII characters
    count as one column and CJK / full-width characters count as two, so mixed
    Chinese/English text lines up in a terminal.

    Args:
        str1: Text to pad.
        space: Target width in display columns. If ``str1`` is already at
            least this wide, it is returned without padding (never truncated).
        align: ``'left'``, ``'right'`` or ``'center'``. For ``'center'`` any
            odd leftover column goes on the right. Any other value returns
            ``str1`` unchanged.

    Returns:
        The padded string.
    """
    try:
        # Fast path, same measurement as before: GB2312 encodes ASCII in one
        # byte and every other character it covers in two bytes.
        length = len(str1.encode('gb2312'))
    except UnicodeEncodeError:
        # Characters outside GB2312 (Hangul, emoji, traditional-only hanzi, ...)
        # used to crash here. Fall back to the Unicode East Asian Width
        # property; Wide, Fullwidth and Ambiguous count as two columns.
        length = sum(
            2 if unicodedata.east_asian_width(ch) in ('W', 'F', 'A') else 1
            for ch in str1
        )

    padding = max(space - length, 0)
    if align == 'left':
        return str1 + ' ' * padding
    if align == 'right':
        return ' ' * padding + str1
    if align == 'center':
        left = padding // 2
        return ' ' * left + str1 + ' ' * (padding - left)
    return str1


# Step 4: query the vector database.
# Print a two-column table: each query next to the page_content of the single
# top hit returned by the vector store (k=1).
banner = '*' * 60
print(banner)
print('%50s %25s' % ('Query', 'Similarity Research'))
print(banner)
for sentence in sentences:
    results = vector_db.similarity_search_with_score(sentence, k=1)
    # Each hit is indexed as (document, score); keep only the document text.
    top_hit = results[0]
    result = top_hit[0].page_content
    query_cell = alignment(sentence, 50, 'right')
    answer_cell = alignment(result, 25, 'right')
    print(query_cell, answer_cell, sep='')

************************************************************
                                             Query       Similarity Research
************************************************************
                                  日本的首都是哪里               Washington
                            美国的首都是哪个城市？               Washington
  "窗含西岭千秋雪，门泊东吴万里船"是描写哪个城市？               Washington
          ”晓看红湿处，花重锦官城“描写哪个城市？               Washington
                                        明朝的京城                     上海


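通过输出的字符串，基于相关性，查找最接近的城市。

For each query string, the vector store returns the city whose embedding is most similar to the query. In the table above almost every query maps to "Washington", so embedding similarity alone does not pick the right city here; the reranking step below addresses this.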

# Reranking

In [40]:
from FlagEmbedding import FlagReranker

In [None]:
# Remember to change the location of the model to your local path.
# The reranker is loaded on CPU and scores (query, candidate) pairs later on.
reranker_model_path = 'D:/workspace/model/bge-reranker-v2-m3'
RERANKER = FlagReranker(
    model_name_or_path=reranker_model_path,
    devices=['cpu'],
    trust_remote_code=True,
)

In [57]:
# Retrieve-then-rerank: fetch up to 5 candidates per query from the vector
# store, score every (query, candidate) pair with the reranker, and print the
# candidate with the highest reranker score.
banner = '*' * 60
print(banner)
print('%50s %25s' % ('Query', 'Similarity Research'))
print(banner)
for sentence in sentences:
    results = vector_db.similarity_search_with_score(sentence, k=5)

    # Pair the query with each candidate's text, in retrieval order.
    pair_list = [(sentence, hit[0].page_content) for hit in results]
    scores = RERANKER.compute_score(pair_list, normalize=True)

    # Candidate positions ordered from highest to lowest reranker score.
    sorted_index = sorted(
        range(len(scores)),
        key=lambda idx: scores[idx],
        reverse=True,
    )

    result = results[sorted_index[0]][0].page_content
    query_cell = alignment(sentence, 50, 'right')
    answer_cell = alignment(result, 25, 'right')
    print(query_cell, answer_cell, sep='')

************************************************************
                                             Query       Similarity Research
************************************************************
                                  日本的首都是哪里                    Tokyo
                            美国的首都是哪个城市？               Washington
  "窗含西岭千秋雪，门泊东吴万里船"是描写哪个城市？                     成都
          ”晓看红湿处，花重锦官城“描写哪个城市？                     成都
                                        明朝的京城                     北京
