## 3. Embedding Models 示例

In [34]:
"""
LangChain 0.3 Embedding Models 完整示例
包含所有主要嵌入模型和高级用法
"""

import os
import numpy as np
import asyncio
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor
import time

# 核心嵌入模型导入
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
    SentenceTransformerEmbeddings,
    CohereEmbeddings,
    BedrockEmbeddings
)
from langchain_core.documents import Document

### Ollama嵌入模型示例

In [41]:

def ollama_embeddings_example():
    """Demonstrate a locally served Ollama embedding model.

    Embeds five sample sentences and one query with ``OllamaEmbeddings``,
    prints the cosine similarity between the query and every sentence, and
    reports the best-scoring sentence.

    Returns:
        The configured ``OllamaEmbeddings`` instance, or ``None`` when any
        step raises (setup hints are printed in that case).
    """
    banner = "=" * 60
    print(banner)
    print("1. Ollama嵌入模型示例")
    print(banner)

    try:
        # 1.1 Basic Ollama embedding model
        print("\n1.1 基础Ollama嵌入模型")
        embeddings = OllamaEmbeddings(
            model="nomic-embed-text:latest",  # recommended embedding model
            base_url="http://localhost:11434",
        )

        # Sample corpus: four AI-related sentences and one unrelated sentence.
        texts = [
            "人工智能是计算机科学的分支",
            "机器学习是AI的重要组成部分",
            "深度学习使用神经网络进行学习",
            "自然语言处理让计算机理解人类语言",
            "今天天气很好，适合出门散步"
        ]

        # Document embeddings
        print("生成文档嵌入...")
        doc_embeddings = embeddings.embed_documents(texts)
        print(f"文档嵌入数量: {len(doc_embeddings)}")
        print(f"嵌入向量维度: {len(doc_embeddings[0])}")

        # Query embedding
        query = "什么是人工智能技术？"
        query_embedding = embeddings.embed_query(query)
        print(f"查询嵌入维度: {len(query_embedding)}")

        # 1.2 Cosine similarity between the query and each document
        print("\n1.2 语义相似度计算")
        print(f"查询: '{query}'")
        print("与各文档的相似度:")

        query_vec = np.array(query_embedding)
        query_norm = np.linalg.norm(query_vec)
        scored = []
        for idx, text in enumerate(texts):
            doc_vec = np.array(doc_embeddings[idx])
            score = np.dot(query_vec, doc_vec) / (query_norm * np.linalg.norm(doc_vec))
            scored.append((text, score))
            print(f"{idx+1}. {score:.4f} - {text}")

        # Highest score first; the head of the ranking is the best match.
        ranking = sorted(scored, key=lambda pair: pair[1], reverse=True)
        best_text, best_score = ranking[0]
        print(f"\n最相似文档: {best_text} (相似度: {best_score:.4f})")

        # NOTE: a disabled section 1.3 used to compare the dimensions of
        # several Ollama models (nomic-embed-text, mxbai-embed-large,
        # all-minilm) by embedding a short probe text with each of them.

        return embeddings

    except Exception as exc:
        print(f"Ollama嵌入模型初始化失败: {exc}")
        print("请确保Ollama服务正在运行并安装了嵌入模型")
        print("安装命令: ollama pull nomic-embed-text")
        return None
ollama_embeddings_example()

1. Ollama嵌入模型示例

1.1 基础Ollama嵌入模型
生成文档嵌入...
文档嵌入数量: 5
嵌入向量维度: 768
查询嵌入维度: 768

1.2 语义相似度计算
查询: '什么是人工智能技术？'
与各文档的相似度:
1. 0.8601 - 人工智能是计算机科学的分支
2. 0.5262 - 机器学习是AI的重要组成部分
3. 0.5862 - 深度学习使用神经网络进行学习
4. 0.7732 - 自然语言处理让计算机理解人类语言
5. 0.5400 - 今天天气很好，适合出门散步

最相似文档: 人工智能是计算机科学的分支 (相似度: 0.8601)


OllamaEmbeddings(model='nomic-embed-text:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)

### OpenAI嵌入模型示例

In [None]:

def openai_embeddings_example():
    """Demonstrate OpenAI embedding models.

    Runs three steps: a basic ``text-embedding-3-small`` embedding, a
    dimension comparison across three OpenAI models, and a custom-dimension
    request against ``text-embedding-3-large``.

    Returns:
        The ``OpenAIEmbeddings`` instance from step 2.1, or ``None`` when
        step 2.1 (or anything outside the inner ``try`` blocks) raises.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("2. OpenAI嵌入模型示例")
    print(rule)

    try:
        # 2.1 Basic OpenAI embedding
        print("\n2.1 基础OpenAI嵌入")
        openai_key = os.getenv("OPENAI_API_KEY")
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",  # current-generation model
            api_key=openai_key,
            dimensions=1536,  # optional: explicit output dimension
        )

        texts = [
            "Artificial intelligence is a branch of computer science",
            "Machine learning is a subset of AI",
            "Deep learning uses neural networks",
            "Natural language processing enables computers to understand human language"
        ]

        doc_embeddings = embeddings.embed_documents(texts)
        # The query vector is not inspected; the call exercises embed_query.
        _query_embedding = embeddings.embed_query("What is artificial intelligence?")

        print(f"OpenAI嵌入维度: {len(doc_embeddings[0])}")

        # 2.2 Compare the output dimension of several OpenAI models
        print("\n2.2 OpenAI模型对比")
        openai_models = [
            ("text-embedding-3-small", 1536),
            ("text-embedding-3-large", 3072),
            ("text-embedding-ada-002", 1536)
        ]

        # The second tuple element is kept for reference only; the reported
        # dimension is the one actually measured.
        for model_name, _default_dim in openai_models:
            try:
                probe_model = OpenAIEmbeddings(model=model_name, api_key=openai_key)
                probe_vector = probe_model.embed_query("test")
                print(f"{model_name}: 维度 {len(probe_vector)}")
            except Exception as exc:
                print(f"{model_name}: 不可用 ({str(exc)[:50]}...)")

        # 2.3 Custom dimension (text-embedding-3 family only)
        print("\n2.3 自定义嵌入维度")
        try:
            resized_model = OpenAIEmbeddings(
                model="text-embedding-3-large",
                api_key=openai_key,
                dimensions=1024,  # custom dimension
            )
            resized_vector = resized_model.embed_query("自定义维度测试")
            print(f"自定义维度嵌入: {len(resized_vector)}")
        except Exception as exc:
            print(f"自定义维度失败: {exc}")

        return embeddings

    except Exception as exc:
        print(f"OpenAI嵌入模型失败: {exc}")
        print("请设置OPENAI_API_KEY环境变量")
        return None

### HuggingFace嵌入模型示例

In [None]:

def huggingface_embeddings_example():
    """Demonstrate HuggingFace embedding models.

    Runs three steps: a basic sentence-transformers model (3.1), a dimension
    check over several multilingual / Chinese models (3.2), and an
    instruction-tuned Instructor model (3.3). Steps 3.2 and 3.3 handle their
    own failures so one unavailable model does not abort the demo.

    Returns:
        The ``HuggingFaceEmbeddings`` instance from step 3.1, or ``None``
        when step 3.1 (or anything outside the inner ``try`` blocks) raises.
    """
    print("\n" + "=" * 60)
    print("3. HuggingFace嵌入模型示例")
    print("=" * 60)

    # 3.1 Basic HuggingFace embedding
    print("\n3.1 基础HuggingFace嵌入")
    try:
        # Pre-trained sentence-transformers model
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},  # or 'cuda' when a GPU is available
            encode_kwargs={'normalize_embeddings': True}  # unit-length vectors
        )

        texts = [
            "这是一个测试文档",
            "人工智能技术发展迅速",
            "机器学习算法很重要"
        ]

        doc_embeddings = embeddings.embed_documents(texts)
        # Exercise the query path as well; only the document dimension is printed.
        embeddings.embed_query("AI技术")

        print(f"HuggingFace嵌入维度: {len(doc_embeddings[0])}")

        # 3.2 Multilingual / Chinese-optimised models
        print("\n3.2 中文优化嵌入模型")
        chinese_models = [
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "sentence-transformers/distiluse-base-multilingual-cased",
            "BAAI/bge-small-zh-v1.5"  # Chinese-optimised model
        ]

        for model_name in chinese_models:
            try:
                chinese_embeddings = HuggingFaceEmbeddings(
                    model_name=model_name,
                    model_kwargs={'device': 'cpu'}
                )
                test_embedding = chinese_embeddings.embed_query("中文测试")
                print(f"{model_name}: 维度 {len(test_embedding)}")
            except Exception as e:
                print(f"{model_name}: 加载失败 ({str(e)[:50]}...)")

        # 3.3 Instruction-tuned embeddings
        print("\n3.3 指令优化嵌入模型")
        try:
            query_instruction = "为这个查询找到最相关的文档: "
            doc_instruction = "这是一个关于技术的文档: "

            # Instructions are configuration, not part of the text:
            # embed_query()/embed_documents() pair every input with the
            # configured instruction themselves. Concatenating the prefix
            # onto the text by hand would apply an instruction twice (the
            # library default plus the manual one).
            instruct_embeddings = HuggingFaceInstructEmbeddings(
                model_name="hkunlp/instructor-xl",
                model_kwargs={'device': 'cpu'},
                query_instruction=query_instruction,
                embed_instruction=doc_instruction,
            )

            instruct_query = instruct_embeddings.embed_query("人工智能应用")
            print(f"指令嵌入维度: {len(instruct_query)}")

        except Exception as e:
            print(f"指令嵌入模型加载失败: {e}")

        return embeddings

    except Exception as e:
        print(f"HuggingFace嵌入模型失败: {e}")
        return None

### SentenceTransformers嵌入模型示例

In [None]:

def sentence_transformers_example():
    """Demonstrate SentenceTransformers embedding models.

    Step 4.1 embeds greetings in four languages with a multilingual model and
    prints the cosine similarity between an English and a Chinese query.
    Step 4.2 loads several task-specific models and prints their dimensions;
    a model that cannot be loaded is reported with the (truncated) reason,
    matching the other examples in this file.

    Returns:
        The multilingual ``SentenceTransformerEmbeddings`` instance, or
        ``None`` when step 4.1 (or anything outside the inner ``try``) raises.
    """
    print("\n" + "=" * 60)
    print("4. SentenceTransformers嵌入模型示例")
    print("=" * 60)

    try:
        # 4.1 Multilingual model
        print("\n4.1 多语言SentenceTransformers")
        multilingual_embeddings = SentenceTransformerEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        # The same greeting in English, Chinese, Spanish and French.
        multilingual_texts = [
            "Hello, how are you?",
            "你好，你好吗？",
            "Hola, ¿cómo estás?",
            "Bonjour, comment allez-vous?"
        ]

        multi_embeddings = multilingual_embeddings.embed_documents(multilingual_texts)
        print(f"多语言嵌入维度: {len(multi_embeddings[0])}")

        # Cross-lingual similarity between equivalent queries
        english_query = multilingual_embeddings.embed_query("greeting")
        chinese_query = multilingual_embeddings.embed_query("问候")

        def cosine_similarity(a, b):
            """Return the cosine similarity of two vectors."""
            return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

        cross_lang_similarity = cosine_similarity(english_query, chinese_query)
        print(f"跨语言相似度 (greeting vs 问候): {cross_lang_similarity:.4f}")

        # 4.2 Task-specific models
        print("\n4.2 专业领域嵌入模型")
        domain_models = [
            "sentence-transformers/all-mpnet-base-v2",  # general purpose
            "sentence-transformers/msmarco-distilbert-base-v4",  # search-optimised
            "sentence-transformers/nli-mpnet-base-v2"  # natural language inference
        ]

        for model_name in domain_models:
            try:
                domain_embeddings = SentenceTransformerEmbeddings(model_name=model_name)
                test_embedding = domain_embeddings.embed_query("domain test")
                print(f"{model_name.split('/')[-1]}: 维度 {len(test_embedding)}")
            except Exception as e:
                # Include the reason instead of discarding the exception, so a
                # missing package can be told apart from a download failure.
                print(f"{model_name}: 不可用 ({str(e)[:50]}...)")

        return multilingual_embeddings

    except Exception as e:
        print(f"SentenceTransformers失败: {e}")
        return None

### 云端嵌入模型示例

In [None]:

def cloud_embeddings_example():
    """Demonstrate hosted embedding services (Cohere and AWS Bedrock).

    Each provider runs inside its own ``try`` block, so a failure of one is
    printed and does not prevent the other from running. Returns ``None``.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("5. 云端嵌入模型示例")
    print(separator)

    # 5.1 Cohere embeddings
    print("\n5.1 Cohere嵌入模型")
    try:
        cohere_client = CohereEmbeddings(
            cohere_api_key=os.getenv("COHERE_API_KEY"),
            model="embed-english-v3.0",  # or embed-multilingual-v3.0
        )
        cohere_vectors = cohere_client.embed_documents(
            ["AI technology", "Machine learning algorithms"]
        )
        print(f"Cohere嵌入维度: {len(cohere_vectors[0])}")
    except Exception as exc:
        print(f"Cohere嵌入失败: {exc}")

    # 5.2 AWS Bedrock embeddings
    print("\n5.2 AWS Bedrock嵌入模型")
    try:
        bedrock_client = BedrockEmbeddings(
            credentials_profile_name="default",
            region_name="us-east-1",
            model_id="amazon.titan-embed-text-v1",
        )
        bedrock_vectors = bedrock_client.embed_documents(
            ["Cloud computing", "Serverless architecture"]
        )
        print(f"Bedrock嵌入维度: {len(bedrock_vectors[0])}")
    except Exception as exc:
        print(f"Bedrock嵌入失败: {exc}")

### 嵌入模型性能对比

In [None]:

def embedding_performance_comparison(
    models: Optional[List[Any]] = None,
) -> Dict[str, Dict[str, Any]]:
    """Benchmark embedding models on a small fixed corpus and print a table.

    For every model the function times ``embed_documents`` on five sentences
    and ``embed_query`` on one query, then computes the cosine similarity
    between the query and each document.

    Args:
        models: Optional list of ``(name, model)`` pairs. Each model must
            provide ``embed_documents(texts)`` and ``embed_query(text)``.
            When omitted, a local Ollama model and a HuggingFace MiniLM model
            are constructed; any that cannot be constructed is reported and
            skipped.

    Returns:
        A dict keyed by model name. Each value is either the metrics dict
        (``dimension``, ``doc_time``, ``query_time``, ``avg_similarity``,
        ``max_similarity``) or ``{"error": message}`` if the model raised.
    """
    print("\n" + "=" * 60)
    print("6. 嵌入模型性能对比")
    print("=" * 60)

    # Benchmark corpus
    test_texts = [
        "人工智能技术正在快速发展",
        "机器学习算法在各个领域都有应用",
        "深度学习模型需要大量的训练数据",
        "自然语言处理让计算机理解人类语言",
        "计算机视觉技术可以识别图像中的物体"
    ]

    test_query = "AI技术的应用领域"

    if models is None:
        models_to_test = []

        # Default candidates. A candidate that cannot be constructed is
        # skipped, but the reason is printed rather than silently dropped.
        try:
            ollama_model = OllamaEmbeddings(
                base_url="http://localhost:11434",
                model="nomic-embed-text"
            )
            models_to_test.append(("Ollama-nomic", ollama_model))
        except Exception as e:
            print(f"Ollama-nomic: 不可用 ({str(e)[:50]}...)")

        try:
            hf_model = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            models_to_test.append(("HF-MiniLM", hf_model))
        except Exception as e:
            print(f"HF-MiniLM: 不可用 ({str(e)[:50]}...)")
    else:
        # Copy so the caller's list is never mutated.
        models_to_test = list(models)

    # Run the benchmark
    results = {}

    for model_name, model in models_to_test:
        try:
            print(f"\n测试 {model_name}...")

            # perf_counter is monotonic and high-resolution, unlike time.time,
            # which can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            doc_embeddings = model.embed_documents(test_texts)
            doc_time = time.perf_counter() - start_time

            start_time = time.perf_counter()
            query_embedding = model.embed_query(test_query)
            query_time = time.perf_counter() - start_time

            # Cosine similarity between the query and every document
            query_norm = np.linalg.norm(query_embedding)
            similarities = [
                np.dot(query_embedding, doc_emb) / (query_norm * np.linalg.norm(doc_emb))
                for doc_emb in doc_embeddings
            ]

            results[model_name] = {
                "dimension": len(doc_embeddings[0]),
                "doc_time": doc_time,
                "query_time": query_time,
                "avg_similarity": float(np.mean(similarities)),
                "max_similarity": float(np.max(similarities))
            }

        except Exception as e:
            # One failing model must not abort the comparison of the others.
            results[model_name] = {"error": str(e)}

    # Report
    print("\n性能对比结果:")
    print(f"{'模型':<15} {'维度':<8} {'文档时间(s)':<12} {'查询时间(s)':<12} {'平均相似度':<12} {'最高相似度':<12}")
    print("-" * 80)

    for model_name, result in results.items():
        if "error" not in result:
            print(f"{model_name:<15} {result['dimension']:<8} {result['doc_time']:<12.4f} "
                  f"{result['query_time']:<12.4f} {result['avg_similarity']:<12.4f} "
                  f"{result['max_similarity']:<12.4f}")
        else:
            print(f"{model_name:<15} 错误: {result['error'][:50]}...")

    return results

### 高级嵌入技术

In [None]:

def advanced_embedding_techniques():
    """Showcase caching, batching and async helpers for embedding models.

    Only the caching demo (7.1) actually runs a model. The batching (7.2) and
    async (7.3) helpers are defined for illustration and their section
    headings are printed, but they are not invoked here. Returns ``None``.
    """
    print("\n" + "=" * 60)
    print("7. 高级嵌入技术")
    print("=" * 60)

    # 7.1 Embedding cache
    print("\n7.1 嵌入缓存机制")

    class CachedEmbeddings:
        """In-memory memoising wrapper around an embedding model.

        Query and document vectors are cached separately, because a model may
        embed the two differently. Uncached documents are sent to the wrapped
        model in a single ``embed_documents`` call, so batching is preserved.
        """

        def __init__(self, base_embeddings):
            self.base_embeddings = base_embeddings
            self.query_cache = {}
            self.document_cache = {}

        def embed_query(self, text: str) -> List[float]:
            """Return the query embedding of *text*, computing it at most once."""
            if text in self.query_cache:
                print(f"缓存命中: {text[:30]}...")
                return self.query_cache[text]

            embedding = self.base_embeddings.embed_query(text)
            self.query_cache[text] = embedding
            print(f"新计算: {text[:30]}...")
            return embedding

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            """Return document embeddings for *texts*, embedding only cache misses."""
            # Unique uncached texts, in first-seen order.
            missing = [
                text for text in dict.fromkeys(texts)
                if text not in self.document_cache
            ]
            if missing:
                fresh_vectors = self.base_embeddings.embed_documents(missing)
                self.document_cache.update(zip(missing, fresh_vectors))

            # The first occurrence of a freshly embedded text is reported as
            # newly computed; every later occurrence is a cache hit.
            newly_computed = set(missing)
            embeddings = []
            for text in texts:
                if text in newly_computed:
                    newly_computed.discard(text)
                    print(f"新计算: {text[:30]}...")
                else:
                    print(f"缓存命中: {text[:30]}...")
                embeddings.append(self.document_cache[text])
            return embeddings

    # Exercise the cache
    try:
        base_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        cached_model = CachedEmbeddings(base_model)

        # First pass: the duplicated text is computed once, then served from cache.
        test_texts = ["AI技术", "机器学习", "AI技术"]
        cached_model.embed_documents(test_texts)

        # Second pass: everything should come from the cache.
        cached_model.embed_documents(test_texts)

    except Exception as e:
        print(f"缓存嵌入示例失败: {e}")

    # 7.2 Batch processing
    print("\n7.2 批量处理优化")

    def batch_embed_documents(embeddings_model, texts: List[str], batch_size: int = 32):
        """Embed *texts* in consecutive batches of at most *batch_size* items.

        Raises:
            ValueError: If *batch_size* is not positive.
        """
        if batch_size <= 0:
            # A negative step would make range() empty and silently drop all texts.
            raise ValueError("batch_size must be a positive integer")

        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            print(f"处理批次 {i//batch_size + 1}: {len(batch)} 个文档")

            batch_embeddings = embeddings_model.embed_documents(batch)
            all_embeddings.extend(batch_embeddings)

        return all_embeddings

    # 7.3 Asynchronous embedding
    print("\n7.3 异步嵌入处理")

    async def async_embed_documents(embeddings_model, texts: List[str]):
        """Embed *texts* concurrently by running chunks in the default executor."""
        # Inside a coroutine the running loop is the correct one to use;
        # get_event_loop() is discouraged there.
        loop = asyncio.get_running_loop()

        # Split into at most four chunks; "+ 1" keeps the chunk size non-zero.
        chunk_size = len(texts) // 4 + 1
        tasks = [
            loop.run_in_executor(
                None,
                embeddings_model.embed_documents,
                texts[i:i + chunk_size]
            )
            for i in range(0, len(texts), chunk_size)
        ]

        # gather() returns results in task order, so document order is kept.
        results = await asyncio.gather(*tasks)

        all_embeddings = []
        for result in results:
            all_embeddings.extend(result)

        return all_embeddings

### 嵌入质量评估

In [None]:

def embedding_quality_evaluation():
    """Run simple quality checks on an embedding model.

    Step 8.1 prints the cosine similarity of five word pairs next to a
    hand-written expectation label. Step 8.2 only prints its heading and
    defines a clustering-quality helper, which is not invoked here.
    Returns ``None``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("8. 嵌入质量评估")
    print(rule)

    # 8.1 Semantic similarity test
    print("\n8.1 语义相似度测试")

    # (first text, second text, expected similarity label)
    similarity_tests = [
        ("人工智能", "AI技术", "高相似度"),
        ("机器学习", "深度学习", "中等相似度"),
        ("计算机", "苹果", "低相似度"),
        ("狗", "猫", "中等相似度"),
        ("汽车", "飞机", "低相似度")
    ]

    try:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        print("语义相似度测试结果:")
        for left, right, expectation in similarity_tests:
            left_vec = embeddings.embed_query(left)
            right_vec = embeddings.embed_query(right)

            norm_product = np.linalg.norm(left_vec) * np.linalg.norm(right_vec)
            score = np.dot(left_vec, right_vec) / norm_product
            print(f"{left} vs {right}: {score:.4f} ({expectation})")

    except Exception as exc:
        print(f"语义相似度测试失败: {exc}")

    # 8.2 Clustering quality
    print("\n8.2 聚类质量评估")

    def evaluate_clustering_quality(embeddings_model, texts: List[str], labels: List[str]):
        """Cluster the embeddings of *texts* and score them against *labels*.

        Runs KMeans with one cluster per distinct label and prints/returns
        the adjusted Rand index. Returns ``None`` (after printing a message)
        when scikit-learn is missing or any other error occurs.
        """
        try:
            from sklearn.cluster import KMeans
            from sklearn.metrics import adjusted_rand_score

            # Embed the texts
            vectors = np.array(embeddings_model.embed_documents(texts))

            # One cluster per distinct ground-truth label
            cluster_count = len(set(labels))
            clusterer = KMeans(n_clusters=cluster_count, random_state=42)
            assigned = clusterer.fit_predict(vectors)

            # Adjusted Rand index against the ground-truth labels
            ari_score = adjusted_rand_score(labels, assigned)
            print(f"聚类质量 (ARI): {ari_score:.4f}")

            return ari_score

        except ImportError:
            print("需要安装scikit-learn: pip install scikit-learn")
        except Exception as exc:
            print(f"聚类评估失败: {exc}")

### 自定义嵌入包装器

In [None]:

def custom_embedding_wrapper():
    """Demonstrate a custom ``Embeddings`` subclass that ensembles models.

    Defines ``MultiModelEmbeddings`` (a weighted average of several models'
    vectors), builds it around a single HuggingFace model, and prints the
    resulting dimension. Failures of the demo are printed, not raised.
    Returns ``None``.
    """
    print("\n" + "=" * 60)
    print("9. 自定义嵌入包装器")
    print("=" * 60)

    from langchain_core.embeddings import Embeddings

    class MultiModelEmbeddings(Embeddings):
        """Weighted average of the embeddings produced by several models.

        All wrapped models must produce vectors of the same dimension.
        """

        def __init__(self, models: List[Embeddings], weights: Optional[List[float]] = None):
            """Store the models and their normalised weights.

            Args:
                models: Non-empty list of embedding models.
                weights: One weight per model; uniform when omitted or empty.

            Raises:
                ValueError: If *models* is empty, the number of weights does
                    not match the number of models, or the weights sum to 0.
            """
            if not models:
                raise ValueError("MultiModelEmbeddings requires at least one model")
            self.models = list(models)

            raw_weights = list(weights) if weights else [1.0] * len(self.models)
            if len(raw_weights) != len(self.models):
                # zip() would otherwise silently ignore the surplus entries.
                raise ValueError(
                    f"expected {len(self.models)} weights, got {len(raw_weights)}"
                )

            # Normalise so the weights sum to 1
            total_weight = sum(raw_weights)
            if total_weight == 0:
                raise ValueError("weights must not sum to zero")
            self.weights = [w / total_weight for w in raw_weights]

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            """Return the weighted average of every model's document embeddings.

            Raises:
                ValueError: If the models return differently shaped results.
            """
            if not texts:
                return []

            combined = None
            for model, weight in zip(self.models, self.weights):
                vectors = np.asarray(model.embed_documents(texts), dtype=float)
                if combined is None:
                    combined = np.zeros_like(vectors)
                elif vectors.shape != combined.shape:
                    # Averaging is only meaningful for equal dimensions; fail
                    # with a clear message instead of a broadcasting error.
                    raise ValueError(
                        "all models must produce embeddings of the same shape: "
                        f"{combined.shape} vs {vectors.shape}"
                    )
                combined += vectors * weight

            return combined.tolist()

        def embed_query(self, text: str) -> List[float]:
            """Return the combined embedding of a single query text."""
            # Note: this goes through each model's embed_documents, not embed_query.
            return self.embed_documents([text])[0]

    # Usage example
    try:
        # Base model(s) to combine
        model1 = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        models = [model1]  # more models can be appended here
        weights = [1.0]    # one weight per model

        multi_embeddings = MultiModelEmbeddings(models, weights)

        test_text = "多模型嵌入测试"
        result = multi_embeddings.embed_query(test_text)
        print(f"多模型嵌入维度: {len(result)}")

    except Exception as e:
        print(f"多模型嵌入失败: {e}")

In [None]:

def main():
    """Run every embedding demo in order, then print model-selection advice."""
    print("🚀 LangChain 0.3 Embedding Models 完整示例")
    print("=" * 80)

    # Run the demos one after another. Their return values are not used
    # here, so they are not bound to names.
    ollama_embeddings_example()
    openai_embeddings_example()
    huggingface_embeddings_example()
    sentence_transformers_example()
    cloud_embeddings_example()
    embedding_performance_comparison()
    advanced_embedding_techniques()
    embedding_quality_evaluation()
    custom_embedding_wrapper()

    print("\n🎉 所有嵌入模型示例运行完成！")

    # Best-practice recommendations, printed one per line
    recommendations = (
        "\n📋 嵌入模型选择建议:",
        "1. 本地部署：Ollama + nomic-embed-text",
        "2. 云端服务：OpenAI text-embedding-3-small",
        "3. 开源方案：HuggingFace sentence-transformers",
        "4. 中文优化：BAAI/bge-small-zh-v1.5",
        "5. 多语言：paraphrase-multilingual-mpnet-base-v2",
        "6. 高性能：text-embedding-3-large",
    )
    for line in recommendations:
        print(line)

if __name__ == "__main__":
    main()

In [42]:
# 3. Embedding Models 示例
def embedding_models_example():
    """Compact Ollama embedding demo.

    Embeds four sample sentences and one query with ``OllamaEmbeddings`` and
    prints the cosine similarity between the query and each sentence.

    Returns:
        The ``OllamaEmbeddings`` instance, or ``None`` when any step raises.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("3. Embedding Models 嵌入模型示例")
    print(rule)

    # 3.1 Ollama embedding model
    print("\n3.1 Ollama嵌入模型")
    try:
        embeddings = OllamaEmbeddings(
            model="nomic-embed-text",  # or another embedding model
            base_url="http://localhost:11434",
        )

        # Sample corpus
        texts = [
            "人工智能是计算机科学的分支",
            "机器学习是AI的子集",
            "深度学习使用神经网络",
            "今天天气很好"
        ]

        # Embed the documents and the query
        text_embeddings = embeddings.embed_documents(texts)
        query_embedding = embeddings.embed_query("什么是人工智能？")

        print(f"文档嵌入数量: {len(text_embeddings)}")
        print(f"嵌入向量维度: {len(text_embeddings[0])}")
        print(f"查询嵌入维度: {len(query_embedding)}")

        # Local import keeps this cell runnable on its own.
        import numpy as np

        print("\n相似度计算:")
        query_norm = np.linalg.norm(query_embedding)
        for idx, text in enumerate(texts):
            doc_vector = text_embeddings[idx]
            score = np.dot(query_embedding, doc_vector) / (
                query_norm * np.linalg.norm(doc_vector)
            )
            print(f"'{text}' 相似度: {score:.4f}")

        return embeddings

    except Exception as exc:
        print(f"Ollama嵌入模型初始化失败: {exc}")
        print("请确保Ollama服务正在运行并安装了嵌入模型")
        return None
# 3. 嵌入模型
embeddings = embedding_models_example()


3. Embedding Models 嵌入模型示例

3.1 Ollama嵌入模型
文档嵌入数量: 4
嵌入向量维度: 768
查询嵌入维度: 768

相似度计算:
'人工智能是计算机科学的分支' 相似度: 0.8551
'机器学习是AI的子集' 相似度: 0.6135
'深度学习使用神经网络' 相似度: 0.5818
'今天天气很好' 相似度: 0.5851
