In [2]:
import glob
import json
import os

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.document_loaders import JSONLoader
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain.evaluation.qa import QAEvalChain

import pandas as pd

#_MODEL_NAME = "deepseek-r1:7b"
_MODEL_NAME = "qwen2.5:7b"

In [3]:
model = OllamaLLM(model=_MODEL_NAME, temperature=0)
eval_chain = QAEvalChain.from_llm(model)

In [4]:
with open("data/sanguo.txt") as f:
    sanguo_content = f.read()

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " ", "。", ""]
)
texts = text_splitter.split_text(sanguo_content)
texts[0]

'《三国演义》（精校版全本）作者：罗贯中\n \n\n内容简介\n\n\u3000\u3000《三国演义》由东汉末年黄巾起义末期开始描写，至西晋初期国家重归统一结束，以魏、蜀、吴三个政治、军事集团之间的形成演变，矛盾斗争为主线，最后由晋统一全国，国家重归统一。小说在广阔的社会历史背景上，展示出那个时代尖锐复杂又极具特色的政治军事冲突，在军事政治谋略方面，对后世产生了极其深远的影响。\n\n\u3000\u3000《三国演义》是中国第一部长篇章回体小说，中国小说由短篇发展至长篇的原因与说书有关。宋代讲故事的风气盛行，说书成为一种职业，说书人喜欢拿古代人物的故事作为题材来敷演，而陈寿撰、裴松之注的《三国志》里面的人物众多，以多个主人公做线索，事件纷繁，正是撰写故事的最好素材。三国故事某些零星片段原来在民间也已流传，加上说书人长期取材，内容越来越丰富，人物形象越来越饱满，最后由许多独立的故事逐渐组合而成长篇巨著。这些各自孤立的故事在社会上经过漫长时间口耳相传，最后得以加工、集合成书，成为中国第一部长篇章回体小说，这是一种了不起的集体创造，与由单一作者撰写完成的小说在形式上有所不同。《三国演义》对后来的小说相信有一定的启导作用。\n\n\n第一回 宴桃园豪杰三结义 斩黄巾英雄首立功\n\n\u3000\u3000滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几度夕阳红。白发渔樵江渚上，惯看秋月春风。一壶浊酒喜相逢。古今多少事，都付笑谈中。——调寄《临江仙》\n\n\u3000\u3000话说天下大势，分久必合，合久必分。周末七国分争，并入于秦。及秦灭之后，楚、汉分争，又并入于汉。汉朝自高祖斩白蛇而起义，一统天下，后来光武中兴，传至献帝，遂分为三国。推其致乱之由，殆始于桓、灵二帝。桓帝禁锢善类，崇信宦官。及桓帝崩，灵帝即位，大将军窦武、太傅陈蕃共相辅佐。时有宦官曹节等弄权，窦武、陈蕃谋诛之，机事不密，反为所害，中涓自此愈横。'

In [10]:
len(texts[0])

772

In [13]:
len(texts)

775

In [4]:
from langchain_core.documents import Document

docs = [
    Document(page_content=text, metadata={"source": f"chunk-{i}"})
    for i, text in enumerate(texts[:10])
]

embedding_model = OllamaEmbeddings(model=_MODEL_NAME)
vector_index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embedding_model,
).from_documents(docs)



In [5]:
vector_index.vectorstore.similarity_search("张角", k=3)

[Document(metadata={'source': 'chunk-9'}, page_content='朱儁大喜，便令坚攻打南门，玄德打北门，朱儁打西门，留东门与贼走。孙坚首先登城，斩贼二十余人，贼众奔溃。赵弘飞马突槊，直取孙坚。坚从城上飞身夺弘槊，刺弘下马；却骑弘马，飞身往来杀贼。孙仲引贼突出北门，正迎玄德，无心恋战，只待奔逃。玄德张弓一箭，正中孙仲，翻身落马。朱儁大军随后掩杀，斩首数万级，降者不可胜计。南阳一路，十数郡皆平。\n\n\u3000\u3000儁班师回京，诏封为车骑将军，河南尹。儁表奏孙坚、刘备等功。坚有人情，除别郡司马上任去了。惟玄德听候日久，不得除授，三人郁郁不乐，上街闲行，正值郎中张钧车到。玄德见之，自陈功绩。钧大惊，随入朝见帝曰：“昔黄巾造反，其原皆由十常侍卖官鬻爵，非亲不用，非仇不诛，以致天下大乱。今宜斩十常侍，悬首南郊，遣使者布告天下，有功者重加赏赐，则四海自清平也。”十常侍奏帝曰：“张钧欺主。”帝令武士逐出张钧。十常侍共议：“此必破黄巾有功者，不得除授，故生怨言。权且教省家铨注微名，待后却再理会未晚。”因此玄德除授定州中山府安喜县尉，克日赴任。\n\n\u3000\u3000玄德将兵散回乡里，止带亲随二十余人，与关、张来安喜县中到任。署县事一月，与民秋毫无犯，民皆感化。到任之后，与关、张食则同桌，寝则同床。如玄德在稠人广坐，关、张侍立，终日不倦。\n\n\u3000\u3000到县未及四月，朝廷降诏，凡有军功为长吏者当沙汰。玄德疑在遣中。适督邮行部至县，玄德出郭迎接，见督邮施礼。督邮坐于马上，惟微以鞭指回答。关、张二公俱怒。及到馆驿，督邮南面高坐，玄德侍立阶下。良久，督邮问曰：“刘县尉是何出身？”玄德曰：“备乃中山靖王之后；自涿郡剿戮黄巾，大小三十余战，颇有微功，因得除今职。”督邮大喝曰：“汝诈称皇亲，虚报功绩！目今朝廷降诏，正要沙汰这等滥官污吏！”玄德喏喏连声而退。归到县中，与县吏商议。吏曰：“督邮作威，无非要贿赂耳。”玄德曰：“我与民秋毫无犯，那得财物与他？”次日，督邮先提县吏去，勒令指称县尉害民。玄德几番自往求免，俱被门役阻住，不肯放参。'),
 Document(metadata={'source': 'chunk-5'}, page_content='杀到天明，张梁、张宝引败残军士，夺路而走。忽见一彪军马，尽打红旗，当

In [6]:
from langchain.evaluation.qa import QAGenerateChain
example_gen_chain = QAGenerateChain.from_llm(model)
examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in docs[:3]]
)



In [7]:
len(examples)

3

In [9]:
for e in examples:
    print(e)

{'qa_pairs': {'query': '《三国演义》的作者是谁？', 'answer': '罗贯中。'}}
{'qa_pairs': {'query': "According to the document, what significant event happened on April 15th of the second year of Jianning during Emperor's reign?", 'answer': 'On April 15th of the second year of Jianning, a large green snake suddenly flew down from the ceiling and coiled itself on the throne while the emperor was about to sit in his seat. This event caused the emperor to faint, and all officials fled in panic.'}}
{'qa_pairs': {'query': '张角、张宝和张梁分别自称为哪位“将军”？', 'answer': '张角自称“天公将军”，张宝称“地公将军”，张梁称“人公将军”。'}}


In [13]:
example_data = [qa['qa_pairs'] for qa in examples]
eval_chain.evaluate(example_data, example_data,
                    question_key="query",
                    answer_key="answer",
                    prediction_key="answer",)

[{'results': 'CORRECT'}, {'results': 'GRADE: CORRECT'}, {'results': 'CORRECT'}]

In [21]:
eval_chain.evaluate(
    [
        {
            "query": "刘备的三弟是谁？张飞也叫张益德",
            "answer": "张飞",
        }
    ],
    [
        {
            #"query": "刘备的三弟是谁？",
            "answer": "张益德",
        }
    ],
    question_key="query",
    answer_key="answer",
    prediction_key="answer",
)

[{'results': 'GRADE: INCORRECT\n\nExplanation: The student\'s answer "张益德" is not factually accurate according to the true answer, which states that "张飞也叫张益德," meaning Zhang Fei is also known as Zhang Yi De. Therefore, while Zhang Yi De is a correct alias for Zhang Fei, the question specifically asks for who刘备的三弟是谁 (Rúdài de sān dì shì shéi), which should be answered with "张飞" (Zhang Fei).'}]

In [5]:
result = model("刘备的三弟是谁？")
result

  result = model("刘备的三弟是谁？")


'刘备的三弟是刘封。在《三国演义》中，刘封是刘备的养子，本名关平，字云长之子，因战功被刘备收为养子，并封为立义侯，故改名为刘封。他在刘备集团中有一定的地位和军事才能，在攻打东吴时不幸战死。'

In [6]:
model("刘备的结义兄弟中，排行第三的是谁？")

'刘备的结义兄弟中，排行第三的是关羽。在《三国演义》中，刘备、关羽和张飞是在桃园中结为异姓兄弟，其中刘备居长，关羽次之，张飞排第三。不过需要注意的是，在正史《三国志》中并没有关于桃园结义的记载，这一情节更多是后世文学作品中的虚构内容。'

In [24]:
from langchain_experimental.text_splitter import SemanticChunker

help(SemanticChunker)

Help on class SemanticChunker in module langchain_experimental.text_splitter:

class SemanticChunker(langchain_core.documents.transformers.BaseDocumentTransformer)
 |  SemanticChunker(embeddings: langchain_core.embeddings.embeddings.Embeddings, buffer_size: int = 1, add_start_index: bool = False, breakpoint_threshold_type: Literal['percentile', 'standard_deviation', 'interquartile', 'gradient'] = 'percentile', breakpoint_threshold_amount: Optional[float] = None, number_of_chunks: Optional[int] = None, sentence_split_regex: str = '(?<=[.?!])\\s+', min_chunk_size: Optional[int] = None)
 |
 |  Split the text based on semantic similarity.
 |
 |  Taken from Greg Kamradt's wonderful notebook:
 |  https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
 |
 |  All credits to him.
 |
 |  At a high level, this splits into sentences, then groups into groups of 3
 |  sentences, and then merges one that are similar in t

In [26]:
semantic_chunker = SemanticChunker(embeddings=embedding_model)
texts = semantic_chunker.split_text(sanguo_content)
print(len(texts))
print(len(texts[0]))

1
606381
