Задача:
Выполнить оценку работы библиотеки по извлечению текста новости. Необходимо определить полноту извлекаемого текста и оценить значимость тех частей текста, которые выделить не удалось. Проанализировать и предложить изменения в алгоритм работы библиотеки.
 
Желаемый результат:
Заполненная таблица:
с эталоном текста для каждой новости;
с результатами оценки полноты (сравнения эталона текста и текста, извлеченного библиотекой);
с оценкой значимости текста, который выделить не удалось;
с комментариями, если есть предложения по улучшению.

In [None]:
# 📦 Стандартные библиотеки
import os
import re
from typing import List
from operator import itemgetter

# 📦 Сторонние библиотеки
import pandas as pd
from pydantic import BaseModel, Field

# 🚀 LangChain и расширения
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import (
    PydanticOutputParser,
    StructuredOutputParser,
    OutputFixingParser,
)
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.runnables import (
    Runnable,
    RunnablePassthrough,
    chain,
    RunnableConfig,
)

# 🔎 Graph Retriever
from langchain_graph_retriever import GraphRetriever
from graph_retriever.strategies import Eager
from langchain.schema import Document  # Если нужен именно этот Document
from dotenv import load_dotenv
import glob
from langchain_core.documents import Document
from keybert import KeyBERT
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_deepseek import ChatDeepSeek

In [None]:
# NOTE(review): everything in this cell is commented out, yet it holds the only
# visible definitions of `gpu_llm` and `cpu_llm`, which the following cells use
# (`prompt | gpu_llm`, `prompt | cpu_llm`). Re-enable the two
# `HuggingFacePipeline.from_model_id(...)` assignments (and their import) before
# running those cells in a fresh kernel, otherwise they raise NameError.
# Both pipelines are configured with max_new_tokens=50.

# import pandas as pd

# from pydantic import BaseModel, Field
# from typing import Dict, Any, List, Optional

# from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
# from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
# from langchain.output_parsers import OutputFixingParser
# from langchain_core.tools import tool
# from langchain_core.runnables import RunnablePassthrough
# from langchain.agents import initialize_agent, AgentType


# from langchain_huggingface.llms import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# gpu_llm = HuggingFacePipeline.from_model_id(
#     model_id="../models/Mistral-7B-Instruct-v0.3",
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 50},
# )


# cpu_llm = HuggingFacePipeline.from_model_id(
#     model_id="../models/DeepSeek-R1-Distill-Qwen-7B",
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 50},
# )


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cpu


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [2]:

# Smoke test for `gpu_llm`: send one question through a step-by-step prompt
# and print whatever the chain returns.
question = "What is electroencephalography?"

# Prompt text with a single `{question}` placeholder.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)
prompt = PromptTemplate.from_template(template)

# Pipe the formatted prompt into the model.
gpu_chain = prompt | gpu_llm

print(
    gpu_chain.invoke(
        {"question": question},
    )
)

Question: What is electroencephalography?

Answer: Let's think step by step.

Electroencephalography (EEG) is a technique used to record the electrical activity of the brain.

Here's a breakdown:

1. 'Electro': Related to electricity.

2


In [None]:
# Smoke test for `cpu_llm`: send one question through a step-by-step prompt
# and print whatever the chain returns.
question = "What is electroencephalography?"

# Prompt text with a single `{question}` placeholder.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)
prompt = PromptTemplate.from_template(template)

# Pipe the formatted prompt into the model.
cpu_chain = prompt | cpu_llm

print(
    cpu_chain.invoke(
        {"question": question},
    )
)

Question: What is electroencephalography?

Answer: Let's think step by step. Electroencephalography, or EEG, is a method used to record electrical activity in the brain. It works by placing small sensors, called electrodes, on the scalp. These electrodes detect the electrical signals produced by the brain's neurons as they communicate


## Load data

In [None]:
# Read tmp/data_test_web.csv into a DataFrame.
data_test_web = pd.read_csv("tmp/data_test_web.csv")

# Drop every row whose "web_text" value is missing, then renumber the index
# from 0. `axis` must be a valid axis name ("index" / 0 selects rows); pandas
# raises ValueError for an unrecognized name.
data_test_web = data_test_web.dropna(subset=["web_text"], axis="index").reset_index(drop=True)

# Last expression of the cell: the notebook displays the first rows.
data_test_web.head()