# Генерация датасета

In [None]:
!pip install langchain pymupdf transformers torch langchain_community rank_bm25 ragatouille unstructured "unstructured[pdf]" update poppler-utils

За основу для генерации датасета мы берём чанки из нормативных документов.

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded documents into overlapping chunks
# (chunk_size=800, chunk_overlap=200), with separators listed from the
# coarsest (blank line) to the finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)

In [None]:
import pandas as pd
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
# Load every PDF found under the corpus directory and split it into chunks
# with the splitter configured earlier in the notebook.
loader = DirectoryLoader(
    "/kaggle/input/legall/Legal_doc_NEW",
    glob="**/*.pdf",
    show_progress=True,
    loader_cls=PyMuPDFLoader,
)
documents = loader.load_and_split(text_splitter)

# Parallel lists: chunk text and the metadata attached to each chunk.
texts = [chunk.page_content for chunk in documents]
metadata = [chunk.metadata for chunk in documents]

# Keep only chunks of at least 150 characters and renumber the rows from 0.
docs_df = pd.DataFrame({'text': texts, 'metadata': metadata})
docs_df = docs_df[docs_df['text'].map(len) >= 150].reset_index(drop=True)
print(docs_df.head())


100%|██████████| 32/32 [00:03<00:00, 10.38it/s]


In [None]:
pip install openai


Генерация синтетического датасета с помощью gpt-4o-mini. Генерация происходит батчами через Batch API OpenAI.

In [None]:
import os
import openai
from openai import OpenAI
import json
import random
import time
from datasets import Dataset, DatasetDict

# Paste the key below for a one-off run, or leave it empty to use the
# OPENAI_API_KEY that is already present in the environment. Assigning the
# empty string unconditionally would wipe out a correctly configured key.
_inline_api_key = ''  # chatgpt key
if _inline_api_key:
    os.environ['OPENAI_API_KEY'] = _inline_api_key

openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    # Fail here with a clear message instead of at the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set: paste the key into _inline_api_key "
        "or export it in the environment before running this cell."
    )
client = openai.OpenAI()

def generate_request_payload(text, custom_id, model="gpt-4o-mini", request_type="question"):
    """Build one request entry for a chat-completions batch input file.

    Only the first 500 characters of ``text`` are embedded in the prompt.

    Args:
        text: Source chunk the prompt is built from.
        custom_id: Identifier stored in the request under ``custom_id``.
        model: Model name placed in the request body.
        request_type: ``"question"`` to ask for a question about the text,
            ``"negative_example"`` to ask for a contrasting sentence.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys.

    Raises:
        ValueError: If ``request_type`` is not one of the two supported
            values (previously this surfaced as an ``UnboundLocalError``).
    """
    excerpt = text[:500]
    if request_type == "question":
        message = f"Прочитайте следующий текст и задайте к нему вопрос. В вопросах ты не должен ссылаться напрямую на текст. Задвай вопрос так, как будто у тебя нет контекста: {excerpt}"
        max_tokens, temperature = 100, 0.3
    elif request_type == "negative_example":
        message = f"Напиши одно предложение, которое будет являться противоположным относительно Пример: {excerpt}. Оно должно иметь стиль,структуру текста, как в примере, но при этом не быть слишком похожим. Пиши сразу ответы без нумерации и чего-то ещё лишнего."
        max_tokens, temperature = 600, 0.2
    else:
        raise ValueError(
            f"Unsupported request_type: {request_type!r} "
            "(expected 'question' or 'negative_example')"
        )

    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [{"role": "user", "content": message}],
            "max_tokens": max_tokens,
            "temperature": temperature,
        },
    }

def create_batch_file(texts, filename="batch_input.jsonl", request_type="question"):
    """Write one JSON request per text to ``filename`` (JSONL) and return the filename.

    Requests are numbered from 1 and get ``custom_id`` values of the form
    ``request-<n>``; the file is written as UTF-8 without ASCII escaping.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for position, chunk in enumerate(texts, start=1):
            payload = generate_request_payload(
                chunk,
                custom_id=f"request-{position}",
                request_type=request_type,
            )
            line = json.dumps(payload, ensure_ascii=False)
            out.write(line + '\n')
    return filename

def upload_batch_file(filename):
    """Upload ``filename`` with purpose ``"batch"`` and return the id of the created file."""
    with open(filename, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="batch")
    file_id = uploaded.id
    return file_id

def create_batch(batch_input_file_id):
    """Create a chat-completions batch for an uploaded input file and return the batch id."""
    request = {
        "input_file_id": batch_input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "metadata": {"description": "synthetic data generation"},
    }
    return client.batches.create(**request).id

def check_batch_status(batch_id, timeout=3600, interval=60):
    """Poll a batch until it completes and return the completed batch object.

    Args:
        batch_id: Id of the batch to poll.
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between two polls.

    Returns:
        The batch object retrieved when its status is ``'completed'``.

    Raises:
        RuntimeError: If the batch ends as failed, expired or cancelled.
        TimeoutError: If the batch has not completed within ``timeout``.
    """
    # 'cancelled' is terminal too; without it a cancelled batch would be
    # polled until the timeout. NOTE(review): status names follow the OpenAI
    # Batch API documentation — confirm against the installed SDK version.
    terminal_failures = ('failed', 'expired', 'cancelled')
    # Monotonic clock: the elapsed time is unaffected by system clock changes.
    started = time.monotonic()
    while True:
        batch = client.batches.retrieve(batch_id)
        status = batch.status
        if status == 'completed':
            return batch
        if status in terminal_failures:
            raise RuntimeError(f"Batch did not complete; final status: {status}")
        if time.monotonic() - started > timeout:
            raise TimeoutError("Batch status check timed out")
        time.sleep(interval)

def retrieve_batch_results(output_file_id):
    """Fetch the content of the file ``output_file_id`` and return it as text."""
    return client.files.content(output_file_id).text

def _parse_batch_contents(raw_results):
    """Map ``custom_id`` to the generated message content for each usable result line."""
    contents = {}
    for line in raw_results.splitlines():
        if not line.strip():
            continue
        record = json.loads(line)  # parse each JSONL line exactly once
        try:
            contents[record['custom_id']] = (
                record['response']['body']['choices'][0]['message']['content']
            )
        except (KeyError, IndexError, TypeError):
            # The line carries no usable completion (e.g. a missing or null
            # 'response', or an empty 'choices' list); leave it out so the
            # caller can skip the corresponding text.
            continue
    return contents


def _run_batch(texts, filename, request_type):
    """Write, upload and run one batch; return ``custom_id`` -> generated content."""
    batch_file = create_batch_file(texts, filename=filename, request_type=request_type)
    batch_id = create_batch(upload_batch_file(batch_file))
    finished = check_batch_status(batch_id)
    if not finished.output_file_id:
        # NOTE(review): presumably no output file exists when every request
        # failed — confirm against the Batch API documentation.
        return {}
    return _parse_batch_contents(retrieve_batch_results(finished.output_file_id))


def generate_synthetic_data_with_negatives(texts):
    """Generate (query, positive, negative) training triplets for ``texts``.

    Two batches are run one after the other: one generating a question per
    text and one generating a negative sentence per text. Results are matched
    back to the texts through the ``request-<n>`` custom ids.

    Args:
        texts: List of source chunks; each becomes the ``positive`` of a triplet.

    Returns:
        A list of dicts with keys ``queries``, ``positive`` and ``negative``.
        Texts for which either generation is missing or empty are skipped, so
        the list may be shorter than ``texts``.
    """
    questions = _run_batch(texts, "batch_questions.jsonl", "question")
    negatives = _run_batch(texts, "batch_negatives.jsonl", "negative_example")

    examples = []
    skipped = 0
    for position, text in enumerate(texts, start=1):
        key = f"request-{position}"
        question = questions.get(key)
        negative = negatives.get(key)
        if not question or not negative:
            # An incomplete triplet would be unusable for training.
            skipped += 1
            continue
        examples.append({
            'queries': question,
            'positive': text,
            'negative': negative,
        })

    if skipped:
        print(f"Skipped {skipped} of {len(texts)} texts without a complete question/negative pair")
    return examples

# Generate one (query, positive, negative) example per filtered chunk.
texts = docs_df['text'].tolist()
all_synthetic_data = generate_synthetic_data_with_negatives(texts)

random.shuffle(all_synthetic_data)

# Derive the split sizes from the list that is actually being split, so the
# 80/15/5 proportions hold even if it does not have one example per chunk.
total_texts = len(all_synthetic_data)
train_size = int(0.8 * total_texts)
val_size = int(0.15 * total_texts)
test_size = total_texts - train_size - val_size

train_examples = all_synthetic_data[:train_size]
val_examples = all_synthetic_data[train_size:train_size + val_size]
# Everything left after train and validation is the test split.
test_examples = all_synthetic_data[train_size + val_size:]

# Persist each split as pretty-printed UTF-8 JSON.
for split_path, split_examples in (
    ('train_dataset.json', train_examples),
    ('val_dataset.json', val_examples),
    ('test_dataset.json', test_examples),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        json.dump(split_examples, f, ensure_ascii=False, indent=4)

# Reload the files as datasets and group them under named splits.
train_dataset = Dataset.from_json('train_dataset.json')
val_dataset = Dataset.from_json('val_dataset.json')
test_dataset = Dataset.from_json('test_dataset.json')

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Dump every record of the test split for a quick visual check.
for example in test_dataset:
    print(example)


# Обучение эмбед модели

In [None]:
!pip install datasets sentence-transformers torch


In [None]:
from datasets import Dataset

# Reload the three splits from disk and rename the 'queries' column to
# 'anchor' on each of them.
train_dataset = Dataset.from_json('/kaggle/working/train_dataset.json').rename_column('queries', 'anchor')
val_dataset = Dataset.from_json('/kaggle/working/val_dataset.json').rename_column('queries', 'anchor')
test_dataset = Dataset.from_json('/kaggle/working/test_dataset.json').rename_column('queries', 'anchor')


In [None]:
# Print each training triplet; a blank line follows the positive and the
# negative text.
for row in train_dataset:
    print("Вопрос", row['anchor'])
    print("Позитив", row['positive'], end="\n\n")
    print("Негатив", row['negative'], end="\n\n")

In [None]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.training_args import BatchSamplers


def _triplet_evaluator(dataset, name):
    """Build a TripletEvaluator from the anchor/positive/negative columns of ``dataset``."""
    return TripletEvaluator(
        anchors=dataset["anchor"],
        positives=dataset["positive"],
        negatives=dataset["negative"],
        name=name,
    )


# Checkpoint that gets fine-tuned.
model = SentenceTransformer("intfloat/multilingual-e5-small")

# The validation split is what the trainer evaluates on.
eval_dataset = val_dataset

loss = MultipleNegativesRankingLoss(model)

# Evaluation, checkpointing and logging all happen every 5 steps; at most
# two checkpoints are kept. run_name is left unset.
training_config = dict(
    output_dir="models/zxc",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=5,
    save_strategy="steps",
    save_steps=5,
    save_total_limit=2,
    logging_steps=5,
)
args = SentenceTransformerTrainingArguments(**training_config)

# Evaluate once on the validation triplets before any training step
# (the returned metrics are not stored).
dev_evaluator = _triplet_evaluator(eval_dataset, "all-nli-dev")
dev_evaluator(model)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# Evaluate the trained model on the held-out test triplets.
test_evaluator = _triplet_evaluator(test_dataset, "all-nli-test")
test_evaluator(model)

model.save_pretrained("e5-small-fine_tune")


# Проверка embed model

In [None]:
!pip install langchain pymupdf transformers torch langchain_community ragatouille  faiss-gpu accelerate bitsandbytes unstructured "unstructured[pdf]" update poppler-utils

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import faiss
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any
import time
from IPython.display import Markdown, display



In [None]:
# Loading
# Checkpoints tried previously:
#   "/kaggle/working/output/multilingual_bi_encoder-2024-07-24_02-50-45"
#   "/kaggle/working/mmmmmmmm"
#   "intfloat/multilingual-e5-small"
#   "/kaggle/working/output/train_bi-encoder-intfloat-multilingual-e5-small-2024-07-23_03-47-54"
model_ckpt = "/kaggle/working/e5-small-fine_tune"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Use the GPU when one is available and fall back to the CPU otherwise;
# a hard-coded "cuda" device makes model.to() fail on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter


# Same chunking configuration as in the dataset-generation section:
# chunk_size=800, chunk_overlap=200, separators from coarsest to finest.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)

# Load documents
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
loader = DirectoryLoader(
    "/kaggle/input/legall/Legal_doc_NEW",
    glob="**/*.pdf",
    show_progress=True,
    loader_cls=PyMuPDFLoader,
)
documents = loader.load_and_split(text_splitter)

# Parallel lists of chunk text and chunk metadata; unlike the generation
# section, no minimum-length filter is applied here.
texts = [chunk.page_content for chunk in documents]
metadata = [chunk.metadata for chunk in documents]
docs_df = pd.DataFrame({'text': texts, 'metadata': metadata})

100%|██████████| 32/32 [00:02<00:00, 10.80it/s]


In [None]:
def cls_pooling(model_output):
    """Return the hidden state at position 0 of the second axis for every item."""
    hidden_states = model_output.last_hidden_state
    first_token = hidden_states[:, 0]
    return first_token

def get_embeddings(text_list: List[str], batch_size: int = 0):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: Texts to embed.
        batch_size: Number of texts per forward pass. Values below 1
            (including the default 0) mean one text per pass, which is how
            the function always behaved before this parameter was honoured.

    Returns:
        A 2-D numpy array with one row per input text, in input order.
        For empty input, an array with zero rows is returned.
    """
    if not text_list:
        # np.vstack([]) raises ValueError; return an empty matrix instead.
        # NOTE(review): assumes model.config exposes hidden_size — confirm.
        width = getattr(model.config, "hidden_size", 0)
        return np.empty((0, width), dtype=np.float32)

    step = batch_size if batch_size and batch_size > 0 else 1
    embeddings = []
    for start in range(0, len(text_list), step):
        batch = list(text_list[start:start + step])
        encoded_input = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        with torch.no_grad():
            model_output = model(**encoded_input)
        # NOTE(review): the E5 model card documents average pooling followed
        # by normalisation; confirm that first-token pooling is intended.
        batch_embeddings = cls_pooling(model_output).detach().cpu().numpy()
        embeddings.append(batch_embeddings)

    # Release cached GPU blocks once at the end rather than after every
    # forward pass.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return np.vstack(embeddings)


# Embed every chunk text from the `texts` list built when the corpus was loaded.
embeddings = get_embeddings(texts, batch_size=0)
#
#import numpy as np
#import pickle

#def save_embeddings_to_drive(embeddings, file_path):
#    with open(file_path, 'wb') as f:
#        pickle.dump(embeddings, f)
#

#embeddings_file_path = '/content/drive/MyDrive/embeddings.npy'
#save_embeddings_to_drive(embeddings, embeddings_file_path)


In [None]:
import pickle
import numpy as np
# Location of the pickled embeddings.
embeddings_file_path = '/kaggle/working/embeddings.pkl'


def load_embeddings_from_drive(file_path):
    """Unpickle and return the object stored in ``file_path``."""
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Example load: rebinds `embeddings` to the object stored in the pickle file,
# replacing whatever value `embeddings` held before this cell ran.
embeddings = load_embeddings_from_drive(embeddings_file_path)


In [None]:
#def save_embeddings_to_drive(embeddings, file_path):
#    with open(file_path, 'wb') as f:
#        pickle.dump(embeddings, f)


#embeddings_file_path = '/kaggle/working/embeddings.pkl'
#save_embeddings_to_drive(embeddings, embeddings_file_path)

In [None]:

# Flat L2 index whose dimensionality is the second axis of `embeddings`;
# all embedding rows are added to it.
embedding_dim = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embeddings)

# FAQ spreadsheet; its 'Question' and 'PDF Title' columns are used by the
# evaluation further down.
faq_data = pd.read_excel('/kaggle/input/legall/filtered_updated_faq_data (2) (3).xlsx')

In [None]:
import re

def extract_numbers(text):
    """Return the digit runs that directly precede ')' in ``text``, comma-joined.

    Only runs of at most two digits are kept; the result is an empty string
    when nothing qualifies.
    """
    return ','.join(
        digits
        for digits in re.findall(r'(\d+)\)', text)
        if len(digits) <= 2
    )


# Replace each 'PDF Title' value with the comma-joined numbers that
# extract_numbers finds in it.
faq_data['PDF Title'] = faq_data['PDF Title'].apply(extract_numbers)


In [None]:

def clean_sets(sets):
    """Return a new list of sets with comma-separated items split apart.

    Items containing a comma are split on it and the empty pieces are
    dropped; items without a comma are kept as they are (including an empty
    string).
    """
    result = []
    for group in sets:
        flattened = set()
        for entry in group:
            if ',' not in entry:
                flattened.add(entry)
                continue
            flattened |= {piece for piece in entry.split(',') if piece}
        result.append(flattened)
    return result


In [None]:
# Every question passed to perform_combined_search is appended here.
qwe = []


def perform_combined_search(
    question: str,
    embeddings: List[List[float]],
    faiss_index: faiss.Index,
    num_retrieved_docs: int = 3,
    rewrite_question_flag: bool = False,
) -> Tuple[List[Tuple[str, float, str]], float]:
    """Retrieve the nearest chunks for a question and keep the best hit per file.

    Args:
        question: Query text; it is embedded with a ``'query:'`` prefix.
        embeddings: Not used by this function; kept for interface compatibility.
        faiss_index: Index to search.
        num_retrieved_docs: Number of neighbours requested from the index.
        rewrite_question_flag: Not used by this function; kept for interface
            compatibility.

    Returns:
        A pair ``(hits, retrieval_time)``. ``hits`` holds one
        ``(numbers extracted from the file path, distance, chunk text)`` tuple
        per distinct file, keeping the smallest distance seen for that file.
        ``retrieval_time`` is the elapsed time in seconds.
    """
    start_time = time.time()

    qwe.append(question)
    question_embedding = get_embeddings(['query:'+question])
    D, I = faiss_index.search(question_embedding, num_retrieved_docs)

    best_per_file = {}
    for distance, row_idx in zip(D[0], I[0]):
        if row_idx < 0:
            # NOTE(review): per the FAISS documentation, missing neighbours
            # are reported as index -1 when the index holds fewer vectors
            # than requested; iloc[-1] would silently return the last row.
            continue
        doc = docs_df.iloc[row_idx]
        file_path = doc['metadata']['file_path']
        # Keep only the closest chunk for each source file.
        if file_path not in best_per_file or distance < best_per_file[file_path][0]:
            best_per_file[file_path] = (distance, doc['text'])

    retrieval_time = time.time() - start_time
    hits = [
        (extract_numbers(file_path), distance, fragment)
        for file_path, (distance, fragment) in best_per_file.items()
    ]
    return hits, retrieval_time

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# One record per FAQ question: expected vs. retrieved documents and the
# per-question precision / recall / F1.
results = []

for _, row in faq_data.iterrows():
    question = row['Question']
    expected_file_paths = set(row['PDF Title'].split(','))

    unique_file_paths_with_scores, retrieval_time = perform_combined_search(question, embeddings, faiss_index)
    unique_file_paths = {res[0] for res in unique_file_paths_with_scores}

    cleaned_expected_file_paths = clean_sets([expected_file_paths])[0]
    cleaned_unique_file_paths = clean_sets([unique_file_paths])[0]

    true_positives = len(cleaned_unique_file_paths & cleaned_expected_file_paths)
    false_positives = len(cleaned_unique_file_paths - cleaned_expected_file_paths)
    false_negatives = len(cleaned_expected_file_paths - cleaned_unique_file_paths)

    # A question with no overlap (or with an empty set on either side) scores
    # 0 instead of raising ZeroDivisionError and aborting the whole run.
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

    results.append({
        'Question': question,
        'Expected File Paths': cleaned_expected_file_paths,
        'Predicted File Paths with Scores': unique_file_paths_with_scores,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score,
        'Retrieval Time (s)': retrieval_time,
        'Rerank Time (s)': 0
    })

results_df = pd.DataFrame(results)

total_true_positives = results_df['True Positives'].sum()
total_false_positives = results_df['False Positives'].sum()
total_false_negatives = results_df['False Negatives'].sum()

# Averages are taken over questions (macro average), not over pooled counts.
average_precision = results_df['Precision'].mean()
average_recall = results_df['Recall'].mean()
average_f1_score = results_df['F1 Score'].mean()

print("-------------------------------------------------------------------")
print(f"Total True Positives: {total_true_positives}")
print(f"Total False Positives: {total_false_positives}")
print(f"Total False Negatives: {total_false_negatives}")
print(f"Average Precision: {average_precision:.2f}")
print(f"Average Recall: {average_recall:.2f}")
print(f"Average F1 Score: {average_f1_score:.2f}")
print("-------------------------------------------------------------------")
#
#
#
#def display_markdown(text):
#    display(Markdown(text))
#
#for index, row in results_df.iterrows():
#    print(row['Question'])
#    display_markdown(f"# Вопрос: {row['Question']}")
#    #display_markdown(f"Если есть llm: {qwe[index]}")
#    display_markdown(f"## **Нужные PDF:**")
#    for path in row['Expected File Paths']:
#        display_markdown(f" - **{path}**")
#    print()
#    display_markdown("## **Предсказанные PDF:**")
#    for path, score, text_fragment in row['Predicted File Paths with Scores']:
#        display_markdown(f"  - **{path} (Растояние: {score:.2f})**")
#        #display_markdown(f"**Фрагмент текста:**")
#        print(f"{text_fragment}")
#        print()