# Using Open Source Models for Summarization/QA/Embeddings

we will use google open source model gemma 2b

In [1]:
import cProfile
import pstats
import io
import logging
import torch
from transformers import AutoTokenizer, pipeline
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from raptor.SummarizationModels import BaseSummarizationModel
from raptor.QAModels import BaseQAModel
from raptor.EmbeddingModels import BaseEmbeddingModel
from raptor.RetrievalAugmentation import RetrievalAugmentationConfig
from raptor.RetrievalAugmentation import RetrievalAugmentation

import os

2024-06-08 15:03:29,422 - Loading faiss with AVX2 support.
2024-06-08 15:03:29,901 - Successfully loaded faiss with AVX2 support.


In [3]:

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authenticate with HuggingFace if needed
login()

class GEMMASummarizationModel(BaseSummarizationModel):
    def __init__(self, model_name="google/gemma-2b-it"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.summarization_pipeline = pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        )

    def summarize(self, context, max_tokens=150):
        messages = [{"role": "user", "content": f"Write a summary of the following, including as many key details as possible: {context}:"}]
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.summarization_pipeline(prompt, max_new_tokens=max_tokens, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        summary = outputs[0]["generated_text"].strip()
        return summary

class GEMMAQAModel(BaseQAModel):
    def __init__(self, model_name="google/gemma-2b-it"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.qa_pipeline = pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        )

    def answer_question(self, context, question):
        messages = [{"role": "user", "content": f"Given Context: {context} Give the best full answer amongst the option to question {question}"}]
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.qa_pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        answer = outputs[0]["generated_text"][len(prompt):]
        return answer

class SBertEmbeddingModel(BaseEmbeddingModel):
    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"):
        self.model = SentenceTransformer(model_name)

    def create_embedding(self, text):
        return self.model.encode(text)

def main():
    logger.info("Initializing models and configurations")
    RAC = RetrievalAugmentationConfig(
        summarization_model=GEMMASummarizationModel(),
        qa_model=GEMMAQAModel(),
        embedding_model=SBertEmbeddingModel()
    )

    RA = RetrievalAugmentation(config=RAC)

    input_dir = r'C:\Users\salah\final chabot\documents\cleaned_splited_doc'  # Directory containing the split files
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_dir, filename)
            with open(file_path, 'r', encoding='utf-8') as file:  # Specify UTF-8 encoding
                RA.add_documents(file.read())

    question = "En cas de collision avec un autre véhicule, de choc contre un corps fixe ou mobile, ou de renversement sans collision préalable, du véhicule assuré, l’assureur garantit quoi?"
    
    logger.info("Answering question: %s", question)
    answer = RA.answer_question(question=question)
    logger.info("Answer: %s", answer)
    print("Answer: ", answer)

if __name__ == "__main__":
    main()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

2024-06-08 15:15:30,646 - Initializing models and configurations


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-08 15:16:11,592 - Use pytorch device_name: cpu
2024-06-08 15:16:11,624 - Load pretrained SentenceTransformer: sentence-transformers/multi-qa-mpnet-base-cos-v1
2024-06-08 15:16:20,964 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.GEMMASummarizationModel object at 0x000001F52699E300>
            Embedding Models: {'EMB': <__main__.SBertEmbeddingModel object at 0x000001F4C2FD7C20>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-06-08 15:16:20,973 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encod

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-08 15:16:22,031 - Created 2 Leaf Embeddings
2024-06-08 15:16:22,032 - Building All Nodes
2024-06-08 15:16:22,034 - Using Cluster TreeBuilder
2024-06-08 15:16:22,035 - Constructing Layer 0
2024-06-08 15:16:22,036 - Stopping Layer construction: Cannot Create More Layers. Total Layers in tree: 0
2024-06-08 15:16:22,037 - Successfully initialized TreeRetriever with Config 
        TreeRetrieverConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Context Embedding Model: EMB
            Embedding Model: <__main__.SBertEmbeddingModel object at 0x000001F4C2FD7C20>
            Num Layers: None
            Start Layer: None
        
2024-06-08 15:16:27,713 - Answering question: En cas de collision avec un autre véhicule, de choc contre un corps fixe ou mobile, ou de renversement sans collision préalable, du véhicule assuré, l’assureur garantit quoi?
2024-06-08 15:16:27,714 - Using collapsed_

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-08 15:24:20,149 - Answer: L’assureur garantit le coût de réparation ou la valeur du véhicule endommagé.


Answer:  L’assureur garantit le coût de réparation ou la valeur du véhicule endommagé.


In [8]:
import torch

torch.cuda.empty_cache()

# openai api


In [28]:
import os
os.environ["OPENAI_API_KEY"] = "open_api_key"

In [24]:
from raptor import GPT3QAModel, RetrievalAugmentationConfig, RetrievalAugmentation

# Initialize the QA model
qa_model = GPT3QAModel(BaseQAModel)  # You don't need the extra parenthesis

# Initialize the RetrievalAugmentationConfig with your QA model
RAC = RetrievalAugmentationConfig(
    qa_model=qa_model  # Remove the extra parenthesis here
)

# Initialize the RetrievalAugmentation with the configuration
RA = RetrievalAugmentation(config=RAC)

2024-06-08 16:53:43,600 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <raptor.SummarizationModels.GPT3TurboSummarizationModel object at 0x000001F5345F55B0>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x000001F5345F4800>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-06-08 16:53:43,601 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
   

In [29]:
input_dir = r'C:\Users\salah\final chabot\documents\cleaned_splited_doc'  # Directory containing the split files
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:  # Specify UTF-8 encoding
                RA.add_documents(file.read())

2024-06-08 16:59:34,104 - Creating Leaf Nodes


2024-06-08 16:59:34,525 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:34,530 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:35,137 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:35,751 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:36,744 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:36,844 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:37,917 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:39,058 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08 16:59:43,328 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-06-08

RetryError: RetryError[<Future at 0x1f52d0f13d0 state=finished raised AuthenticationError>]

In [25]:
question = "En cas de collision avec un autre véhicule, de choc contre un corps fixe ou mobile, ou de renversement sans collision préalable, du véhicule assuré, l’assureur garantit quoi?"

answer = RA.answer_question(question=question)

print("Answer: ", answer)

ValueError: The TreeRetriever instance has not been initialized. Call 'add_documents' first.

In [None]:
# Save the tree by calling RA.save("path/to/save")
SAVE_PATH = "C:\Users\salah\final chabot\Tree"
RA.save(SAVE_PATH)

In [None]:
# load back the tree by passing it into RetrievalAugmentation

RA = RetrievalAugmentation(tree=SAVE_PATH)

answer = RA.answer_question(question=question)
print("Answer: ", answer)