In [None]:
!pip install langchain chromadb sentence-transformers tiktoken backoff tenacity
!pip install datasets transformers evaluate



In [38]:
!pip install --upgrade openai


[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 324, in run
    session = self.get_default_session(options)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/index_command.py", line 71, in get_default_session
    self._session = self.enter_context(self._build_session(options))
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/index_command.py", line 100, in _build_sess

In [None]:
!pip install langchain-community



In [None]:
import os
import tempfile
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

import numpy as np
from datasets import load_dataset
from google.colab import userdata
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader, WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from openai import OpenAI
from transformers import pipeline



In [None]:
@dataclass
class RAGConfig:
    """Settings shared by every component of the RAG pipeline.

    The two credential fields are mandatory; everything else has a default.

    Raises:
        ValueError: if the API key or base URL is empty, if ``chunk_size`` is
            not positive, if ``chunk_overlap`` is negative or larger than
            ``chunk_size``, or if ``k_retrieval`` is smaller than 1.
    """

    # LLM API Configuration
    llm_api_key: str  # Your LLM API key
    llm_api_base_url: str  # Your LLM API base URL
    llm_api_version: str = "2024-01-01"  # API version if required
    llm_deployment_name: Optional[str] = None  # For Azure deployment

    # Embedding Configuration
    embedding_api_key: Optional[str] = None  # If using API-based embeddings
    embedding_model: str = "sentence-transformers/all-mpnet-base-v2"  # Default to local model

    # RAG Configuration
    chunk_size: int = 500  # passed to the text splitter as chunk_size
    chunk_overlap: int = 50  # passed to the text splitter as chunk_overlap
    k_retrieval: int = 4  # number of results requested per similarity search

    def __post_init__(self):
        """Reject configurations that cannot work before any component is built."""
        if not self.llm_api_key or not self.llm_api_base_url:
            raise ValueError("LLM API key and base URL are required")
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if self.chunk_overlap < 0 or self.chunk_overlap > self.chunk_size:
            raise ValueError("chunk_overlap must be between 0 and chunk_size")
        if self.k_retrieval < 1:
            raise ValueError("k_retrieval must be at least 1")

In [None]:
class Deepseek_query():
    """Small wrapper that sends one system + user message pair to DeepSeek.

    Requests go through the ``OpenAI`` client pointed at
    ``https://api.deepseek.com``. Only ``config.llm_api_key`` is read;
    ``config.llm_api_base_url`` is not used by this class.
    """

    def __init__(self, config: RAGConfig):
        """Store the config and a ready-made header dict.

        Args:
            config: Pipeline configuration; its ``llm_api_key`` is used for
                authentication.
        """
        # Not consumed by get_deepseek_response; kept as a public attribute
        # so existing code that reads it keeps working.
        self.HEADERS = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {config.llm_api_key}'
        }
        self.config = config

    def get_deepseek_response(self, system_content, user_content):
        """Return the model's reply text for one chat exchange.

        Args:
            system_content: Text of the system message.
            user_content: Text of the user message.

        Returns:
            The ``content`` of the first choice in the completion.

        Raises:
            RuntimeError: if the completion carries no choices. Failures
                raised by the client library itself propagate unchanged.
        """
        client = OpenAI(api_key=self.config.llm_api_key, base_url="https://api.deepseek.com")

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1024,
            temperature=0.7,
            stream=False
        )

        # The completion object has no status_code/text attributes, so report
        # a missing answer directly instead of dereferencing those.
        if not response or not response.choices:
            raise RuntimeError("DeepSeek API returned no completion choices")

        return response.choices[0].message.content

In [None]:
class QueryProcessor:
    """Labels a query with an intent and expands analytical queries into sub-questions."""

    def __init__(self):
        """Build the zero-shot classification pipeline used for intent detection."""
        self.intent_classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
        )

    def understand_query(self, query: str) -> Dict[str, Any]:
        """Classify ``query`` against the three supported intents.

        Returns:
            A dict with the original query, the first label reported by the
            classifier, and the first score reported by the classifier.
        """
        # The labels are an argument of the pipeline call itself.
        candidate_intents = ["factual", "analytical", "procedural"]
        prediction = self.intent_classifier(query, candidate_labels=candidate_intents)

        summary = {"original_query": query}
        summary["intent"] = prediction["labels"][0]
        summary["confidence"] = prediction["scores"][0]
        return summary

    def decompose_query(self, query_info: Dict[str, Any]) -> List[str]:
        """Turn a classified query into the list of questions to answer.

        Analytical queries become three templated sub-questions; every other
        intent yields a one-element list holding the original query.
        """
        is_analytical = query_info["intent"] == "analytical"
        topic = query_info['original_query']

        if not is_analytical:
            return [topic]

        return [
            f"What are the key facts about {topic}?",
            f"What are the relationships between different aspects of {topic}?",
            f"What are the implications of {topic}?",
        ]

In [None]:
class KnowledgeBase:
    """Splits documents into chunks, embeds them into Chroma, and serves similarity search."""

    def __init__(self, config: RAGConfig):
        """Create the embedding model and text splitter described by ``config``.

        The vector store is not created here; it stays ``None`` until
        ``ingest_documents`` has been called with at least one chunk.
        """
        self.embeddings = HuggingFaceEmbeddings(model_name=config.embedding_model)
        self.vector_store = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap
        )
        self.config = config

    def ingest_documents(self, documents: List[str], source_type: str = "text"):
        """Load, split, and index a batch of documents.

        Args:
            documents: Raw text strings when ``source_type`` is ``"text"``;
                PDF file paths when it is ``"pdf"``.
            source_type: ``"text"`` (default) or ``"pdf"``.

        Raises:
            ValueError: if ``source_type`` is neither ``"text"`` nor ``"pdf"``.
        """
        if source_type == "text":
            docs = self._load_text_documents(documents)
        elif source_type == "pdf":
            # Each loader returns a list; flatten so the splitter receives
            # one flat list of documents rather than a list of lists.
            docs = [page for path in documents for page in PyPDFLoader(path).load()]
        else:
            raise ValueError(
                f"Unsupported source_type {source_type!r}; expected 'text' or 'pdf'"
            )

        # Split documents
        split_docs = self.text_splitter.split_documents(docs)
        if not split_docs:
            # Nothing to index: leave the current vector store (if any) untouched.
            return

        # Build the vector store from this batch of chunks
        self.vector_store = Chroma.from_documents(
            documents=split_docs,
            embedding=self.embeddings
        )

    def _load_text_documents(self, texts: List[str]) -> List[Any]:
        """Round-trip each text through a temporary file and ``TextLoader``."""
        loaded = []
        for text in texts:
            temp_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
            try:
                with temp_file:
                    temp_file.write(text)
                loaded.extend(TextLoader(temp_file.name).load())
            finally:
                # delete=False means cleanup is ours; do it even if writing
                # or loading failed so no temporary file is left behind.
                os.remove(temp_file.name)
        return loaded

    def retrieve(self, query: str) -> List[Any]:
        """Return the ``config.k_retrieval`` closest chunks for ``query``.

        The elements are whatever the vector store's ``similarity_search``
        returns (document objects, not plain strings).

        Raises:
            ValueError: if nothing has been ingested yet.
        """
        if self.vector_store is None:
            raise ValueError("No documents ingested yet!")

        return self.vector_store.similarity_search(query, k=self.config.k_retrieval)

In [None]:
class Agent:
    """Base class for pipeline agents: a name, a role label, the shared config, and a state dict."""

    def __init__(self, name: str, role: str, config: RAGConfig):
        """Record the agent's identity and configuration; state starts empty."""
        self.name, self.role = name, role
        self.config = config
        self.state = dict()

In [None]:
class SearchAgent(Agent):
    """Agent named 'search' with role 'retrieval'; forwards lookups to a knowledge base."""

    def __init__(self, config: RAGConfig):
        super().__init__(name='search', role='retrieval', config=config)

    def retrieve(self, query: str, knowledge_base: KnowledgeBase) -> List[str]:
        """Ask ``knowledge_base`` for the documents matching ``query`` and return them as-is."""
        return knowledge_base.retrieve(query)




class ReasoningAgent(Agent):
    """Agent named 'reason' with role 'analysis'; asks the LLM to analyze retrieved context."""

    def __init__(self, config: RAGConfig):
        super().__init__('reason', 'analysis', config)
        self.client = Deepseek_query(config)

    @staticmethod
    def _doc_text(doc) -> str:
        """Return the text of one retrieved item.

        Objects exposing a string ``page_content`` attribute contribute that
        text; anything else (e.g. a plain string) falls back to ``str(doc)``.
        """
        text = getattr(doc, "page_content", None)
        return text if isinstance(text, str) else str(doc)

    def analyze(self, query: str, retrieved_docs: List[str]) -> str:
        """Analyze retrieved documents and generate reasoning

        Args:
            query: The user's question.
            retrieved_docs: Items returned by the retrieval step.

        Returns:
            The LLM's analysis text.
        """
        # Join the chunk texts only, so the prompt does not carry the
        # object repr (metadata and all) that str() would produce.
        context = "\n".join(self._doc_text(doc) for doc in retrieved_docs)

        system_prompt = "You are a reasoning agent that analyzes information."
        user_prompt = f"Analyze the following information in relation to the query. \n Query : {query} \n Context: {context} \n Provide a coherent analysis focusing on the most relevant information."

        return self.client.get_deepseek_response(system_prompt, user_prompt)

class ResponseAgent(Agent):
    """Agent named 'response' with role 'generation'; turns an analysis into the final answer."""

    def __init__(self, config: RAGConfig):
        super().__init__(name='response', role='generation', config=config)
        self.client = Deepseek_query(config)

    def generate(self, query: str, analysis: str) -> str:
        """Ask the LLM for a final answer to ``query`` grounded in ``analysis``."""
        system_role = "You are a response agent that generates clear and comprehensive answers."
        # The continuation lines of this literal keep their leading spaces;
        # they are part of the prompt text that is sent.
        user_prompt = f"""Generate a comprehensive response to the query based on the analysis.
        Query: {query}

        Analysis:
        {analysis}

        Provide a clear and well-structured response."""

        return self.client.get_deepseek_response(system_role, user_prompt)


In [None]:
class ReflectiveLayer:
    """Scores each answer and records which retrieval strategy the score suggests."""

    def __init__(self,config: RAGConfig):
        self.config = config
        self.performance_metrics = []  # one dict per evaluated response
        self.strategy_history = []     # one dict per adjust_strategy call

    def evaluate_response(self, query: str, response: str, retrieved_docs: List[str]) -> float:
        """Score a response, log the component scores, and return their mean."""
        relevance = self._calculate_relevance(query, retrieved_docs)
        coherence = self._calculate_coherence(response)

        record = {'query': query, 'relevance': relevance, 'coherence': coherence}
        self.performance_metrics.append(record)

        return (relevance + coherence) / 2

    def _calculate_relevance(self, query: str, docs: List[str]) -> float:
        """Placeholder relevance score: a uniform random draw from [0.7, 1.0)."""
        return np.random.uniform(0.7, 1.0)  # Replace with actual metric

    def _calculate_coherence(self, response: str) -> float:
        """Placeholder coherence score: a uniform random draw from [0.7, 1.0)."""
        return np.random.uniform(0.7, 1.0)  # Replace with actual metric

    def adjust_strategy(self, current_performance: float) -> Dict[str, Any]:
        """Pick a strategy for the given score, append it to the history, and return it.

        Scores below 0.8 ask for two extra retrieved documents and a query
        rewrite; otherwise the configured retrieval depth is kept.
        """
        base_k = self.config.k_retrieval

        if current_performance < 0.8:
            chosen = {'k_retrieval': base_k + 2, 'rewrite_query': True}
        else:
            chosen = {'k_retrieval': base_k, 'rewrite_query': False}

        self.strategy_history.append(chosen)
        return chosen

In [None]:
class AgentOrchestrator:
    """Runs the search -> reason -> respond sequence and reports the result to the reflective layer."""

    def __init__(self, config: RAGConfig):
        """Instantiate one agent per pipeline stage, keyed by stage name."""
        self.config = config
        self.agents = {
            'search': SearchAgent(config),
            'reason': ReasoningAgent(config),
            'response': ResponseAgent(config)
        }

    def coordinate(self, query: str, knowledge_base: KnowledgeBase,
                  reflective_layer: ReflectiveLayer) -> str:
        """Answer ``query`` using the three agents in order.

        Args:
            query: The user's question.
            knowledge_base: Source of retrieved documents.
            reflective_layer: Receives the response for scoring.

        Returns:
            The generated answer, or a fixed "No relevant information found."
            message when retrieval comes back empty.
        """
        # Search agent retrieves relevant documents
        retrieved_docs = self.agents['search'].retrieve(query, knowledge_base)
        if not retrieved_docs:
            return "No relevant information found."

        # Reasoning agent analyzes the information
        analysis = self.agents['reason'].analyze(query, retrieved_docs)

        # Response agent generates the final answer
        response = self.agents['response'].generate(query, analysis)

        # Evaluate and record the suggested strategy. The strategy returned by
        # adjust_strategy is not applied here, so it is not bound to a name.
        performance = reflective_layer.evaluate_response(query, response, retrieved_docs)
        reflective_layer.adjust_strategy(performance)

        return response

In [None]:
class ReflectiveRAG:
    """Top-level facade wiring together query processing, retrieval, agents, and reflection."""

    def __init__(self, config: RAGConfig):
        self.config = config
        self.query_processor = QueryProcessor()
        self.knowledge_base = KnowledgeBase(config)
        self.reflective_layer = ReflectiveLayer(config)
        self.orchestrator = AgentOrchestrator(config)

    def load_sample_dataset(self):
        """Ingest the ``context`` field of the first 1000 SQuAD training rows."""
        squad_rows = load_dataset("squad", split="train[:1000]")
        self.knowledge_base.ingest_documents([row['context'] for row in squad_rows])

    def answer_query(self, query: str) -> str:
        """Classify and decompose ``query``, then return the orchestrator's answer."""
        processor = self.query_processor
        classified = processor.understand_query(query)
        # NOTE(review): sub_queries is computed but not forwarded; the
        # orchestrator below receives the original query only.
        sub_queries = processor.decompose_query(classified)

        return self.orchestrator.coordinate(
            query=query,
            knowledge_base=self.knowledge_base,
            reflective_layer=self.reflective_layer,
        )

In [None]:
def main():
    """Build the system, index the sample dataset, and print the answer to one test question."""
    # Credentials are read from the Colab secret store.
    api_key = userdata.get('llm_api_key')
    api_url = userdata.get('llm_api_url')
    config = RAGConfig(llm_api_key=api_key, llm_api_base_url=api_url)

    rag_system = ReflectiveRAG(config)
    rag_system.load_sample_dataset()

    # Single end-to-end test question
    query = "What are the main factors affecting climate change?"
    response = rag_system.answer_query(query)

    print(f"Query: {query}")
    print(f"Response: {response}")


if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu
  self.embeddings = HuggingFaceEmbeddings(model_name=config.embedding_model)


Query: What are the main factors affecting climate change?
Response: Climate change is a complex phenomenon driven by a variety of factors, primarily resulting from human activities and natural processes. Below is a comprehensive breakdown of the **main factors affecting climate change**:

### 1. **Greenhouse Gas Emissions**
   - **Carbon Dioxide (CO₂)**: Released through the burning of fossil fuels (coal, oil, and natural gas) for energy, transportation, and industrial processes. Deforestation also contributes by reducing the number of trees that absorb CO₂.
   - **Methane (CH₄)**: Emitted during the production and transport of coal, oil, and natural gas. It is also released by livestock and other agricultural practices, as well as the decay of organic waste in landfills.
   - **Nitrous Oxide (N₂O)**: Produced by agricultural and industrial activities, as well as the combustion of fossil fuels and solid waste.

### 2. **Deforestation**
   - Forests act as carbon sinks, absorbing CO₂ f

In [None]:
def test_deepseek():
    """Smoke-test the DeepSeek client: send one fixed question and print the reply."""
    # Credentials are read from the Colab secret store.
    config = RAGConfig(
        llm_api_key=userdata.get('llm_api_key'),
        llm_api_base_url=userdata.get('llm_api_url')
    )

    deepseek_client = Deepseek_query(config)
    print(deepseek_client.get_deepseek_response("You are a helpful assistant", "What is photosynthesis?"))



In [None]:
# Run the DeepSeek smoke test; it prints the model's reply.
test_deepseek()

In [None]:
### test reflective layer
def test_reflective():
    """Construct a ReflectiveLayer from Colab-stored credentials; nothing is returned or asserted."""
    api_key = userdata.get('llm_api_key')
    api_url = userdata.get('llm_api_url')
    config = RAGConfig(llm_api_key=api_key, llm_api_base_url=api_url)
    reflective_layer = ReflectiveLayer(config)

In [None]:
cd