<a href="https://colab.research.google.com/github/sr0uc0/CEHV12_StudyGuide/blob/main/infosec_rag_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: create a RAG application using mistralai/Mistral-7B-Instruct-v0.3 where the app will take into consideration infosec policy documents (pdf or  text) to answer queries which are ingested from the column of an excel file. The output should be another column in the same excel file

!pip install -q transformers accelerate bitsandbytes sentence_transformers faiss-cpu openpyxl pymupdf tiktoken openai langchain_community

In [2]:
!pip install -Uqq ipdb
import ipdb

In [3]:
!pip install huggingface_hub torch torchvision torchaudio mistral_inference tensorflow openpyxl protobuf google-search-results numexpr langchainhub sentencepiece jinja2 pylance pypdf



In [4]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Using cached langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Using cached langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [5]:
import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from langchain_community.document_loaders import TextLoader, PyPDFLoader
# PDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceEmbeddings
from huggingface_hub import snapshot_download
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

import fitz  # PyMuPDF
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
import tiktoken
import openai
from google.colab import drive

from sentence_transformers import SentenceTransformer, util

from google.colab import userdata




In [6]:
class InfoSecRAGApp:
    def __init__(self,
                 policy_docs_path: str,
                 model_name: str = "mistralai/Mistral-7B-Instruct-v0.3"):
        """
        Initialize the InfoSec RAG Application

        Args:
            policy_docs_path (str): Directory containing policy documents
            model_name (str): Hugging Face model for generation
        """
        self.policy_docs_path = policy_docs_path

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        # Load and process policy documents
        self.vector_store = self._load_and_process_documents()

        # Initialize LLM
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
        except Exception as e:
            print(f"Error loading model: {e}")

    def _load_and_process_documents(self) -> FAISS:
        """
        Load policy documents, split into chunks, create vector embeddings

        Returns:
            FAISS: Vector store of document embeddings
        """
        documents = []

        # Support multiple document types
        supported_extensions = ['.txt', '.pdf']

        for filename in os.listdir(self.policy_docs_path):
            filepath = os.path.join(self.policy_docs_path, filename)

            if os.path.splitext(filename)[1].lower() in supported_extensions:
                if filename.endswith('.pdf'):
                    loader = PyPDFLoader(filepath)
                else:
                    loader = TextLoader(filepath, encoding='utf-8')

                docs = loader.load()
                documents.extend(docs)

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        texts = text_splitter.split_documents(documents)

        # Create vector store
        return FAISS.from_documents(texts, self.embeddings)

    def retrieve_relevant_context(self, query: str, top_k: int = 5) -> List[str]:
        """
        Retrieve most relevant document chunks for a query

        Args:
            query (str): Search query
            top_k (int): Number of top relevant chunks to retrieve

        Returns:
            List of relevant text chunks
        """
        retrieval_results = self.vector_store.similarity_search(query, k=top_k)
        return [doc.page_content for doc in retrieval_results]

    def generate_response(self, query: str, context: List[str]) -> str:
        """
        Generate response using RAG approach

        Args:
            query (str): User's query
            context (List[str]): Retrieved relevant contexts

        Returns:
            str: Generated response
        """
        # Construct prompt with context and query
        prompt = f"""
        Context Information:
        {' '.join(context)}

        Query: {query}

        Based on the provided context, please provide a detailed and precise answer
        to the query. If no clear answer can be found, state that insufficient
        information is available.
        """

        # Set max_new_tokens for desired response length
        max_new_tokens = 250  # Adjust this value based on your needs


        # Generate response
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            max_length=500,
            num_return_sequences=1,
            temperature=0.7
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def process_excel_queries(self, input_excel: str, output_excel: str, inputCol: str, outputCol: str):
        """
        Process queries from input Excel and generate responses

        Args:
            input_excel (str): Path to input Excel with queries
            output_excel (str): Path to output Excel with responses
        """
        # Read input Excel
        df = pd.read_excel(input_excel)

        # Add response column
        if inputCol in df.columns:
        # Apply process_single_query only if 'Query' column exists
            df[outputCol] = df[inputCol].apply(self.process_single_query)
        else:
            print("Error: 'Query' column not found in input Excel")

        # Save to output Excel
        df.to_excel(output_excel, index=False)
        print(f"Responses saved to {output_excel}")

    def process_single_query(self, query: str) -> str:
        """
        Process a single query through RAG pipeline

        Args:
            query (str): Individual query

        Returns:
            str: Generated response
        """
        context = self.retrieve_relevant_context(query)
        return self.generate_response(query, context)

In [7]:
def main():
    # Example usage

    # Set your Hugging Face API token as an environment variable

    drive.mount('/content/drive')

    # Get your Hugging Face token
    #  Go to your Hugging Face profile, then settings, then Access Tokens, then New Token
    #  Copy the token and paste it below, replacing "YOUR_HUGGINGFACE_TOKEN"

    HF_TOKEN = userdata.get('HF_TOKEN')
    # Save the token to an environment variable - this is how `transformers` library will pick it up
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HF_AUTH_TOKEN"] = HF_TOKEN

    # Explicitly set the cache directory to ensure consistent downloads
    cache_dir = "/content/drive/MyDrive/modelCache" # Replace with your desired cache directory
    os.makedirs(cache_dir, exist_ok=True) # Create the directory if it doesn't exist

    print(os.environ.get('HF_AUTH_TOKEN'))
    print(os.environ.get('HF_TOKEN'))

    # Example usage (replace with your file paths and column names)
    input_excel_file = "/content/drive/MyDrive/InfoSecQuestionnaires/infosec_queries.xlsx" # Replace with your excel file path
    output_excel_file = "/content/drive/MyDrive/InfoSecQuestionnaires/infosec_responses.xlsx" # Replace with your desired output excel file path

    policy_docs_dir = "/content/drive/MyDrive/InfoSecDocs" # Replace with your policy docs directory
    query_column = "Queries" # Replace with the name of your query column
    output_column = "Answers"  # Replace with the desired output column name

    rag_app = InfoSecRAGApp(
        policy_docs_path=policy_docs_dir,
    )

    rag_app.process_excel_queries(
        input_excel=input_excel_file,
        output_excel=output_excel_file,
        inputCol=query_column,
        outputCol=output_column)

In [8]:
if __name__ == "__main__":
    main()


Mounted at /content/drive
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
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


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Responses saved to /content/drive/MyDrive/InfoSecQuestionnaires/infosec_responses.xlsx
