## Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into the process
# environment, so that HF_TOKEN can be read with os.getenv later on.
load_dotenv()

True

In [2]:
# Hugging Face API token, read from the process environment.
HF_TOKEN = os.getenv("HF_TOKEN")

# Fail fast with a real exception instead of `assert`: assertions are removed
# when Python runs with -O, and an empty string would pass an `is not None`
# check only to be rejected later when the first request is sent.
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN not found in environment")

## Imports

In [3]:
from huggingface_hub import InferenceClient
from langchain_core.runnables import RunnableLambda
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

## Embedding Model

In [4]:
# Sentence-embedding model handed to FAISS when an index is loaded.
# NOTE(review): presumably the same model the saved indexes were built with — confirm.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_model = HuggingFaceEmbeddings(


## Domain -> Vector Index Registry

In [5]:
# Registry of supported domains: each domain name maps to the directory that
# holds its saved FAISS index, i.e. "../indexes/<domain>".
DOMAIN_INDEX_PATHS = {
    domain_name: f"../indexes/{domain_name}"
    for domain_name in (
        "artificial_intelligence",
        "medical",
        "climate",
        "cyber_security",
        "business",
        "psychology",
        "automobile",
    )
}

## Domain-Specific Retrieval Depth

In [6]:
# Number of documents the retriever fetches per question, keyed by domain name.
DOMAIN_K = dict(
    artificial_intelligence=16,
    medical=12,
    cyber_security=14,
    climate=12,
    business=10,
    psychology=10,
    automobile=8,
)

## Load FAISS Database for a Domain

In [7]:
# Loaded vector stores keyed by index path, so repeated questions against the
# same domain do not re-read and re-deserialize the index from disk.
# Re-running this cell resets the cache.
_VECTOR_DB_CACHE = {}


def load_vector_db(domain: str):
    """Return the FAISS vector store for ``domain``, loading it at most once.

    Args:
        domain: A key of ``DOMAIN_INDEX_PATHS``.

    Returns:
        The store produced by ``FAISS.load_local`` for the domain's index
        path, with ``embedding_model`` as its embedding function. Repeated
        calls for the same index path return the same cached object.

    Raises:
        ValueError: If ``domain`` is not a key of ``DOMAIN_INDEX_PATHS``.
    """
    if domain not in DOMAIN_INDEX_PATHS:
        raise ValueError(f"Unsupported domain: {domain}")

    index_path = DOMAIN_INDEX_PATHS[domain]
    if index_path not in _VECTOR_DB_CACHE:
        # SECURITY: allow_dangerous_deserialization=True permits unpickling the
        # stored index data, which can execute arbitrary code. Only point
        # DOMAIN_INDEX_PATHS at index directories you created or fully trust.
        _VECTOR_DB_CACHE[index_path] = FAISS.load_local(
            index_path,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    return _VECTOR_DB_CACHE[index_path]

## RAG Prompt

In [8]:
# Prompt text for the RAG chain. It instructs the model to answer only from
# the supplied context and to admit when it does not know the answer.
# {context} and {question} are the two template placeholders.
CUSTOM_PROMPT_TEMPLATE = """
Use the pieces of information provided in the context to answer the user's question.
If you do not know the answer, say that you do not know.
Do not make up an answer.
Do not use information outside the given context.

Context:
{context}

Question:
{question}

Answer directly. No small talk.
"""

In [9]:
# Wrap the raw template text in a PromptTemplate, declaring its two placeholders.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=CUSTOM_PROMPT_TEMPLATE,
)

## Hugging Face Inference Client

In [10]:
# InferenceClient bound to the Meta-Llama-3-8B-Instruct model, authenticated with HF_TOKEN.
client = InferenceClient(
    token=HF_TOKEN,
    model="meta-llama/Meta-Llama-3-8B-Instruct",
)

## LLM Wrapper

In [11]:
def hf_llm_call(prompt: str, **kwargs) -> str:
    """Send ``prompt`` as a single user message to ``client`` and return the reply text.

    Args:
        prompt: The prompt text. An object exposing ``to_string()`` is
            converted to text with that method first (presumably the prompt
            value a LangChain chain passes in — confirm).
        **kwargs: Optional generation overrides. ``max_tokens`` (default 512)
            and ``temperature`` (default 0.2) are forwarded to
            ``client.chat_completion``; any other keys are ignored.

    Returns:
        The message content of the first completion choice, or ``""`` when
        that content is ``None``.
    """
    # Keep the parameter untouched; work on a separate text variable.
    prompt_text = prompt.to_string() if hasattr(prompt, "to_string") else prompt

    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt_text}],
        max_tokens=kwargs.get("max_tokens", 512),
        temperature=kwargs.get("temperature", 0.2),
    )

    # Normalize a missing content to "" so the declared `-> str` return holds.
    return response.choices[0].message.content or ""

In [12]:
# Wrap the plain function in a RunnableLambda so it can be passed as the
# `llm` argument when the RetrievalQA chain is built.
llm = RunnableLambda(hf_llm_call)

## Create Domain Specific QA Chain

In [13]:
def create_qa_chain(domain: str):
    """Build a "stuff" RetrievalQA chain over the vector index of ``domain``.

    The retriever fetches ``DOMAIN_K[domain]`` documents per query (10 when
    the domain has no entry in ``DOMAIN_K``). The chain is configured with
    ``llm``, the custom ``prompt``, and ``return_source_documents=True``.
    """
    domain_retriever = load_vector_db(domain).as_retriever(
        search_kwargs={"k": DOMAIN_K.get(domain, 10)}
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=domain_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )

## Unified Question Answering Function

In [14]:
def answer_question(question: str, domain: str):
    """Answer ``question`` using a QA chain created for ``domain``.

    A chain is created with ``create_qa_chain(domain)`` and invoked with
    ``{"query": question}``; whatever the chain returns is passed back unchanged.
    """
    return create_qa_chain(domain).invoke({"query": question})

## Test - Artificial Intelligence Domain

In [15]:
# Sample question for the artificial_intelligence domain.
query = "What are the recent research trends in Artificial Intelligence?"

In [16]:
# Run the sample question against the artificial_intelligence index.
response = answer_question(question=query, domain="artificial_intelligence")

In [17]:
# Show the generated answer, then the "source" metadata value of every
# returned source document (None is printed when a document has no "source").
print("ANSWER:\n", response["result"])
print("\nSOURCES:")
for doc in response["source_documents"]:
    print(doc.metadata.get("source"))

ANSWER:
 The recent research trends in Artificial Intelligence include:

1. Exponential growth in the number of publications on AI, with a nearly 20-fold increase from 7 papers in 2014 to 200 papers in 2024.
2. A deepening research focus in the field of AI, with a significant increase in the number of publications in the years 2019-2024.
3. The development of new applications of AI, such as speech recognition systems, inventory control systems, surveillance systems, robots, and search engines.
4. The creation of new jobs in machine learning, data science, and other related fields.
5. The growth of new businesses and investment opportunities due to the endless applications of AI in various fields, including agriculture, education, transportation, finance, biotechnology, cybersecurity, and gaming.

These trends indicate a significant increase in research and development in the field of AI, with a focus on creating new applications and opportunities.

SOURCES:
AI computer science 4.pdf
AI

In [18]:
# Same smoke test for the cyber_security domain; only the answer is printed.
query = "What are common cyber security attack vectors?"
response = answer_question(question=query, domain="cyber_security")

print("ANSWER:\n", response["result"])


ANSWER:
 According to the provided context, common cyber security attack vectors include:

1. Phishing: a practice of fooling individuals into exposing sensitive information by employing dishonest methods, such as bogus emails, websites, or ideas.
2. Malware: a hateful operating system, such as viruses, worms, Trojan stallions, ransomware, spyware, or adware, that aims to penetrate calculating systems, steal data, restrict workflow, or cause harm.
3. DDoS & DoS Attack: a type of attack that aims to classify network traces as normal or anomalous, and can be used to overwhelm a system or network with traffic, making it difficult to function.
4. Social engineering: an action taken by opponents to deceive individuals into revealing sensitive information, which can be used to make them more inclined to click on links, spread malware, or support distressing causes.
