<a href="https://colab.research.google.com/github/saliSoul/Q-A-ai-powered-System/blob/main/QA_Ai_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary dependencies
!pip install langchain chromadb sentence-transformers transformers rank_bm25 pypdf langchain-community huggingface_hub


In [None]:
import os
import shutil
import google.colab
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from rank_bm25 import BM25Okapi

# Path for ChromaDB storage
CHROMA_DB_DIR = "Chroma"



In [None]:
#Upload Files ()
def upload_files():
    """Upload PDF or TXT files to Colab."""
    print("📂 Upload your PDF or TXT files...")
    uploaded = google.colab.files.upload()
    data_dir = "/content/data"
    os.makedirs(data_dir, exist_ok=True)

    for filename in uploaded.keys():
        file_path = os.path.join(data_dir, filename)
        with open(file_path, "wb") as f:
            f.write(uploaded[filename])

    print(f"✅ Uploaded {len(uploaded)} files.")
    return data_dir


In [None]:
# Load Documents
def load_documents(directory):
    """Load text and PDF files from a directory efficiently."""
    docs = []
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)
        if file.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif file.endswith(".txt"):
            loader = TextLoader(file_path)
        else:
            continue
        docs.extend(loader.load())
    return docs


In [None]:
#To Upload and process new documents
data_directory = upload_files()
documents = load_documents(data_directory)

if not documents:
    raise ValueError("No valid documents found! Please upload PDF or TXT files.")

📂 Upload your PDF or TXT files...


✅ Uploaded 0 files.


In [None]:
#Dynamic Chunking (Adjusts based on document size)
def adaptive_chunking(docs):
    """Dynamically chunk documents based on size."""
    avg_length = sum(len(doc.page_content) for doc in docs) / len(docs)
    chunk_size = min(512, int(avg_length / 2))
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=int(chunk_size * 0.2)
    )
    return text_splitter.split_documents(docs)

chunks = adaptive_chunking(documents)


In [None]:
# Embedding using "bge-base-en"
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en")

#ChromaDB for Vector Storage
vector_store = Chroma.from_documents(splits, embeddings, persist_directory=CHROMA_DB_DIR)

vector_store = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)

#Using BM25 for Sparse Retrieval (Lexical Matching)
bm25 = BM25Okapi([doc.page_content.split() for doc in splits])

qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    max_new_tokens=200,
    temperature=0.5,
    top_p=0.85
)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

Device set to use cpu


In [None]:
#Prompt template
PROMPT_TEMPLATE = """
You are an AI assistant. Use ONLY the provided context to answer.

Context:
{context}

Question: {question}

Give a well-structured, accurate response.
"""

prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])

#Hybrid Retrieval: Combines BM25 + ChromaDB for better search
def retrieve_documents(query, top_k=5):
    dense_results = vector_store.similarity_search(query, k=top_k)
    sparse_results = bm25.get_top_n(query.split(), splits, n=top_k)

    # Merge results ( to avoid duplicates)
    combined_results = {doc.page_content: doc for doc in (dense_results + sparse_results)}
    return list(combined_results.values())

In [None]:
def answer_question(question):
    """Retrieve relevant documents and generate an answer."""
    results = retrieve_documents(question)

    if not results:
        return "No relevant information found."

    context_text = "\n\n---\n\n".join([doc.page_content for doc in results])

    #Formating prompt
    formatted_prompt = prompt.format(context=context_text, question=question)

    #Get the response from LLM
    response_text = qa_pipeline(formatted_prompt)[0]["generated_text"]

    # to Extract sources
    # sources = [doc.metadata.get("source", "Unknown") for doc in results]

    return f"\n💡 Response: {response_text}"

# 💬 **Interactive Chat**
while True:
    query = input("\n💬 Ask a question (or type 'exit' to quit): ")
    if query.lower() == "exit":
        print("Alright. Have a great day! :)")
        break

    response = answer_question(query)
    print(response)

#push the code to github



💬 Ask a question (or type 'exit' to quit): what does neov do?


Token indices sequence length is longer than the specified maximum sequence length for this model (1121 > 512). Running this sequence through the model will result in indexing errors



💡 Response: focuses on artificial intelligence ( AI ) and business process automation, helping organizations streamline operations and improve efficiency through advanced AI-driven solutions.

💬 Ask a question (or type 'exit' to quit): who is neov?

💡 Response: a rapidly growing consulting firm operating across the African continent, specializing in the insurance and fintech sectors.

💬 Ask a question (or type 'exit' to quit): how many projects is neov working on/

💡 Response: --- Recent Projects : 1. AI-Powered Legal Bot - Automates contract analysis and legal document generation - Helps law firms and legal departments reduce review time - Identifies key legal clauses for better workflow efficiency 2. Fraud Detection for Insurance - AI models analyze historical data and transaction patterns - Detects fraudulent claims and reduces financial losses - Enhances claims assessment accuracy for insurance companies

💬 Ask a question (or type 'exit' to quit): what's the ai-powered legal bot a

In [6]:
!git config --global user.name "saliSoul"
!git config --global user.email "rohiisalma1@gmail.com"


In [9]:
!git clone https://github.com/saliSoul/Q-A-ai-powered-System

Cloning into 'Q-A-ai-powered-System'...
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
!mv /content/path_to_your_project/* /content/YOUR-REPO/


In [None]:
%cd /content/YOUR-REPO
!git add .
!git commit -m "Added my project files"
!git push https://USERNAME:TOKEN@github.com/YOUR-USERNAME/YOUR-REPO.git


In [1]:
GITHUB_REPO = "https://github.com/saliSoul/Q-A-ai-powered-System.git"
COMMIT_MESSAGE = "Initial commit: Hybrid Q&A System with ChromaDB + BM25"

!git config --global user.name "saliSoul"
!git config --global user.email "rohiisalma1@gmail.com"

!git init
!git remote add origin {GITHUB_REPO}

!git add .
!git commit -m "{COMMIT_MESSAGE}"

!git branch -M main
!git push -u origin main


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
[master (root-commit) fef463d] Initial commit: Hybrid Q&A System with ChromaDB + BM25
 21 files changed, 51023 insertions(+)
 create mode 100644 .config/.last_opt_in_prompt.yaml
 create mode 100644 .config/.last_survey_prompt.yaml
 create mode 100644 .config/.last_update_check.json
 create mode 100644 .config/active_config
 create mode 100644 .config/config_sentinel
 create mode 100644 .config/configurations/config_default
 create mode 100644 .config/default_c

In [3]:
!git init
!git add README.md
!git commit -m "first commit"
!git branch -M main
!git remote add origin git@github.com:saliSoul/Q-A-ai-powered-System.git
!git push -u origin main

Reinitialized existing Git repository in /content/.git/
fatal: pathspec 'README.md' did not match any files
On branch main
nothing to commit, working tree clean
error: remote origin already exists.
fatal: could not read Username for 'https://github.com': No such device or address
