In [0]:

%pip install pypdf langchain faiss-cpu sentence-transformers
%pip install -qq -U llama-index pydantic PyPDF2
%pip install python-pptx
%pip install python-docx
%pip install PyPDF2
dbutils.library.restartPython()


In [0]:
%pip install -U langchain-community sentence-transformers python-pptx faiss-cpu
%pip install -U langchain-community sentence-transformers python-pptx faiss-cpu openpyxl pandas
import os
from pptx import Presentation
from docx import Document as DocxDocument
import PyPDF2

# ----------- Extractor Functions -----------

def extract_text_from_pptx(file_path):
    """Return the text of every text-bearing shape in a PowerPoint deck.

    Slides are visited in deck order and shapes in slide order. Shapes that
    expose no ``text`` attribute are skipped; every other shape contributes
    its text followed by a newline.

    Args:
        file_path: Location of the ``.pptx`` file handed to ``Presentation``.

    Returns:
        One string holding a newline-terminated entry per text shape, or
        ``""`` when no shape carries text.
    """
    deck = Presentation(file_path)
    pieces = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(pieces)

def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with a
    single newline and no trailing newline is added.

    Args:
        file_path: Location of the ``.docx`` file handed to ``DocxDocument``.

    Returns:
        The newline-joined paragraph texts (``""`` for a document that has
        no paragraphs).
    """
    word_doc = DocxDocument(file_path)
    paragraph_texts = (paragraph.text for paragraph in word_doc.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_pdf(file_path):
    """Return the extractable text of a PDF, page by page.

    The file is opened in binary mode and read through ``PyPDF2.PdfReader``.
    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped; every other page contributes its text followed by
    a newline.

    Args:
        file_path: Location of the PDF file to open.

    Returns:
        The concatenated page texts, or ``""`` when no page yields text.
    """
    collected = []
    with open(file_path, "rb") as pdf_stream:
        for pdf_page in PyPDF2.PdfReader(pdf_stream).pages:
            extracted = pdf_page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
    return "".join(collected)

import pandas as pd
from openpyxl import load_workbook

# ----------- Excel Extractor and Editor -----------

def extract_text_from_excel(file_path):
    """Render every sheet of an Excel workbook as plain text.

    Each sheet becomes a ``Sheet: <name>`` header line, followed by the
    sheet's ``DataFrame.to_string(index=False)`` rendering and a blank-line
    separator.

    Args:
        file_path: Location of the workbook passed to ``pd.read_excel``.

    Returns:
        The concatenated per-sheet sections, in the order ``read_excel``
        returns them; ``""`` if it returns no sheets.
    """
    # sheet_name=None asks pandas for all sheets as a {name: DataFrame} mapping.
    workbook_frames = pd.read_excel(file_path, sheet_name=None)
    sections = [
        f"Sheet: {sheet_name}\n" + frame.to_string(index=False) + "\n\n"
        for sheet_name, frame in workbook_frames.items()
    ]
    return "".join(sections)

def edit_excel_file(file_path, sheet_name, cell, new_value):
    """Overwrite one cell of a workbook and save the workbook in place.

    When ``sheet_name`` is not one of the workbook's sheets, nothing is
    written and a failure message is printed instead.

    Args:
        file_path: Workbook to load and, on success, save back to.
        sheet_name: Name of the sheet that holds the cell.
        cell: Cell reference used to index the sheet (e.g. ``"B2"``).
        new_value: Value assigned to that cell.

    Returns:
        None. The outcome is reported only through ``print``.
    """
    book = load_workbook(file_path)

    # Guard clause: report and leave the file untouched if the sheet is absent.
    if sheet_name not in book.sheetnames:
        print(f"❌ Sheet '{sheet_name}' not found in {file_path}")
        return

    book[sheet_name][cell] = new_value
    book.save(file_path)
    print(f"✅ Edited {file_path} -> Sheet: {sheet_name}, Cell: {cell}, New Value: {new_value}")


# ----------- File Collection and Extraction -----------

# Folder scanned (non-recursively) for source documents.
file_dir = "/Workspace/Users/rajesh.ghosh@xebia.com/Rag based application/Files/"

# Lower-case file suffix -> extractor. Add an entry here to support a new format.
_extractors_by_suffix = {
    ".pptx": extract_text_from_pptx,
    ".docx": extract_text_from_docx,
    ".pdf": extract_text_from_pdf,
    ".xlsx": extract_text_from_excel,
}

documents = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the chunk order (and therefore the saved index) differ from run to run.
for filename in sorted(os.listdir(file_dir)):
    full_path = os.path.join(file_dir, filename)
    # Compare on the lower-cased name so "REPORT.PDF" or "Deck.PPTX" are not
    # silently ignored.
    lowered_name = filename.lower()
    for _suffix, _extractor in _extractors_by_suffix.items():
        if lowered_name.endswith(_suffix):
            documents.append(_extractor(full_path))
            break

# ----------- Chunking the Text -----------

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every extracted document into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters, since none is supplied here — confirm if that changes).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# Only the raw texts are passed, with no per-document metadata, so the
# resulting chunks carry no record of which file they came from.
docs = text_splitter.create_documents(documents)

# ----------- Embedding and FAISS Index -----------

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model and
# build a FAISS vector store from the result.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_store = FAISS.from_documents(docs, embedding_model)

# Save index
# NOTE(review): "your_index_storage" is a relative placeholder path, so the
# index is written relative to the kernel's current working directory.
index_dir = "your_index_storage"
faiss_store.save_local(index_dir)

# Load it back if needed
# The store is reloaded right after being written and rebinds faiss_store, so
# the rest of the notebook uses the copy read from disk.
# allow_dangerous_deserialization=True opts in to LangChain's pickle-based
# loading. That is acceptable here only because the files were written by the
# save_local call above; never point index_dir at an index from an untrusted
# source.
faiss_store = FAISS.load_local(
    folder_path=index_dir,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

# ----------- Custom Prompt Setup for LLM -----------

from langchain.chat_models import ChatDatabricks
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import RetrievalQA

# System prompt for the AI Maturity Assessment bot. It is later wrapped with
# SystemMessagePromptTemplate.from_template, so literal curly braces added to
# this text would presumably be parsed as template variables — escape them.
# NOTE(review): several instructions below appear to conflict and should be
# reconciled by the prompt owner:
#   * "Your role is to ask context-relevant, structured questions" vs.
#     "Never ask follow-up questions or suggest next steps."
#   * "Provide **brief, actionable insights or summaries**" vs.
#     "Do not provide insight,analysis only Responses should be provided."
#   * "Only use the information provided in the documents" vs. grounding
#     recommendations in external maturity frameworks.
system_prompt = """You are an AI Maturity Assessment expert bot designed to evaluate an organization's AI readiness and implementation maturity across key dimensions such as Strategy, Data, Technology, Talent, Use Cases, and Governance.

Your role is to ask context-relevant, structured questions, interpret corporate inputs (documents, slides, user text), and generate clear, concise, and non-repetitive insights or scoring feedback. You must follow this guidance:

- Maintain a **professional, neutral, and analytical tone**.
- Never make up information or assumptions. If something is unclear, ask for clarification.
- Only use the information provided in the documents.
- If the answer is not available in the documents, respond with: "The answer is not available in the provided materials."
- Do not use external knowledge or assumptions, even if the question seems answerable.
- Never ask follow-up questions or suggest next steps.
- Provide **brief, actionable insights or summaries**, using bullet points when applicable.
- Tailor your questions and feedback based on industry, organization size, and AI adoption stage if such context is available.
- Respect the confidentiality of all user-provided documents and context.
- When giving recommendations, **ground them in maturity best practices**, such as known frameworks (e.g., AI Maturity by McKinsey, Gartner, or Deloitte), without explicitly referencing them unless requested.
- Do not provide generic or vague answers; be context-specific based on input received.
- Use a **scoring system or maturity scale** only if specifically requested.
- Do not provide insight,analysis only Responses should be provided.

Example areas to probe (depending on input):
- Strategic alignment of AI with business goals
- Leadership and change management
- Data quality, accessibility, and infrastructure
- Talent readiness and upskilling efforts
- Use case maturity and value realization
- Ethics, security, and responsible AI practices"""

# Prompt template setup
# Two-message chat prompt: the system prompt above, then a human message with
# {context} and {question} placeholders (presumably filled by the "stuff"
# chain with the retrieved chunks and the user's question).
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("Context:\n{context}\n\nQuestion: {question}")
])

# LLM Configuration
# Databricks model-serving endpoint; max_tokens=300 caps each generated answer.
llm = ChatDatabricks(
    endpoint="databricks-llama-4-maverick",
    max_tokens=300
)

# Setup RetrievalQA chain using custom prompt
# as_retriever() is called without arguments, so the retriever's default
# search settings apply.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_store.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import os
from pptx import Presentation
from docx import Document as DocxDocument
import PyPDF2

# ----------- Extractor Functions -----------

def extract_text_from_pptx(file_path):
    """Return the text of every text-bearing shape in a PowerPoint deck.

    Slides are visited in deck order and shapes in slide order. Shapes that
    expose no ``text`` attribute are skipped; every other shape contributes
    its text followed by a newline.

    Args:
        file_path: Location of the ``.pptx`` file handed to ``Presentation``.

    Returns:
        One string holding a newline-terminated entry per text shape, or
        ``""`` when no shape carries text.
    """
    deck = Presentation(file_path)
    pieces = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(pieces)

def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with a
    single newline and no trailing newline is added.

    Args:
        file_path: Location of the ``.docx`` file handed to ``DocxDocument``.

    Returns:
        The newline-joined paragraph texts (``""`` for a document that has
        no paragraphs).
    """
    word_doc = DocxDocument(file_path)
    paragraph_texts = (paragraph.text for paragraph in word_doc.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_pdf(file_path):
    """Return the extractable text of a PDF, page by page.

    The file is opened in binary mode and read through ``PyPDF2.PdfReader``.
    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped; every other page contributes its text followed by
    a newline.

    Args:
        file_path: Location of the PDF file to open.

    Returns:
        The concatenated page texts, or ``""`` when no page yields text.
    """
    collected = []
    with open(file_path, "rb") as pdf_stream:
        for pdf_page in PyPDF2.PdfReader(pdf_stream).pages:
            extracted = pdf_page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
    return "".join(collected)

# ----------- File Collection and Extraction -----------

# Folder scanned (non-recursively) for source documents.
file_dir = "/Workspace/Users/rajesh.ghosh@xebia.com/Rag based application/Files/"

# Lower-case file suffix -> extractor, limited to the three extractors this
# cell defines. Add an entry here to support a new format.
_extractors_by_suffix = {
    ".pptx": extract_text_from_pptx,
    ".docx": extract_text_from_docx,
    ".pdf": extract_text_from_pdf,
}

documents = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the chunk order (and therefore the saved index) differ from run to run.
for filename in sorted(os.listdir(file_dir)):
    full_path = os.path.join(file_dir, filename)
    # Compare on the lower-cased name so "REPORT.PDF" or "Deck.PPTX" are not
    # silently ignored.
    lowered_name = filename.lower()
    for _suffix, _extractor in _extractors_by_suffix.items():
        if lowered_name.endswith(_suffix):
            documents.append(_extractor(full_path))
            break
# ----------- Chunking the Text -----------

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every extracted document into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters, since none is supplied here — confirm if that changes).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# Only the raw texts are passed, with no per-document metadata, so the
# resulting chunks carry no record of which file they came from.
docs = text_splitter.create_documents(documents)

# ----------- Embedding and FAISS Index -----------

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model and
# build a FAISS vector store from the result.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_store = FAISS.from_documents(docs, embedding_model)

# Save index
# NOTE(review): "your_index_storage" is a relative placeholder path, so the
# index is written relative to the kernel's current working directory.
index_dir = "your_index_storage"
faiss_store.save_local(index_dir)

# Load it back if needed
# The store is reloaded right after being written and rebinds faiss_store, so
# the rest of the notebook uses the copy read from disk.
# allow_dangerous_deserialization=True opts in to LangChain's pickle-based
# loading. That is acceptable here only because the files were written by the
# save_local call above; never point index_dir at an index from an untrusted
# source.
faiss_store = FAISS.load_local(
    folder_path=index_dir,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

# ----------- Custom Prompt Setup for LLM -----------

from langchain.chat_models import ChatDatabricks
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import RetrievalQA

# Define your custom system prompt
# NOTE(review): the previous cell also assigns system_prompt, prompt, llm and
# rag_chain (with a longer, stricter prompt). Whichever of the two cells ran
# last decides what the question cell below actually uses.
system_prompt = """You are a helpful AI Maturity Assessment consultant who extracts insights from corporate presentations.
Be concise, accurate, and avoid repetition. Respond in bullet points where appropriate. Do not make things up when you don't have context."""

# Prompt template setup
# Two-message chat prompt: the system prompt above, then a human message with
# {context} and {question} placeholders (presumably filled by the "stuff"
# chain with the retrieved chunks and the user's question).
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("Context:\n{context}\n\nQuestion: {question}")
])

# LLM Configuration
# Databricks model-serving endpoint; max_tokens=300 caps each generated answer.
llm = ChatDatabricks(
    endpoint="databricks-llama-4-maverick",
    max_tokens=300
)

# Setup RetrievalQA chain using custom prompt
# as_retriever() is called without arguments, so the retriever's default
# search settings apply.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_store.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)


In [0]:

# ----------- Ask Questions -----------

# Probe questions sent to the RAG chain one at a time.
# NOTE(review): the last two look like out-of-scope / adversarial probes
# (presumably to check that the system prompt makes the bot decline); confirm
# they are intentional before sharing this notebook, since the final one is
# phrased offensively.
questions = [
    "Who created this assessment",
    "Explain any one dimensions",
    "Explain what is quantum mechanics",
    "Explain why black Indian men wont fit in maturity assesment"
]

for q in questions:
    print(f"Q: {q}")
    # Chain.run() is deprecated in LangChain; invoke() takes the chain's input
    # key ("query" for RetrievalQA) and returns a dict whose "result" entry
    # holds the answer text that run() used to return directly.
    answer = rag_chain.invoke({"query": q})["result"]
    print("A:", answer)
    print("------")


Q: Who created this assessment
A: The assessment was created by the authors of the publication "AI Maturity Assessment and Alignment (AIMAA) - A Comprehensive Framework for Evaluating and Benchmarking AI Adoption in Organizations". The authors include Gopikrishnan Janakaraja from the Massachusetts Institute of Technology, among others.
------
Q: Explain any one dimensions
A: **Dimension 1: Strategy & Leadership**

This dimension assesses whether an organization has a structured AI strategy that is actively driven by leadership. It evaluates how well AI aligns with the business strategy, leadership commitment, and AI governance. A strong executive leadership and strategic direction are essential for AI maturity, as they enable organizations to treat AI as a core business driver rather than an experimental initiative. Key aspects of this dimension include:

* Alignment of AI with business strategy
* Leadership commitment to AI adoption
* Presence of a formal AI strategy and governance st

In [0]:
# Preview the leading five chunks to sanity-check how the splitter cut the text.
for chunk_number, chunk in enumerate(docs[:5], start=1):
    print(f"--- Chunk {chunk_number} ---")
    print(chunk.page_content)

--- Chunk 1 ---
AI Maturity Assessment

Assessment Levels
--- Chunk 2 ---
Focuses on Product Development and Support Teams, highlighting the application of AI throughout the Product Life Cycle, enhancements in productivity and quality, and AI-powered customer-centric approaches.
Product Teams (Business and IT, Dev and Support)
An organization is shaped by its people. This level focuses on individual team members, evaluating aspects such as their AI awareness, AI-driven innovation, and AI ethics.
Team Members
--- Chunk 3 ---
Team Members
Concentrates on organization-wide AI implementation, maturity levels, governance frameworks, and policy development.
Enterprise
Concentrates on the implementation of AI, automation, and enhanced turnaround times within knowledge and business areas, including HR, Legal, Finance, Marketing, and related functions.
Knowledge and Business Functions
1
2
3
4
--- Chunk 4 ---
Enterprise
Knowledge and Business Functions
Product Teams
(Business + IT,
Dev and Supp