Module 1: Cloud Data Source Connector Framework
Focus:

Connect to AdventureWorks (PostgreSQL or SQL Server)

Map schema into a clean JSON abstraction

Extract basic metadata: tables, columns, relationships

In [1]:
!pip install -q sentence-transformers faiss-cpu


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m842.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install -q langchain


In [3]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [4]:
!pip install groq

Collecting groq
  Downloading groq-0.22.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.22.0-py3-none-any.whl (126 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.7/126.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.22.0


In [115]:
biz_question ="What is the total revenue generated by each genre?"

In [116]:
import sqlite3
import json
import re
import os

#  STEP 1: Make sure your DB is in the Colab environment
# Upload manually if needed:
# from google.colab import files
# uploaded = files.upload()

# STEP 2: Point to the database file path
db_path = "/content/Chinook_Sqlite.sqlite"
assert os.path.exists(db_path), "Database file not found at /content"

# STEP 3: Connect and process
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Get all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = [row[0] for row in cursor.fetchall()]

def beautify_name(name):
    name = name.replace("_", " ").replace("ID", "Id")
    return re.sub(r'(?<!^)(?=[A-Z])', ' ', name).title()

def generate_column_description(name):
    return f"{beautify_name(name)} of the record."

enriched_schema = []

for table in tables:
    # Columns
    cursor.execute(f"PRAGMA table_info('{table}')")
    cols = cursor.fetchall()

    # Foreign keys
    cursor.execute(f"PRAGMA foreign_key_list('{table}')")
    fks = cursor.fetchall()

    # Row count
    try:
        cursor.execute(f"SELECT COUNT(*) FROM '{table}'")
        row_count = cursor.fetchone()[0]
    except:
        row_count = None

    enriched_schema.append({
        "table_name": table,
        "business_name": beautify_name(table),
        "description": f"Contains information related to {beautify_name(table)}.",
        "row_count": row_count,
        "columns": [{
            "column_name": col[1],
            "business_name": beautify_name(col[1]),
            "data_type": col[2],
            "is_primary_key": bool(col[5]),
            "description": generate_column_description(col[1])
        } for col in cols],
        "foreign_keys": [{
            "from_column": fk[3],
            "to_table": fk[2],
            "to_column": fk[4]
        } for fk in fks]
    })

conn.close()

# STEP 4: Save output
output_path = "/content/schema_metadata_enriched.json"
with open(output_path, "w") as f:
    json.dump(enriched_schema, f, indent=2)

print(f"Enriched schema saved to {output_path}")


Enriched schema saved to /content/schema_metadata_enriched.json


In [117]:
import json
import re

# Load previously enriched schema
with open("/content/schema_metadata_enriched.json", "r") as f:
    enriched_schema = json.load(f)

# Sample term dictionary (can be expanded)
business_term_dict =  {
    # Album/Artist
    "AlbumId": "Album Identifier",
    "Title (Album)": "Album Title",
    "ArtistId": "Artist Identifier",
    "Name (Artist)": "Artist Name",

    # Customer
    "CustomerId": "Customer Identifier",
    "FirstName (Customer)": "First Name", # Corrected as requested
    "LastName (Customer)": "Last Name", # Corrected as requested
    "Company (Customer)": "Customer Company Name",
    "Address (Customer)": "Customer Address",
    "City (Customer)": "Customer City",
    "State (Customer)": "Customer State",
    "Country (Customer)": "Customer Country",
    "PostalCode (Customer)": "Customer Postal Code",
    "Phone (Customer)": "Customer Phone Number",
    "Fax (Customer)": "Customer Fax Number",
    "Email (Customer)": "Customer Email Address",
    "SupportRepId": "Customer Support Representative Identifier",

    # Employee
    "EmployeeId": "Employee Identifier",
    "LastName (Employee)": "Last Name", # Corrected as requested
    "FirstName (Employee)": "First Name", # Corrected as requested
    "Title (Employee)": "Employee Job Title",
    "ReportsTo": "Employee Manager Identifier",
    "BirthDate": "Employee Birth Date",
    "HireDate": "Employee Hire Date",
    "Address (Employee)": "Employee Address",
    "City (Employee)": "Employee City",
    "State (Employee)": "Employee State",
    "Country (Employee)": "Employee Country",
    "PostalCode (Employee)": "Employee Postal Code",
    "Phone (Employee)": "Employee Phone Number",
    "Fax (Employee)": "Employee Fax Number",
    "Email (Employee)": "Employee Email Address",

    # Genre/Media Type
    "GenreId": "Genre Identifier",
    "Name (Genre)": "Genre Name",
    "MediaTypeId": "Media Type Identifier",
    "Name (MediaType)": "Media Type Name",

    # Invoice
    "InvoiceId": "Invoice Identifier",
    "CustomerId (Invoice)": "Customer Identifier",
    "InvoiceDate": "Invoice Date",
    "BillingAddress": "Invoice Billing Address",
    "BillingCity": "Invoice Billing City",
    "BillingState": "Invoice Billing State",
    "BillingCountry": "Invoice Billing Country",
    "BillingPostalCode": "Invoice Billing Postal Code",
    "Total": "Invoice Total Amount",

    # Invoice Line
    "InvoiceLineId": "Invoice Line Identifier",
    "InvoiceId (InvoiceLine)": "Invoice Identifier",
    "TrackId (InvoiceLine)": "Track Identifier",
    "UnitPrice (InvoiceLine)": "Invoice Line Unit Price",
    "Quantity": "Invoice Line Quantity",

    # Playlist/Track
    "PlaylistId": "Playlist Identifier",
    "Name (Playlist)": "Playlist Name",
    "TrackId (PlaylistTrack)": "Track Identifier",
    "Name (Track)": "Track Name",
    "Composer": "Track Composer",
    "Milliseconds": "Track Duration (Milliseconds)",
    "Bytes": "Track File Size (Bytes)",
    "UnitPrice (Track)": "Track Unit Price",
}

# Print the dictionary (optional)
for term, definition in business_term_dict.items():
    print(f"{term}: {definition}")

# Normalize keys for matching
normalized_dict = {k.lower(): v for k, v in business_term_dict.items()}

# Enrich business_name, tagging, and description
for table in enriched_schema:
    table_name = table["table_name"].lower()

    for col in table["columns"]:
        col_key = col["column_name"].lower().replace("_", "")
        business_term = normalized_dict.get(col_key)

        if business_term:
            col["business_name"] = business_term
            col["business_term_tagged"] = True
            col["description"] = f"{business_term} of the {table['table_name']}."
        else:
            col["business_name"] = col["column_name"]
            col["business_term_tagged"] = False

            # Context-sensitive description fallback
            if "customer" in table_name:
                col["description"] = f"{col['column_name']} of the customer."
            elif "invoice" in table_name:
                col["description"] = f"{col['column_name']} related to the invoice."
            elif "track" in table_name:
                col["description"] = f"{col['column_name']} of the track or song."
            elif "employee" in table_name:
                col["description"] = f"{col['column_name']} of the employee."
            else:
                col["description"] = f"{col['column_name']} of the record."
            # Context-sensitive description fallback
            if "customer" in table_name:
              col["description"] = f"{col['column_name']} of the customer."
            elif "invoice" in table_name:
              col["description"] = f"{col['column_name']} related to the invoice."
            elif "track" in table_name:
              col["description"] = f"{col['column_name']} of the track or song."
            elif "employee" in table_name:
              col["description"] = f"{col['column_name']} of the employee."
            elif "artist" in table_name:
              col["description"] = f"{col['column_name']} related to the artist or band."
            elif "album" in table_name:
                col["description"] = f"{col['column_name']} related to the album."
            elif "playlist" in table_name:
                col["description"] = f"{col['column_name']} related to the playlist."
            elif "mediatype" in table_name:
                col["description"] = f"{col['column_name']} describing the media format."
            elif "genre" in table_name:
                col["description"] = f"{col['column_name']} representing the music genre."
            else:
                col["description"] = f"{col['column_name']} of the record."



# Save updated schema
output_path = "/content/schema_metadata_enriched_with_tags.json"
with open(output_path, "w") as f:
    json.dump(enriched_schema, f, indent=2)

print(f" Business term tagging completed and saved to {output_path}")


AlbumId: Album Identifier
Title (Album): Album Title
ArtistId: Artist Identifier
Name (Artist): Artist Name
CustomerId: Customer Identifier
FirstName (Customer): First Name
LastName (Customer): Last Name
Company (Customer): Customer Company Name
Address (Customer): Customer Address
City (Customer): Customer City
State (Customer): Customer State
Country (Customer): Customer Country
PostalCode (Customer): Customer Postal Code
Phone (Customer): Customer Phone Number
Fax (Customer): Customer Fax Number
Email (Customer): Customer Email Address
SupportRepId: Customer Support Representative Identifier
EmployeeId: Employee Identifier
LastName (Employee): Last Name
FirstName (Employee): First Name
Title (Employee): Employee Job Title
ReportsTo: Employee Manager Identifier
BirthDate: Employee Birth Date
HireDate: Employee Hire Date
Address (Employee): Employee Address
City (Employee): Employee City
State (Employee): Employee State
Country (Employee): Employee Country
PostalCode (Employee): Emplo

In [118]:
import pprint

# Print each table's metadata cleanly
for table in enriched_schema:
    print(f"\n Table: {table['table_name']} ({table['business_name']})")
    pprint.pprint(table, indent=2)
    print("=" * 80)


 Table: Album (Album)
{ 'business_name': 'Album',
  'columns': [ { 'business_name': 'Album Identifier',
                 'business_term_tagged': True,
                 'column_name': 'AlbumId',
                 'data_type': 'INTEGER',
                 'description': 'Album Identifier of the Album.',
                 'is_primary_key': True},
               { 'business_name': 'Title',
                 'business_term_tagged': False,
                 'column_name': 'Title',
                 'data_type': 'NVARCHAR(160)',
                 'description': 'Title related to the album.',
                 'is_primary_key': False},
               { 'business_name': 'Artist Identifier',
                 'business_term_tagged': True,
                 'column_name': 'ArtistId',
                 'data_type': 'INTEGER',
                 'description': 'Artist Identifier of the Album.',
                 'is_primary_key': False}],
  'description': 'Contains information related to Album.',
  'foreign_key

# Module 3

In [119]:
# Re-run after kernel reset: setup and FAISS indexing logic

from sentence_transformers import SentenceTransformer
import faiss
import json
import os


# Load enriched schema with tags
with open("schema_metadata_enriched_with_tags.json", "r") as f:
    schema = json.load(f)

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare text chunks and metadata
chunks = []
metadata = []

for table in schema:
    chunk_text = f"Table: {table['business_name']} ({table['table_name']})\n"
    chunk_text += f"Description: {table['description']}\n"
    chunk_text += "Columns:\n"
    for col in table["columns"]:
        chunk_text += f" - {col['business_name']} ({col['column_name']}): {col['description']}\n"
    if table["foreign_keys"]:
        chunk_text += "Relationships:\n"
        for fk in table["foreign_keys"]:
            chunk_text += f" - {fk['from_column']} → {fk['to_table']}.{fk['to_column']}\n"
    chunks.append(chunk_text)
    metadata.append({
        "table_name": table["table_name"],
        "business_name": table["business_name"],
        "num_columns": len(table["columns"]),
        "num_foreign_keys": len(table["foreign_keys"])
    })

# Generate embeddings
embeddings = model.encode(chunks)

# Create FAISS index
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Example inputs: chunks and model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
docs = [Document(page_content=chunk) for chunk in chunks]

# Create vectorstore
vectorstore = FAISS.from_documents(docs, embedding_model)

# Save to a folder (not a file)
vectorstore.save_local("schema_faiss")

with open("schema_chunks.json", "w") as f:
    json.dump(chunks, f, indent=2)
with open("schema_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

print("FAISS index and chunk metadata saved.")


FAISS index and chunk metadata saved.


In [120]:
# Re-import after runtime reset (safe to run again in Colab)

import json
import faiss
from sentence_transformers import SentenceTransformer
from google.colab import files

# Load the enriched schema
with open("schema_metadata_enriched_with_tags.json", "r") as f:
    schema = json.load(f)

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare column-level chunks
column_chunks = []
column_metadata = []

for table in schema:
    for col in table["columns"]:
        chunk_text = f"Table: {table['business_name']} ({table['table_name']})\n"
        chunk_text += f"Column: {col['business_name']} ({col['column_name']})\n"
        chunk_text += f"Type: {col['data_type']}\n"
        chunk_text += f"Description: {col['description']}\n"
        chunk_text += f"Primary Key: {'Yes' if col['is_primary_key'] else 'No'}\n"
        chunk_text += f"Business Term Tagged: {'Yes' if col.get('business_term_tagged') else 'No'}\n"
         # Inject foreign key info if applicable
        for fk in table["foreign_keys"]:
            if fk["from_column"] == col["column_name"]:
                chunk_text += f"Foreign Key: {fk['from_column']} → {fk['to_table']}.{fk['to_column']}\n"

        column_chunks.append(chunk_text)
        column_metadata.append({
            "table_name": table["table_name"],
            "column_name": col["column_name"],
            "business_name": col["business_name"],
            "is_primary_key": col["is_primary_key"]
        })

# Embed column-level chunks
column_embeddings = model.encode(column_chunks)

# Create FAISS index for column-level embeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Assume `column_chunks` contains your list of string chunks
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
column_docs = [Document(page_content=chunk) for chunk in column_chunks]

# Build and save
column_vectorstore = FAISS.from_documents(column_docs, embedding_model)
column_vectorstore.save_local("schema_column_faiss")  # <- save in a folder with this exact name


with open("column_chunks.json", "w") as f:
    json.dump(column_chunks, f, indent=2)
with open("column_metadata.json", "w") as f:
    json.dump(column_metadata, f, indent=2)

print("Column-level FAISS index and metadata saved.")


Column-level FAISS index and metadata saved.


In [121]:
for i, chunk in enumerate(column_chunks[:5]):
    print(f"\n--- Column Chunk {i+1} ---\n{chunk}")


--- Column Chunk 1 ---
Table: Album (Album)
Column: Album Identifier (AlbumId)
Type: INTEGER
Description: Album Identifier of the Album.
Primary Key: Yes
Business Term Tagged: Yes


--- Column Chunk 2 ---
Table: Album (Album)
Column: Title (Title)
Type: NVARCHAR(160)
Description: Title related to the album.
Primary Key: No
Business Term Tagged: No


--- Column Chunk 3 ---
Table: Album (Album)
Column: Artist Identifier (ArtistId)
Type: INTEGER
Description: Artist Identifier of the Album.
Primary Key: No
Business Term Tagged: Yes
Foreign Key: ArtistId → Artist.ArtistId


--- Column Chunk 4 ---
Table: Artist (Artist)
Column: Artist Identifier (ArtistId)
Type: INTEGER
Description: Artist Identifier of the Artist.
Primary Key: Yes
Business Term Tagged: Yes


--- Column Chunk 5 ---
Table: Artist (Artist)
Column: Name (Name)
Type: NVARCHAR(120)
Description: Name related to the artist or band.
Primary Key: No
Business Term Tagged: No



# Module 4

In [122]:
import json
import faiss
from sentence_transformers import SentenceTransformer

# Placeholder chunks representing business knowledge (simulated from genbi.pdf)
genbi_chunks = [
    "Business Rule: Total Earnings = LineTotal * AverageRate",
    "Definition: Sales Territory refers to the geographic region where a sale is made.",
    "KPI: Average Order Value is calculated as Total Sales / Number of Orders",
    "Term Mapping: 'Earnings' maps to 'LineTotal * CurrencyRate'",
    "Metric: Customer Retention Rate is a key metric tracked monthly."
]

# Embed with sentence-transformers
model = SentenceTransformer("all-MiniLM-L6-v2")
genbi_embeddings = model.encode(genbi_chunks)

# Create FAISS index
dim = genbi_embeddings[0].shape[0]
genbi_index = faiss.IndexFlatL2(dim)
genbi_index.add(genbi_embeddings)

# Save outputs
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Assume `chunks` and `embeddings` already exist
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

docs = [Document(page_content=chunk) for chunk in chunks]

vectorstore = FAISS.from_documents(docs, embedding_model)
vectorstore.save_local("genbi_faiss.index")  # This creates BOTH .faiss and .pkl files

with open("genbi_chunks.json", "w") as f:
    json.dump(genbi_chunks, f, indent=2)

print("Placeholder business knowledge base saved as genbi_faiss.index and genbi_chunks.json")


Placeholder business knowledge base saved as genbi_faiss.index and genbi_chunks.json


# Module 5

In [123]:
# LangChain-based RAG Core Setup for Module 5: Retrieval Engine

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.retrievers import EnsembleRetriever
import faiss
import json

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load vectorstores
table_vectorstore = FAISS.load_local(
    folder_path="schema_faiss",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

column_vectorstore = FAISS.load_local(
    folder_path="schema_column_faiss",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

try:
    genbi_vectorstore = FAISS.load_local(
        folder_path="genbi_faiss",
        embeddings=embedding_model
    )
except Exception:
    genbi_vectorstore = None

# Define your query
user_question = biz_question

# Individually retrieve relevant docs
table_docs = table_vectorstore.as_retriever(search_kwargs={"k": 5}).get_relevant_documents(user_question)
column_docs = column_vectorstore.as_retriever(search_kwargs={"k": 5}).get_relevant_documents(user_question)
genbi_docs = genbi_vectorstore.as_retriever(search_kwargs={"k": 3}).get_relevant_documents(user_question) if genbi_vectorstore else []

# Combine and deduplicate schema chunks
schema_chunks = list(set(doc.page_content for doc in (table_docs + column_docs)))

# Separate business chunks
business_chunks = list(set(doc.page_content for doc in genbi_docs))

# Optional: print for inspection
print("\n Schema Chunks:")
for i, chunk in enumerate(schema_chunks):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")

print("\nBusiness Chunks:")
for i, chunk in enumerate(business_chunks):
    print(f"\n--- Chunk {i+1} ---\n{chunk}")





 Schema Chunks:

--- Chunk 1 ---
Table: Album (Album)
Description: Contains information related to Album.
Columns:
 - Album Identifier (AlbumId): Album Identifier of the Album.
 - Title (Title): Title related to the album.
 - Artist Identifier (ArtistId): Artist Identifier of the Album.
Relationships:
 - ArtistId → Artist.ArtistId


--- Chunk 2 ---
Table: Track (Track)
Column: UnitPrice (UnitPrice)
Type: NUMERIC(10,2)
Description: UnitPrice of the track or song.
Primary Key: No
Business Term Tagged: No


--- Chunk 3 ---
Table: Genre (Genre)
Description: Contains information related to Genre.
Columns:
 - Genre Identifier (GenreId): Genre Identifier of the Genre.
 - Name (Name): Name representing the music genre.


--- Chunk 4 ---
Table: Artist (Artist)
Column: Name (Name)
Type: NVARCHAR(120)
Description: Name related to the artist or band.
Primary Key: No
Business Term Tagged: No


--- Chunk 5 ---
Table: Genre (Genre)
Column: Genre Identifier (GenreId)
Type: INTEGER
Description: Genre 

In [124]:
for chunk in schema_chunks:
    print(chunk)

Table: Album (Album)
Description: Contains information related to Album.
Columns:
 - Album Identifier (AlbumId): Album Identifier of the Album.
 - Title (Title): Title related to the album.
 - Artist Identifier (ArtistId): Artist Identifier of the Album.
Relationships:
 - ArtistId → Artist.ArtistId

Table: Track (Track)
Column: UnitPrice (UnitPrice)
Type: NUMERIC(10,2)
Description: UnitPrice of the track or song.
Primary Key: No
Business Term Tagged: No

Table: Genre (Genre)
Description: Contains information related to Genre.
Columns:
 - Genre Identifier (GenreId): Genre Identifier of the Genre.
 - Name (Name): Name representing the music genre.

Table: Artist (Artist)
Column: Name (Name)
Type: NVARCHAR(120)
Description: Name related to the artist or band.
Primary Key: No
Business Term Tagged: No

Table: Genre (Genre)
Column: Genre Identifier (GenreId)
Type: INTEGER
Description: Genre Identifier of the Genre.
Primary Key: Yes
Business Term Tagged: Yes

Table: Genre (Genre)
Column: Name

# Module 6


**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**

In [125]:
# Run this to set your API key securely
import os
os.environ["GROQ_API_KEY"] = "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"  # Replace this with your actual key


In [126]:
def get_system_prompt():
    return "You are a helpful assistant that generates SQL queries for a database using schema and business logic."

def get_sql_generation_prompt(user_question, schema_context, business_rules_context, examples=None):
    example_block = ""
    if examples:
        for q, sql in examples:
            example_block += f"Example Question: {q}\nExample SQL:\n{sql.strip()}\n\n"

    return f"""
You are a helpful assistant that generates SQL queries based on a user's question, the database schema, and relevant business rules.

Respond in this format:

Question:
{user_question}

Schema:
{schema_context}

Business Rules:
{business_rules_context}

SQL:
<Write the SQL query that answers the question>

Explanation:
<Briefly explain what the SQL query does in business terms>

Chart:
<Suggest a chart type suitable for visualizing the result>

Guidelines:
- Use only the tables and columns provided in the schema context.
- - Use the exact table and column names as shown in the schema context.
- Join tables only when there is a foreign key relationship.
- Apply business rules exactly as described when calculating KPIs or metrics.
- Use clear, aliased column names suitable for visualization.
- Avoid guessing any data model structures not present in the context.
- First, return the SQL code.
- Then, explain what the query does in plain English.
- If foreign key relationships are given in the schema, use them when writing JOIN clauses. Do not guess or write JOIN 1=1.
- Use only the exact column names shown in parentheses in the schema (e.g., "BillingCity"), not the business names.
- This is a SQLite database.
- Do not use SQL Server-specific functions like GETDATE(), DATEDIFF(), or YEAR().
- Use SQLite-compatible date functions such as julianday(), date(), and CURRENT_DATE.
- Use julianday() and CURRENT_DATE for date operations if needed.
- Always use InvoiceLine for revenue-related calculations
- For any revenue-related question, ALWAYS use InvoiceLine.UnitPrice * InvoiceLine.Quantity.
- Never use Track.UnitPrice unless the user specifically asks for "list price" or "track price."
- To calculate total revenue by artist, you must join: Artist → Album → Track → InvoiceLine.
- Do not invent column names. Use only those shown in the schema
- Whenever revenue needs to be calculated, ALWAYS use InvoiceLine table


""".strip()



In [127]:
#Module 8 - Self-healing prompt builder

def get_self_healing_prompt(user_question, original_sql, error_message, schema_context=""):
    return f"""
You are an expert SQL assistant. A SQL query was generated for the question below but it caused an error during execution.

Question:
{user_question}

Original SQL:
{original_sql}

SQL Error Message:
{error_message}

Schema:
{schema_context}

Your task:
- Analyze the original SQL and the error.
- Regenerate a corrected SQL query using **only** tables and columns from the schema above.
- Provide a brief explanation of the fix.
- Suggest a chart to visualize the result if applicable.
- Regenerate a corrected SQL query using **only** tables and columns from the schema above.
- This is a SQLite database.
- Do not use SQL Server-specific functions like GETDATE(), DATEDIFF(), or YEAR().
- Use SQLite-compatible date functions such as julianday(), date(), and CURRENT_DATE.
- This is a SQLite database.
- Use julianday() and CURRENT_DATE for date operations if needed.
- If the error message mentions a column that doesn't exist, double-check the schema chunk above for the correct name and replace it. Do not repeat invalid column names in the fixed query.
- If a column error is detected, always refer to the exact column names under 'Column:' from the schema chunk. For example, if the schema lists 'Name' and not 'Title', use 'Name'.
- Always use InvoiceLine for revenue-related calculations
- For any revenue-related question, ALWAYS use InvoiceLine.UnitPrice * InvoiceLine.Quantity.
- Never use Track.UnitPrice unless the user specifically asks for "list price" or "track price."
- To calculate total revenue by artist, you must join: Artist → Album → Track → InvoiceLine.
- Do not invent column names. Use only those shown in the schema
- Whenever revenue needs to be calculated, ALWAYS use InvoiceLine table





Respond in this format:

SQL:
<Corrected SQL>

Explanation:
<Brief fix explanation>

Chart:
<Chart suggestion>
""".strip()


In [128]:
import os
from groq import Groq

# Setup Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Example retrieved data
user_question = biz_question
schema_chunks = list(set(doc.page_content for doc in (table_docs + column_docs)))
business_chunks = list(set(doc.page_content for doc in genbi_docs)) if genbi_vectorstore else []
example_qa_pairs = [
    ("List all products and their prices", "SELECT Name, ListPrice FROM Production.Product;"),
    ("How many orders per customer?", "SELECT CustomerID, COUNT(*) FROM Sales.SalesOrderHeader GROUP BY CustomerID;")
]

# Build the prompt
system_prompt = get_system_prompt()
user_prompt = get_sql_generation_prompt(
    user_question=user_question,
    schema_context="\n".join(schema_chunks),
    business_rules_context="\n".join(business_chunks),
    examples=example_qa_pairs
)

# Groq model call
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    model="gemma2-9b-it"  # You can change to llama-3.3-70b-versatile later
)

# Output
print("Groq Response:")
print(chat_completion.choices[0].message.content)


Groq Response:
```sql
SELECT g.Name AS Genre, SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM Genre g
JOIN Track t ON g.GenreId = t.GenreId
JOIN InvoiceLine il ON t.TrackId = il.TrackId
GROUP BY g.Name
ORDER BY TotalRevenue DESC;
```

**Explanation:**

This query calculates the total revenue generated by each music genre. 

It joins the `Genre`, `Track`, and `InvoiceLine` tables based on their relationships. It then groups the results by genre (`g.Name`) and calculates the sum of revenue for each genre using `SUM(il.UnitPrice * il.Quantity)`. Finally, it orders the results in descending order of total revenue.


**Chart:**

A bar chart would be suitable for visualizing this result. The x-axis would represent the genre names, and the y-axis would represent the total revenue for each genre. 



# Module 7

In [129]:
import re
import sqlite3
import json

# --- STEP 1: Extract structured sections from the LLM response ---

def parse_llm_response(response_text):
    sections = {"sql": None, "explanation": None, "chart": None}

    # Extract SQL block with or without "SQL:" label
    sql_match = re.search(r"```sql(.*?)```", response_text, re.DOTALL | re.IGNORECASE)
    if not sql_match:
        # fallback: look for first SELECT statement
        sql_match = re.search(r"(SELECT[\s\S]+?;)", response_text, re.IGNORECASE)

    sections["sql"] = sql_match.group(1).strip() if sql_match else None

    # Extract Explanation
    explanation_match = re.search(r"(?i)Explanation:\s*(.*?)(?:\nChart:|$)", response_text, re.DOTALL)
    sections["explanation"] = explanation_match.group(1).strip() if explanation_match else None

    # Extract Chart suggestion
    chart_match = re.search(r"(?i)Chart:\s*(.*)", response_text)
    sections["chart"] = chart_match.group(1).strip() if chart_match else None

    return sections

# --- STEP 2: SQL syntax check using sqlite3 ---

def check_sql_syntax(sql_query):
    try:
        conn = sqlite3.connect(db_path)
        conn.execute("EXPLAIN QUERY PLAN " + sql_query)
        return True, None
    except sqlite3.Error as e:
        return False, str(e)
    finally:
        conn.close()


# --- STEP 3: Reference check against JSON schema ---

def validate_sql_references(sql_query, schema_json_path="/content/schema_metadata_enriched_with_tags.json"):
    with open(schema_json_path, "r") as f:
        schema = json.load(f)

    valid_tables = {t["table_name"].lower(): {col["column_name"].lower() for col in t["columns"]} for t in schema}
    found_tables = set()
    found_columns = set()

    tokens = re.findall(r"\b\w+\b", sql_query.lower())
    for token in tokens:
        if token in valid_tables:
            found_tables.add(token)
        else:
            for table, columns in valid_tables.items():
                if token in columns:
                    found_columns.add(token)

    missing_tables = set()
    missing_columns = set()


    # Basic table and column reference checks
    for token in re.findall(r"from\s+(\w+)|join\s+(\w+)", sql_query.lower()):
        table = next(filter(None, token))
        if table and table not in valid_tables:
            missing_tables.add(table)

    for col in found_columns:
        col_in_schema = any(col in cols for cols in valid_tables.values())
        if not col_in_schema:
            missing_columns.add(col)

    is_valid = not missing_tables and not missing_columns
    return is_valid, missing_tables, missing_columns



In [130]:
# 1. Extract LLM output text from Module 6 response
llm_response = chat_completion.choices[0].message.content

# 2. Feed it directly into your Module 7 validator
parsed = parse_llm_response(llm_response)

# 3. Validate syntax
if parsed["sql"]:
    syntax_ok, syntax_error = check_sql_syntax(parsed["sql"])
    ref_ok, missing_tables, missing_columns = validate_sql_references(parsed["sql"])

    print("Syntax OK" if syntax_ok else f"Syntax Error: {syntax_error}")
    print("Schema Check Passed" if ref_ok else f"Missing: Tables={missing_tables}, Columns={missing_columns}")

else:
    print(" No SQL block found in LLM response.")

# Print explanation and chart regardless
print("\nExplanation:\n", parsed["explanation"] or "No explanation found.")
print("\nChart Suggestion:\n", parsed["chart"] or "No chart suggestion found.")




Syntax OK
Schema Check Passed

Explanation:
 **

This query calculates the total revenue generated by each music genre. 

It joins the `Genre`, `Track`, and `InvoiceLine` tables based on their relationships. It then groups the results by genre (`g.Name`) and calculates the sum of revenue for each genre using `SUM(il.UnitPrice * il.Quantity)`. Finally, it orders the results in descending order of total revenue.


**Chart:**

A bar chart would be suitable for visualizing this result. The x-axis would represent the genre names, and the y-axis would represent the total revenue for each genre.

Chart Suggestion:
 **


In [131]:
parsed = parse_llm_response(llm_response)

# 🔍 Print the parsed SQL
print("📦 Parsed SQL Block:")
print(parsed["sql"] or " No SQL found.")

📦 Parsed SQL Block:
SELECT g.Name AS Genre, SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM Genre g
JOIN Track t ON g.GenreId = t.GenreId
JOIN InvoiceLine il ON t.TrackId = il.TrackId
GROUP BY g.Name
ORDER BY TotalRevenue DESC;


In [132]:
import sqlite3
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print([row[0] for row in cursor.fetchall()])
conn.close()

['Album', 'Artist', 'Customer', 'Employee', 'Genre', 'Invoice', 'InvoiceLine', 'MediaType', 'Playlist', 'PlaylistTrack', 'Track']


# MOdule 8

In [133]:
import sqlite3

def execute_sql_with_self_heal(original_sql, user_question, schema_chunks, groq_client, model="gemma2-9b-it"):
    try:
        # Try executing the original SQL first
        conn = sqlite3.connect(db_path)
        conn.execute("EXPLAIN QUERY PLAN " + original_sql)
        conn.close()

        print("SQL executed successfully. No healing needed.")
        return {
            "final_sql": original_sql,
            "was_healed": False,
            "error_before_healing": None,
            "explanation": None,
            "chart": None,
            "status": "original_executed_successfully"
        }

    except sqlite3.Error as e:
        error_message = str(e)
        print(f"Execution error: {error_message}")
        print("Triggering LLM-based self-healing...")

        # Build retry prompt
        schema_context = "\n".join(schema_chunks)
        retry_prompt = get_self_healing_prompt(
            user_question=user_question,
            original_sql=original_sql,
            error_message=error_message,
            schema_context=schema_context
        )

        # Send to Groq
        retry_response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an expert SQL assistant that fixes broken queries."},
                {"role": "user", "content": retry_prompt}
            ],
            model=model
        )

        retry_text = retry_response.choices[0].message.content
        print("LLM Retry Output:\n", retry_text)

        #  Parse output
        parsed_retry = parse_llm_response(retry_text)
        if parsed_retry["sql"]:
            syntax_ok, _ = check_sql_syntax(parsed_retry["sql"])
            ref_ok, _, _ = validate_sql_references(parsed_retry["sql"])

            if syntax_ok and ref_ok:
                print(" Healed SQL passed validation.")
                return {
                    "final_sql": parsed_retry["sql"],
                    "was_healed": True,
                    "error_before_healing": error_message,
                    "explanation": parsed_retry.get("explanation"),
                    "chart": parsed_retry.get("chart"),
                    "status": "healed"
                }

        print("Healing failed.")
        return {
            "final_sql": None,
            "was_healed": True,
            "error_before_healing": error_message,
            "explanation": None,
            "chart": None,
            "status": "healing_failed"
        }


In [134]:
result = execute_sql_with_self_heal(
    original_sql=parsed["sql"],
    user_question=biz_question,
    schema_chunks=schema_chunks,
    groq_client=client
)

print("Final SQL:", result["final_sql"])
print("Explanation:", result["explanation"])
print("Chart Suggestion:", result["chart"])
print("Status:", result["status"])


SQL executed successfully. No healing needed.
Final SQL: SELECT g.Name AS Genre, SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM Genre g
JOIN Track t ON g.GenreId = t.GenreId
JOIN InvoiceLine il ON t.TrackId = il.TrackId
GROUP BY g.Name
ORDER BY TotalRevenue DESC;
Explanation: None
Chart Suggestion: None
Status: original_executed_successfully


Combining both module 7 and 8

In [135]:
def validate_then_execute_with_optional_heal(sql_text, user_question, schema_chunks, groq_client, db_path, model="gemma2-9b-it"):
    print(" Validating SQL in Module 7:")
    print(sql_text)

    syntax_ok, syntax_error = check_sql_syntax(sql_text)
    ref_ok, missing_tables, missing_columns = validate_sql_references(sql_text)

    print("Syntax OK:" if syntax_ok else f"Syntax error: {syntax_error}")
    print("Schema OK:" if ref_ok else f"Missing: Tables={missing_tables}, Columns={missing_columns}")

    if not syntax_ok or not ref_ok:
        return {
            "final_sql": None,
            "was_healed": False,
            "error_before_healing": syntax_error or f"Missing: {missing_tables}, {missing_columns}",
            "explanation": None,
            "chart": None,
            "status": "module_7_validation_failed"
        }

    # Step 2: Try execution
    try:
        conn = sqlite3.connect(db_path)
        conn.execute("EXPLAIN QUERY PLAN " + sql_text)
        conn.close()
        return {
            "final_sql": sql_text,
            "was_healed": False,
            "error_before_healing": None,
            "explanation": None,
            "chart": None,
            "status": "original_executed_successfully"
        }

    except sqlite3.Error as e:
        error_message = str(e)
        print(f"Execution error: {error_message}")

        # Step 3: Heal
        fixed_sql, retry_output = execute_sql_with_self_heal(
            original_sql=sql_text,
            user_question=user_question,
            schema_chunks=schema_chunks,
            groq_client=groq_client,
            model=model
        )

        if fixed_sql:
            if fixed_sql.strip() != sql_text.strip():
                print("Healing Success: SQL was modified")
                print("\nOriginal SQL:\n", sql_text)
                print("\nHealed SQL:\n", fixed_sql)

            return {
                "final_sql": fixed_sql,
                "was_healed": True,
                "error_before_healing": error_message,
                "explanation": retry_output.get("explanation") if retry_output else None,
                "chart": retry_output.get("chart") if retry_output else None,
                "status": "healed"
            }

        return {
            "final_sql": None,
            "was_healed": True,
            "error_before_healing": error_message,
            "explanation": None,
            "chart": None,
            "status": "healing_failed"
        }


In [136]:
"""import json
with open("/content/schema_metadata_enriched_with_tags.json") as f:
    schema = json.load(f)

table_names = [t["table_name"] for t in schema]
print("📋 Tables extracted from schema:\n", table_names)"""

'import json\nwith open("/content/schema_metadata_enriched_with_tags.json") as f:\n    schema = json.load(f)\n\ntable_names = [t["table_name"] for t in schema]\nprint("📋 Tables extracted from schema:\n", table_names)'

In [137]:
"""for table in schema:
    print(f"\n🧾 Table: {table['table_name']}")
    print("Columns:", [col["column_name"] for col in table["columns"]])"""

'for table in schema:\n    print(f"\n🧾 Table: {table[\'table_name\']}")\n    print("Columns:", [col["column_name"] for col in table["columns"]])'

Module 9 Plan with Knowledge Base Placeholder
We’ll:

Retrieve the business chunks if the knowledge base (genbi_vectorstore) is available

Else, just pass an empty list or placeholder string

Prompt the LLM to generate a clear, business-friendly insight from the SQL + original question

In [138]:
def generate_sql_insight(user_question, final_sql, genbi_vectorstore=None, groq_client=None, model="gemma2-9b-it"):
    # Step 1: Try to retrieve relevant business rules (if knowledge base is enabled)
    if genbi_vectorstore:
        genbi_docs = genbi_vectorstore.as_retriever(search_kwargs={"k": 3}).get_relevant_documents(user_question)
        business_chunks = list(set(doc.page_content for doc in genbi_docs))
    else:
        business_chunks = ["<Business rules are not currently available.>"]

    # Step 2: Build the explanation prompt
    prompt = f"""
You are a helpful business assistant.

User Question:
{user_question}

SQL Query:
{final_sql}

Business Rules:
{chr(10).join(business_chunks)}

Your tasks:
- Summarize the kind of insight or KPI this query would produce
- Suggest 1–2 follow-up business questions based on this data

Respond in this format:

Explanation:
<Your plain-language explanation>

Insight Summary:
<What insight or KPI this generates>

Follow-up Questions:
- ...
- ...
""".strip()

    # Step 3: Call Groq LLM
    response = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are an expert in translating SQL queries into business insights."},
            {"role": "user", "content": prompt}
        ],
        model=model
    )

    # Step 4: Return LLM Output
    return response.choices[0].message.content


In [139]:
insight_text = generate_sql_insight(
    user_question=biz_question,
    final_sql=result["final_sql"],  # from Module 8
    genbi_vectorstore=genbi_vectorstore,  # or None
    groq_client=client
)

print("Business Insight:\n")
print(insight_text)


Business Insight:

## Explanation:

This SQL query calculates the total revenue generated for each genre in a music store's catalog. 

It does this by:

1. **Joining multiple tables:** It combines information from the "Genre", "Track", and "InvoiceLine" tables to link genres with specific tracks and then with purchase information.
2. **Calculating revenue for each track:** It multiplies the unit price and quantity of each track sold to get the revenue for that individual sale.
3. **Summing revenue per genre:** It groups the results by genre and calculates the total revenue generated by tracks belonging to each genre.
4. **Ordering by revenue:** Finally, it orders the results in descending order based on the total revenue, showing the genres that generate the most revenue first.

Insight Summary:

This query generates a KPI called **Genre Revenue Breakdown**, which shows the total revenue contributions of each music genre.

Follow-up Questions:

- Which genres have the highest and lowes

Module 10

In [140]:
def extract_chart_type_from_text(text):
    if not text:
        return "auto"
    text = text.lower()
    if "bar" in text:
        return "bar"
    elif "line" in text or "trend" in text:
        return "line"
    elif "pie" in text:
        return "pie"
    elif "scatter" in text:
        return "scatter"
    else:
        return "auto"


In [141]:
import sqlite3
import pandas as pd
import plotly.express as px

def generate_chart_from_sql(final_sql, db_path="/content/Chinook_Sqlite.sqlite", chart_type="auto", chart_title=None):
    # Step 1: Run the SQL and load DataFrame
    try:
        conn = sqlite3.connect(db_path)
        df = pd.read_sql_query(final_sql, conn)
        conn.close()
    except Exception as e:
        print(f"Failed to execute SQL: {e}")
        return None, None

    # Step 2: Show DataFrame
    print("Data Preview:")
    display(df)

    if df.empty or df.shape[1] < 2:
        print("Not enough data to generate a chart.")
        return df, None

    # Step 3: Auto chart type selection
    if chart_type == "auto":
        if df.shape[1] == 2:
            chart_type = "bar"
        elif "date" in df.columns[0].lower() or "year" in df.columns[0].lower():
            chart_type = "line"
        else:
            chart_type = "bar"

    # Step 4: Generate chart using Plotly
    fig = None
    x = df.columns[0]
    y = df.columns[1]

    try:
        if chart_type == "bar":
            fig = px.bar(df, x=x, y=y, title=chart_title or f"{y} by {x}")
        elif chart_type == "line":
            fig = px.line(df, x=x, y=y, title=chart_title or f"{y} over {x}")
        elif chart_type == "pie":
            fig = px.pie(df, names=x, values=y, title=chart_title or f"{y} by {x}")
        elif chart_type == "scatter":
            fig = px.scatter(df, x=x, y=y, title=chart_title or f"{y} vs {x}")
        else:
            print(f"Unsupported chart type: {chart_type}")
    except Exception as e:
        print(f"Chart generation error: {e}")

    # Step 5: Show chart if available
    if fig:
        fig.show()

    return df, fig


In [142]:
chart_type = extract_chart_type_from_text(result.get("chart", ""))  # From Module 8/9

df, fig = generate_chart_from_sql(
    final_sql=result["final_sql"],
    db_path="/content/Chinook_Sqlite.sqlite",
    chart_type=chart_type,
    chart_title=biz_question
)


Data Preview:


Unnamed: 0,Genre,TotalRevenue
0,Rock,826.65
1,Latin,382.14
2,Metal,261.36
3,Alternative & Punk,241.56
4,TV Shows,93.53
5,Jazz,79.2
6,Blues,60.39
7,Drama,57.71
8,R&B/Soul,40.59
9,Classical,40.59


Full Pipeline

In [143]:
def run_full_pipeline(user_question, groq_client, table_vectorstore, column_vectorstore, genbi_vectorstore=None, db_path=db_path, model="gemma2-9b-it"):
    # Step 1: Retrieve schema chunks
    table_docs = table_vectorstore.as_retriever(search_kwargs={"k": 6}).get_relevant_documents(user_question)
    column_docs = column_vectorstore.as_retriever(search_kwargs={"k": 6}).get_relevant_documents(user_question)
    schema_chunks = list(set(doc.page_content for doc in (table_docs + column_docs)))

    # Step 2: Retrieve business rules (if available)
    if genbi_vectorstore:
        genbi_docs = genbi_vectorstore.as_retriever(search_kwargs={"k": 3}).get_relevant_documents(user_question)
        business_chunks = list(set(doc.page_content for doc in genbi_docs))
    else:
        business_chunks = ["<Business rules not available>"]

    # Step 3: Generate SQL (Module 6)
    user_prompt = get_sql_generation_prompt(
        user_question=user_question,
        schema_context="\n".join(schema_chunks),
        business_rules_context="\n".join(business_chunks),
        examples=[]
    )
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": get_system_prompt()},
            {"role": "user", "content": user_prompt}
        ],
        model=model
    )
    llm_response = chat_completion.choices[0].message.content
    parsed = parse_llm_response(llm_response)

    # Step 4: Validate and heal if needed (Modules 7 + 8)
    validation_result = validate_then_execute_with_optional_heal(
        sql_text=parsed["sql"],
        user_question=user_question,
        schema_chunks=schema_chunks,
        groq_client=groq_client,
        db_path=db_path,
        model=model
    )

    final_sql = validation_result["final_sql"]
    if not final_sql:
      return {
        "question": user_question,
        "final_sql": None,
        "was_healed": validation_result["was_healed"],
        "syntax_check_passed": False,
        "schema_check_passed": False,
        "explanation": validation_result.get("explanation"),
        "insight": None,
        "dataframe": None,
        "chart": None,
        "status": validation_result["status"]
    }

    # Step 5: Insight generation (Module 9)
    insight_text = generate_sql_insight(
        user_question=user_question,
        final_sql=final_sql,
        genbi_vectorstore=genbi_vectorstore,
        groq_client=groq_client
    )

    # Step 6: Chart type inference
    chart_type = extract_chart_type_from_text(validation_result.get("chart", ""))

    # Step 7: Visualization + DataFrame (Module 10)
    df, fig = generate_chart_from_sql(
        final_sql=final_sql,
        db_path=db_path,
        chart_type=chart_type,
        chart_title=user_question
    )

    # Final UI-ready result
    return {
        "question": user_question,
        "final_sql": final_sql,
        "was_healed": validation_result["was_healed"],
        "syntax_check_passed": True,
        "schema_check_passed": True,
        "explanation": validation_result.get("explanation"),
        "insight": insight_text,
        "dataframe": df,
        "chart": fig
    }


In [144]:
result = run_full_pipeline(
    user_question = biz_question,
    groq_client=client,
    table_vectorstore=table_vectorstore,
    column_vectorstore=column_vectorstore,
    genbi_vectorstore=genbi_vectorstore  # or None
)

# Example output usage
print("Question:", result["question"])
print("Final SQL:", result["final_sql"])
print("Healed:", result["was_healed"])
print(" Syntax Check:", result["syntax_check_passed"])
print("Schema Check:", result["schema_check_passed"])
print("explanation:", result["explanation"])
print("Insight:", result["insight"])
display(result["dataframe"])

 Validating SQL in Module 7:
SELECT g.Name AS Genre, SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM Genre g
JOIN Track t ON g.GenreId = t.GenreId
JOIN InvoiceLine il ON t.TrackId = il.TrackId
GROUP BY g.Name;
Syntax OK:
Schema OK:
Data Preview:


Unnamed: 0,Genre,TotalRevenue
0,Alternative,13.86
1,Alternative & Punk,241.56
2,Blues,60.39
3,Bossa Nova,14.85
4,Classical,40.59
5,Comedy,17.91
6,Drama,57.71
7,Easy Listening,9.9
8,Electronica/Dance,11.88
9,Heavy Metal,11.88


Question: What is the total revenue generated by each genre?
Final SQL: SELECT g.Name AS Genre, SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM Genre g
JOIN Track t ON g.GenreId = t.GenreId
JOIN InvoiceLine il ON t.TrackId = il.TrackId
GROUP BY g.Name;
Healed: False
 Syntax Check: True
Schema Check: True
explanation: None
Insight: ## Explanation:
This SQL query combines data from different tables to calculate the total revenue generated by each music genre. 

It joins information about genres (`Genre` table), tracks (`Track` table), and invoice lines (`InvoiceLine` table) that contain purchase details.  

The query sums up the revenue for each genre by multiplying the unit price and quantity of each track sold and then grouping the results by genre name.

## Insight Summary:
This query generates a report on the **revenue breakdown by music genre**.

## Follow-up Questions:
- **Which genre has the highest revenue and which has the lowest?** This will highlight the most and least pr

Unnamed: 0,Genre,TotalRevenue
0,Alternative,13.86
1,Alternative & Punk,241.56
2,Blues,60.39
3,Bossa Nova,14.85
4,Classical,40.59
5,Comedy,17.91
6,Drama,57.71
7,Easy Listening,9.9
8,Electronica/Dance,11.88
9,Heavy Metal,11.88


In [145]:
#!pip install -q gradio groq plotly


In [146]:
"""import gradio as gr

def gradio_bi_pipeline(user_question):
    result = run_full_pipeline(
        user_question=user_question,
        groq_client=client,
        table_vectorstore=table_vectorstore,
        column_vectorstore=column_vectorstore,
        genbi_vectorstore=genbi_vectorstore  # or None if not available
    )

    sql = result["final_sql"] or "❌ No valid SQL generated."
    explanation = result["explanation"] or "❌ No explanation provided."
    insight = result["insight"] or "❌ No business insight available."

    # Convert DataFrame to HTML
    df_html = result["dataframe"].to_html() if result["dataframe"] is not None else "❌ No data returned."

    return sql, explanation, insight, df_html, result["chart"]"""


'import gradio as gr\n\ndef gradio_bi_pipeline(user_question):\n    result = run_full_pipeline(\n        user_question=user_question,\n        groq_client=client,\n        table_vectorstore=table_vectorstore,\n        column_vectorstore=column_vectorstore,\n        genbi_vectorstore=genbi_vectorstore  # or None if not available\n    )\n\n    sql = result["final_sql"] or "❌ No valid SQL generated."\n    explanation = result["explanation"] or "❌ No explanation provided."\n    insight = result["insight"] or "❌ No business insight available."\n\n    # Convert DataFrame to HTML\n    df_html = result["dataframe"].to_html() if result["dataframe"] is not None else "❌ No data returned."\n\n    return sql, explanation, insight, df_html, result["chart"]'

In [147]:
"""gr.Interface(
    fn=gradio_bi_pipeline,
    inputs=gr.Textbox(label="Ask your business question"),
    outputs=[
        gr.Textbox(label="Final SQL"),
        gr.Textbox(label="SQL Explanation"),
        gr.Textbox(label="Business Insight"),
        gr.HTML(label="Query Result DataFrame"),
        gr.Plot(label="Chart")
    ],
    title="BI Assistant",
    description="Enter a question about your data. This assistant will generate SQL, explain it, and visualize the results."
).launch(debug=True, share=True)
"""

'gr.Interface(\n    fn=gradio_bi_pipeline,\n    inputs=gr.Textbox(label="Ask your business question"),\n    outputs=[\n        gr.Textbox(label="Final SQL"),\n        gr.Textbox(label="SQL Explanation"),\n        gr.Textbox(label="Business Insight"),\n        gr.HTML(label="Query Result DataFrame"),\n        gr.Plot(label="Chart")\n    ],\n    title="BI Assistant",\n    description="Enter a question about your data. This assistant will generate SQL, explain it, and visualize the results."\n).launch(debug=True, share=True)\n'