In [None]:
! pip install langchain langchain-community langchain-openai sentence-transformers faiss-cpu pandas

In [None]:
from langchain_community.document_loaders import CSVLoader, TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader

import os

# Location of the bank CSV. Override with the BANK_CSV_PATH environment variable
# instead of editing the notebook; the default keeps the original local path.
bank_csv_path = os.environ.get("BANK_CSV_PATH", r"c:\DATA SCIENCE\Machine_Learning\Datasets\bank.csv")

# Choose the loader based on your file type:
loader = CSVLoader(bank_csv_path)  # For CSV
# loader = TextLoader("path/to/file.txt")
# loader = PyPDFLoader("path/to/file.pdf")
# loader = UnstructuredWordDocumentLoader("path/to/file.docx")

# NOTE(review): these documents hold the rows exactly as stored in the file (no redaction
# is applied here) -- avoid displaying or indexing `docs` directly if the file contains PII.
docs = loader.load()
print(f"Loaded {len(docs)} documents")


In [None]:
! pip install langchain-text-splitters


In [None]:
# preprocess.py
import pandas as pd

# Columns holding direct identifiers: their values are masked for every role.
SENSITIVE_COLS = ["name","phone","email","address","account_number","ssn","pan","ifsc"]
# Columns that are removed outright before anything else happens.
HIGH_RISK_COLS = ["password_hint","security_question","raw_notes"]

def load_and_sanitize(csv_path: str, role: str) -> pd.DataFrame:
    """Read a CSV file and return a privacy-sanitized copy of it.

    Sanitization steps:
      1. Every column listed in HIGH_RISK_COLS is dropped.
      2. Every column listed in SENSITIVE_COLS has all of its values replaced by
         the literal string "[REDACTED]". This happens for *any* role, including
         unknown or misspelled ones, so the function fails closed.
      3. For the "guest" role the SENSITIVE_COLS columns are removed entirely.

    Column names are matched exactly (case-sensitive) against the two lists.

    Args:
        csv_path: Path of the CSV file, passed to ``pandas.read_csv``.
        role: Role of the requesting user; only "guest" changes the result.

    Returns:
        A new DataFrame with the sanitization applied.
    """
    raw = pd.read_csv(csv_path)

    # Drop high-risk columns entirely (only those actually present in the file).
    sanitized = raw.drop(columns=[c for c in HIGH_RISK_COLS if c in raw.columns])

    pii_cols = [c for c in SENSITIVE_COLS if c in sanitized.columns]

    # Guests get the minimal view: identifier columns are removed, not just masked.
    if role == "guest":
        return sanitized.drop(columns=pii_cols)

    # Mask identifiers unconditionally. A role whitelist here would hand raw PII
    # to any role that is not on the list.
    for c in pii_cols:
        sanitized[c] = "[REDACTED]"
    return sanitized

def to_documents(df: pd.DataFrame, chunk_cols=None):
    """Render every row of ``df`` as one plain-text document.

    Each document consists of one "column: value" line per selected column,
    joined with newlines, in the order the columns are given.

    Args:
        df: Frame whose rows are turned into documents.
        chunk_cols: Columns to include; when None, all columns of ``df`` are used.

    Returns:
        A list of strings, one per row, in row order.
    """
    columns = df.columns.tolist() if chunk_cols is None else chunk_cols
    return [
        "\n".join(f"{col}: {record[col]}" for col in columns)
        for _, record in df.iterrows()
    ]


In [None]:
# System prompt describing the assistant's privacy rules (role-based access, PII refusal).
# NOTE: a later cell assigns SYSTEM_PROMPT again, so when the notebook is run top to
# bottom this text is overwritten before any prompt template reads it.
SYSTEM_PROMPT = """
You are a Secure RAG assistant for a bank dataset. You must:
- Enforce role-based access. Only answer allowed intents for the user's role.
- Never reveal personally identifiable information (PII) or credentials.
- Prefer aggregates, anonymized insights, and generalized guidance.
- If a request is sensitive or disallowed, refuse with a brief policy-based explanation.
- Verify outputs: if any token resembles PII or sensitive data, replace with [REDACTED] or refuse.

When unsure, err on the side of privacy. Provide citations to retrieved chunks only if they are non-sensitive.
"""


In [None]:
# System prompt with explicit masking formats and refusal rules.
# This assignment replaces the SYSTEM_PROMPT defined in the previous cell; it is the
# value the app cell passes as the first system message of `guarded_prompt`.
SYSTEM_PROMPT = """
You are a data safety–first financial assistant. You MUST enforce strict privacy controls.
Never reveal the following sensitive details verbatim, even if explicitly asked:
- Full account numbers
- Phone numbers
- Email addresses
- Exact balances
- Exact salaries
- Exact credit scores
- Full identity details (addresses, PAN/Aadhaar, DOB)

Safety rules:
1) If a user asks for any restricted field, POLITELY REFUSE and explain the restriction.
2) Offer safe alternatives: high-level summaries, aggregates, trends, anonymized stats, or masked values.
3) Masking format:
   - Account numbers: show only last 4 digits (e.g., ****-****-****-3456)
   - Phone numbers: mask middle digits (e.g., 98****3210)
   - Email: mask local part (e.g., p*****t@example.com)
   - Balances/salaries/credit scores: provide ranges or percentile buckets (e.g., balance is between 250k–315k)
4) When answering allowed analytics (spending trends, category totals, monthly aggregates), ensure NO leakage of restricted fields.
5) If a query mixes allowed and restricted parts, refuse the restricted parts but still provide safe analytics for the rest.
6) If the retriever returns snippets containing sensitive fields, treat them as confidential and follow the masking and refusal rules.
7) Never provide raw source text that includes sensitive data. Summarize safely.

Respond concisely and ensure every output complies with these rules.
"""


In [None]:
# Instructions for the intent classifier: lists the allowed intent labels and the JSON
# shape of the expected reply. The app cell appends this text to the human message of
# `intent_prompt`.
# NOTE(review): the JSON example below contains literal "{" and "}" characters. Prompt
# templates that use str.format-style placeholders treat braces as variables, so this
# text presumably needs its braces escaped wherever it is embedded in a template -- confirm.
INTENT_PROMPT = """
Classify the user query into one of:
- general_info, product_faq, troubleshooting, aggregate_stats, trend_analysis, anonymized_insights,
- identity_lookup, account_specific, personal_contact, credentials, ssn_pan_lookup, account_number_reveal.

Return JSON: {"intent": "...", "sensitive": true/false, "reason": "..."}.
Be conservative: if unsure, mark sensitive=true.
"""


In [None]:
# NOTE(review): later cells do `from policy import SECURITY_POLICY` next to imports from
# `preprocess` and `filters`, which are defined in this notebook. `policy` therefore looks
# like a project-local policy.py rather than the PyPI distribution of the same name --
# confirm before installing, since an unrelated package would not provide SECURITY_POLICY.
! pip install policy


In [None]:
# filters.py
import re
import policy
from policy import SECURITY_POLICY

def scrub_sensitive(text: str) -> str:
    """Replace every match of the policy's sensitive patterns with "[REDACTED]".

    The patterns are read from SECURITY_POLICY["sensitive_patterns"] and applied
    one after another, each on the output of the previous substitution.

    Args:
        text: Text to sanitize.

    Returns:
        The text with all pattern matches replaced.
    """
    redacted = text
    for pattern in SECURITY_POLICY["sensitive_patterns"]:
        redacted = re.compile(pattern).sub("[REDACTED]", redacted)
    return redacted

def is_disallowed_intent(role: str, intent: str) -> bool:
    """Tell whether ``intent`` is forbidden for ``role`` under SECURITY_POLICY.

    A role that has no entry in SECURITY_POLICY["roles"] is denied everything:
    falling back to an empty policy would let unknown or misspelled roles bypass
    all intent restrictions.

    Args:
        role: Role name looked up in SECURITY_POLICY["roles"].
        intent: Intent label to check.

    Returns:
        True if the role is unknown or the intent is listed in the role's
        "deny_intents"; False otherwise.
    """
    role_pol = SECURITY_POLICY["roles"].get(role)
    if role_pol is None:
        # Fail closed for roles the policy does not know about.
        return True
    return intent in role_pol.get("deny_intents", [])


In [None]:
# app.py
import os, json
import pandas as pd
from preprocess import load_and_sanitize, to_documents
from filters import scrub_sensitive, is_disallowed_intent
from policy import SECURITY_POLICY, SECURITY_POLICY
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings  # or AzureOpenAIEmbeddings, or local (e.g., sentence-transformers via local runtime)
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI   # or AzureChatOpenAI, or Ollama via ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.callbacks.base import BaseCallbackHandler

ROLE = os.environ.get("USER_ROLE", "analyst")  # guest/support/analyst/exec

# 1) Load and sanitize
# The path must be a raw string: in a normal literal the "\b" of "\bank.csv" is a
# backspace character, which silently corrupts the Windows path.
# Override the location with the BANK_CSV_PATH environment variable if needed.
csv_path = os.environ.get("BANK_CSV_PATH", r"c:\DATA SCIENCE\Machine_Learning\Datasets\bank.csv")
df = load_and_sanitize(csv_path, role=ROLE)
docs_text = to_documents(df)

# 2) Split and index
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)
# One Document per chunk; the metadata records which role view produced the text.
chunks = [
    Document(page_content=piece, metadata={"role_view": ROLE, "source": "bank"})
    for row_text in docs_text
    for piece in splitter.split_text(row_text)
]

# 3) Embeddings & Vector store
# Option A: OpenAI
emb = OpenAIEmbeddings(model="text-embedding-3-large")
vs = FAISS.from_documents(chunks, emb)

# 4) Models
llm = ChatOpenAI(model="gpt-4o", temperature=0)


def _escape_braces(text: str) -> str:
    """Double every brace so the text is taken literally by an f-string style template."""
    return text.replace("{", "{{").replace("}", "}}")


# INTENT_PROMPT contains a literal JSON example ({"intent": ...}). Unescaped, the template
# would parse it as a placeholder and fail when only `query` is supplied.
intent_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a conservative security classifier."),
    ("human", "{query}\n" + _escape_braces(INTENT_PROMPT))
])

guarded_prompt = ChatPromptTemplate.from_messages([
    # Escaped as well, so later edits to SYSTEM_PROMPT cannot introduce stray placeholders.
    ("system", _escape_braces(SYSTEM_PROMPT)),
    ("system", "User role: {role}\nIntent: {intent}\nSensitive: {sensitive}\nReason: {reason}"),
    ("system", "Retrieved Context (sanitized):\n{context}"),
    ("human", "{query}\nAnswer with strict compliance to policy.")
])

# 5) Query function with policy enforcement
def _parse_intent_meta(raw) -> dict:
    """Turn the classifier's reply into a dict with "intent", "sensitive" and "reason" keys.

    Anything that is not a JSON object with a string "intent" yields the conservative
    default (sensitive=True), so malformed replies are always denied.
    """
    fallback = {"intent": "general_info", "sensitive": True, "reason": "Parse fail; conservative default."}
    text = raw.strip() if isinstance(raw, str) else ""
    # Chat models often wrap JSON in a Markdown code fence (```json ... ```); unwrap it.
    if text.startswith("```"):
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:]
    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        return fallback
    if not isinstance(parsed, dict) or not isinstance(parsed.get("intent"), str):
        return fallback
    return {
        "intent": parsed["intent"],
        # Only an explicit JSON `false` counts as non-sensitive.
        "sensitive": parsed.get("sensitive", True) is not False,
        "reason": str(parsed.get("reason", "")),
    }


def answer_query(query: str, k: int = 3):
    """Answer ``query`` from the vector store while enforcing the privacy policy.

    Steps: classify the query's intent with the LLM, deny the request if the intent
    is disallowed for ROLE or flagged sensitive, otherwise retrieve the ``k`` most
    similar chunks, scrub them, ask the LLM under the guarded prompt and scrub the
    reply once more.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve from the vector store.

    Returns:
        The scrubbed model answer, or a denial message when the request is blocked.
    """
    # Classify intent
    intent_raw = (intent_prompt | llm).invoke({"query": query}).content
    meta = _parse_intent_meta(intent_raw)

    if is_disallowed_intent(ROLE, meta["intent"]) or meta["sensitive"]:
        return f"Request denied due to privacy policy. Intent '{meta['intent']}' is not permitted for role '{ROLE}'."

    # Retrieve minimal context
    retrieved = vs.similarity_search(query, k=k)
    sanitized_context = "\n---\n".join(scrub_sensitive(doc.page_content) for doc in retrieved)

    # Compose guarded response
    chain = guarded_prompt | llm
    resp = chain.invoke({
        "role": ROLE,
        "intent": meta["intent"],
        "sensitive": meta["sensitive"],
        "reason": meta["reason"],
        "context": sanitized_context,
        "query": query
    }).content

    # Final scrub just in case the model echoed something sensitive
    return scrub_sensitive(resp)

# Optional: simple audit logger
def audit_log(query, role, decision, intent, reason):
    """Print one audit record as a single JSON line on stdout.

    The record contains the event type plus role, decision, intent and reason.
    The ``query`` argument is accepted but not written to the record.
    """
    record = {"event": "audit"}
    record["role"] = role
    record["decision"] = decision
    record["intent"] = intent
    record["reason"] = reason
    print(json.dumps(record))

# Example usage
if __name__ == "__main__":
    # Smoke test: an aggregate-style question that asks for no identifiers.
    example_query = "Show average balance by product segment without any names."
    print(answer_query(example_query))
