In [3]:
!pip install sentence-transformers scikit-learn gradio --q

In [4]:

import sentence_transformers
import sklearn
import gradio as gr

# Smoke test for the install step: the imports above raise if a package is
# missing, and the printout records which versions this run actually uses.
print("All dependencies installed successfully!")
print("\n".join(
    f"   - {label}: {module.__version__}"
    for label, module in (
        ("sentence-transformers", sentence_transformers),
        ("scikit-learn", sklearn),
        ("gradio", gr),
    )
))

All dependencies installed successfully!
   - sentence-transformers: 5.2.2
   - scikit-learn: 1.6.1
   - gradio: 5.50.0


In [5]:
from google.colab import files
import json

# Single place for the expected file name; DatasetMatcher's default
# dataset_path uses the same name, so the upload must be called exactly this.
DATASET_FILENAME = 'bfsi_dataset_improved.json'

print(f"Upload your {DATASET_FILENAME} file...")
uploaded = files.upload()

# NOTE(review): if a file with this name already exists, Colab may store the
# new upload under a suffixed name, in which case this check fails - confirm.
if DATASET_FILENAME in uploaded:
    print("\n Dataset uploaded successfully!")

    # Read as UTF-8 explicitly: the answers contain non-ASCII characters and
    # DatasetMatcher opens this same file with encoding='utf-8'.
    with open(DATASET_FILENAME, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    print(f" - Total samples: {len(dataset)}")
    print("   - Format: Valid JSON ✓")
else:
    print(f" Please upload the dataset file named '{DATASET_FILENAME}'")

Upload your bfsi_dataset_improved.json file...


Saving bfsi_dataset_improved.json to bfsi_dataset_improved.json

 Dataset uploaded successfully!
 - Total samples: 103
   - Format: Valid JSON ✓


In [6]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Optional, List


class DatasetMatcher:
    """
    Tier 1: Dataset Similarity Matching
    Returns pre-defined responses for known queries

    Each dataset record must provide an 'instruction' (the known question)
    and an 'output' (the canned answer). All instructions are embedded once
    at construction time; every incoming query is embedded on demand and
    compared against them with cosine similarity.
    """

    # Maximum number of answer characters shown by find_top_k_matches.
    PREVIEW_CHARS = 100

    def __init__(self, dataset_path='bfsi_dataset_improved.json',
                 model_name='all-MiniLM-L6-v2'):
        """
        Load the dataset and pre-compute one embedding per instruction.

        Args:
            dataset_path: Path to a JSON file holding a list of records
            model_name: Sentence-transformers model used for all embeddings
        """
        print(" Initializing Dataset Matcher (Tier 1)...")

        with open(dataset_path, 'r', encoding='utf-8') as f:
            self.dataset = json.load(f)
        print(f"   ✓ Loaded {len(self.dataset)} conversation samples")

        print("   ✓ Loading sentence transformer model...")
        self.model = SentenceTransformer(model_name)

        self.dataset_instructions = [item['instruction'] for item in self.dataset]
        print("   ✓ Computing embeddings (this takes ~1 minute)...")
        self.dataset_embeddings = self.model.encode(
            self.dataset_instructions,
            show_progress_bar=True,
            batch_size=32
        )

        print(f"\n Dataset Matcher ready!\n")

    def _score_query(self, user_query: str) -> np.ndarray:
        """Return the cosine similarity of the query to every instruction.

        The result has one entry per dataset record, in dataset order. An
        empty dataset yields an empty array without touching the model, so
        callers can handle that case instead of crashing inside argmax.
        """
        if len(self.dataset) == 0:
            return np.zeros(0)

        query_embedding = self.model.encode([user_query])
        return cosine_similarity(query_embedding, self.dataset_embeddings)[0]

    def find_match(self, user_query: str, threshold: float = 0.85) -> Optional[Dict]:
        """
        Find best matching response from dataset

        Args:
            user_query: User's question
            threshold: Minimum similarity (0.85 recommended)

        Returns:
            Response dict if match found, None otherwise (also None when the
            dataset is empty)
        """
        similarities = self._score_query(user_query)
        if similarities.size == 0:
            return None

        # Plain int so the index works on the underlying Python list.
        best_match_idx = int(np.argmax(similarities))
        best_similarity = float(similarities[best_match_idx])

        if best_similarity >= threshold:
            matched_item = self.dataset[best_match_idx]

            return {
                'response': matched_item['output'],
                'similarity': best_similarity,
                'matched_question': matched_item['instruction'],
                'source': 'dataset',
                'tier': 1,
                'confidence': 'high' if best_similarity > 0.9 else 'medium'
            }

        return None

    def find_top_k_matches(self, user_query: str, k: int = 3) -> List[Dict]:
        """Find top K similar questions

        Args:
            user_query: User's question
            k: Maximum number of results; values <= 0 give an empty list and
               values above the dataset size are capped

        Returns:
            Dicts with 'question', 'similarity' and 'answer_preview', ordered
            from most to least similar
        """
        similarities = self._score_query(user_query)

        # Clamp explicitly: a slice like [-k:] with k == 0 would select the
        # whole array instead of nothing.
        k = min(k, similarities.size)
        if k <= 0:
            return []

        top_k_indices = np.argsort(similarities)[::-1][:k]

        results = []
        for idx in top_k_indices:
            idx = int(idx)
            answer = self.dataset[idx]['output']
            # Only mark the preview as truncated when text was actually cut.
            if len(answer) > self.PREVIEW_CHARS:
                preview = answer[:self.PREVIEW_CHARS] + '...'
            else:
                preview = answer
            results.append({
                'question': self.dataset_instructions[idx],
                'similarity': float(similarities[idx]),
                'answer_preview': preview
            })

        return results


class SafetyGuardrails:
    """Safety checks for queries

    Two keyword screens run before any dataset lookup:
      * out-of-domain topics are rejected outright;
      * privacy-sensitive terms are rejected when the query also contains a
        disclosure request ('tell', 'show', 'give', 'what is').

    Topic and privacy terms are matched as whole words (simple plurals
    included) so that in-domain questions are not rejected by accident,
    e.g. 'story' inside "credit history" or 'pin' inside "skipping".
    """

    def __init__(self):
        self.out_of_domain_patterns = [
            'weather', 'restaurant', 'recipe', 'poem', 'story', 'joke',
            'movie', 'game', 'sports', 'homework', 'music', 'celebrity'
        ]

        self.privacy_patterns = [
            'account number', 'balance', 'transaction', 'customer data',
            'personal information', 'credit card', 'password', 'pin'
        ]

    @staticmethod
    def _mentions(text: str, term: str) -> bool:
        """Return True if `term` occurs in `text` as a whole word or phrase.

        An optional trailing 's'/'es' is accepted so plurals such as
        'movies' or 'passwords' still count as a mention.
        """
        import re  # local import keeps this class self-contained

        pattern = r'\b' + re.escape(term) + r'(?:s|es)?\b'
        return re.search(pattern, text) is not None

    def check_query(self, user_query: str) -> tuple:
        """Check if query is safe and in-domain

        Args:
            user_query: User's question

        Returns:
            (True, None) when the query may be answered, otherwise
            (False, rejection_message)
        """
        query_lower = user_query.lower()

        if any(self._mentions(query_lower, pattern)
               for pattern in self.out_of_domain_patterns):
            return False, (
                "I apologize, but I can only assist with banking, financial services, "
                "loan, and insurance-related queries. For other topics, please use "
                "appropriate specialized services."
            )

        # Disclosure verbs keep their original substring semantics so the
        # privacy screen is not loosened ('showing', 'telling' still count).
        asks_for_disclosure = any(
            word in query_lower for word in ['tell', 'show', 'give', 'what is']
        )
        if asks_for_disclosure and any(
            self._mentions(query_lower, pattern) for pattern in self.privacy_patterns
        ):
            return False, (
                "I cannot access or provide customer account information or personal data "
                "due to strict privacy regulations. Please log into your account or "
                "contact customer support with proper verification."
            )

        return True, None


class BFSIAssistant:
    """Main BFSI Assistant

    Answers a query in three steps: safety screening (tier 0), dataset
    similarity lookup (tier 1), and a fixed "contact support" fallback when
    neither of the first two produces an answer.
    """

    def __init__(self, safety=None, matcher=None):
        """
        Build the assistant.

        Args:
            safety: Object exposing check_query(query) -> (is_safe, message);
                    a SafetyGuardrails instance is created when omitted
            matcher: Object exposing find_match(query, threshold);
                     a DatasetMatcher instance is created when omitted.
                     Passing one in lets an already-built matcher be reused
                     instead of loading the model again.
        """
        print("="*60)
        print(" BFSI Call Center AI Assistant")
        print("="*60 + "\n")

        self.safety = safety if safety is not None else SafetyGuardrails()
        self.matcher = matcher if matcher is not None else DatasetMatcher()

        print("="*60)
        print("Assistant Ready!")
        print("="*60 + "\n")

    @staticmethod
    def _fallback_response() -> Dict:
        """Return the fixed low-confidence answer pointing to customer support."""
        return {
            'response': (
                "I apologize, but I don't have a confident answer for your specific question. "
                "For accurate information, please contact our customer support at 1800-XXX-XXXX "
                "or email support@lendkraft.ai"
            ),
            'tier': None,
            'source': 'fallback',
            'confidence': 'low'
        }

    def get_response(self, user_query: str, threshold: float = 0.85) -> Dict:
        """Get response for user query

        Args:
            user_query: User's question
            threshold: Minimum similarity forwarded to the matcher

        Returns:
            Dict with at least 'response', 'tier', 'source' and 'confidence'.
            'tier' is 0 for a safety rejection, None for the fallback, and
            otherwise whatever the matcher reports.
        """
        # Nothing to screen or match in a blank query; answer with the
        # fallback right away instead of embedding an empty string.
        if user_query is None or not user_query.strip():
            return self._fallback_response()

        is_safe, rejection_msg = self.safety.check_query(user_query)

        if not is_safe:
            return {
                'response': rejection_msg,
                'tier': 0,
                'source': 'safety_guardrails',
                'confidence': 'high'
            }

        result = self.matcher.find_match(user_query, threshold)

        if result:
            return result

        return self._fallback_response()


# Progress marker: reaching this line means the class definitions in this cell
# executed without raising.
print("All classes defined successfully!")

All classes defined successfully!


In [7]:
# Build the assistant once and reuse it in the cells below: construction
# creates a DatasetMatcher, which loads the embedding model and embeds every
# dataset instruction.
assistant = BFSIAssistant()

print("\nReady to answer queries!")

 BFSI Call Center AI Assistant

 Initializing Dataset Matcher (Tier 1)...
   ✓ Loaded 103 conversation samples
   ✓ Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   ✓ Computing embeddings (this takes ~1 minute)...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]


 Dataset Matcher ready!

Assistant Ready!


Ready to answer queries!


In [8]:
# Mixed sample: loan questions plus one off-topic and one privacy-sensitive
# query, so both the matcher and the safety screen get exercised.
test_queries = [
    "How can I check my loan eligibility?",
    "What is EMI?",
    "Can I prepay my loan?",
    "What's the weather today?",
    "Tell me account balance for 123456",
]

# Longest response excerpt printed per query.
PREVIEW_CHARS = 300

print("\n" + "="*70)
print("TESTING BFSI ASSISTANT")
print("="*70 + "\n")

for i, query in enumerate(test_queries, 1):
    print(f"\n{'─'*70}")
    print(f"Test {i}/{len(test_queries)}")
    print(f"Query: {query}")
    print('─'*70)

    result = assistant.get_response(query)

    # The fallback result stores tier=None explicitly, so .get()'s default
    # never applies; map None by hand. Tier 0 (safety rejection) is a real
    # value and must still print as 0.
    tier = result.get('tier')

    print(f"\n Metadata:")
    print(f"   Tier: {'N/A' if tier is None else tier}")
    print(f"   Source: {result.get('source', 'N/A')}")
    print(f"   Confidence: {result.get('confidence', 'N/A')}")
    if 'similarity' in result:
        print(f"   Similarity: {result['similarity']:.3f}")

    print(f"\n Response:")
    response = result['response']
    # Only show an ellipsis when the response really was cut short.
    ellipsis = '...' if len(response) > PREVIEW_CHARS else ''
    print(f"   {response[:PREVIEW_CHARS]}{ellipsis}\n")

print("\n" + "="*70)
print("Testing Complete!")
print("="*70)


TESTING BFSI ASSISTANT


──────────────────────────────────────────────────────────────────────
Test 1/5
Query: How can I check my loan eligibility?
──────────────────────────────────────────────────────────────────────

 Metadata:
   Tier: 1
   Source: dataset
   Confidence: high
   Similarity: 1.000

 Response:
   You can check your loan eligibility by logging into your account and navigating to the 'Loan Eligibility' section. You'll need to provide: PAN card, Aadhaar details, employment information, and monthly income details. The system performs a soft credit check that does not impact your credit score. Re...


──────────────────────────────────────────────────────────────────────
Test 2/5
Query: What is EMI?
──────────────────────────────────────────────────────────────────────

 Metadata:
   Tier: 1
   Source: dataset
   Confidence: high
   Similarity: 1.000

 Response:
   EMI stands for Equated Monthly Installment. It is a fixed payment amount made by a borrower to the lender 

In [None]:
# Full, untruncated walk-through of a single query, including which dataset
# question it was matched to.
query = "How can I improve my loan eligibility?"
print(f"\n{'='*70}")
print(f"Query: {query}")
print('='*70)

result = assistant.get_response(query)

# The fallback result stores tier=None explicitly, so .get()'s default never
# applies; map None by hand while keeping tier 0 (safety rejection) as 0.
tier = result.get('tier')

print(f"\n Metadata:")
print(f"   Tier: {'N/A' if tier is None else tier}")
print(f"   Source: {result.get('source', 'N/A')}")
print(f"   Confidence: {result.get('confidence', 'N/A')}")
if 'similarity' in result:
    print(f"   Similarity Score: {result['similarity']:.4f}")
if 'matched_question' in result:
    print(f"   Matched Question: {result['matched_question']}")

print(f"\n Response:")
print(f"\n{result['response']}")
print(f"\n{'='*70}\n")


Query: How can I improve my loan eligibility?

 Metadata:
   Tier: 1
   Source: dataset
   Confidence: high
   Similarity Score: 1.0000
   Matched Question: How can I improve my loan eligibility?

 Response:

To improve your loan eligibility: (1) Maintain a credit score above 750 by paying all EMIs and credit card bills on time, (2) Reduce existing debt - aim for debt-to-income ratio below 40%, (3) Increase your income through salary increments or additional income sources, (4) Maintain stable employment for at least 2 years, (5) Avoid multiple loan applications within short periods, (6) Keep bank account statements healthy with regular credits, and (7) Clear any pending defaults or settlements. Improvement typically takes 3-6 months of consistent financial discipline.




In [None]:
# Inspect the nearest dataset questions for a query that is too vague to clear
# the match threshold on its own.
query = "Tell me about loan rates"
top_k = 5  # drives both the heading and the lookup so they cannot drift apart
separator = "=" * 70

print(f"\nQuery: {query}")
print(f"\nTop {top_k} Similar Questions:\n")
print(separator)

top_matches = assistant.matcher.find_top_k_matches(query, k=top_k)

for rank, match in enumerate(top_matches, start=1):
    print(
        f"\n{rank}. {match['question']}\n"
        f"   Similarity: {match['similarity']:.4f}\n"
        f"   Answer preview: {match['answer_preview']}"
    )

print(f"\n{separator}\n")


Query: Tell me about loan rates

Top 5 Similar Questions:


1. What interest rate will I get on my loan?
   Similarity: 0.7401
   Answer preview: Interest rates are personalized based on multiple factors: (1) Credit score: 750+ → 10.5-12% p.a., 7...

2. How is the interest rate determined for my loan?
   Similarity: 0.7071
   Answer preview: Your interest rate is calculated using a risk-based pricing model: Step 1 - Base Rate: Starts with o...

3. What is the loan interest rate for account 987654321?
   Similarity: 0.6639
   Answer preview: I cannot access specific account details or customer information due to data privacy and security po...

4. What is repo rate and how does it affect my loan?
   Similarity: 0.5772
   Answer preview: RBI Repo Rate is the rate at which RBI lends to commercial banks. Current rate: 6.50% (as of Decembe...

5. Can my interest rate change during the loan tenure?
   Similarity: 0.5670
   Answer preview: For floating rate loans: Yes, interest rate changes 

In [12]:
import gradio as gr


def chat(message, history):
    """Chat callback: return the assistant's reply text for `message`.

    `history` is accepted because the chat interface passes it, but it is not
    used; every message is answered independently of earlier turns.
    """
    return assistant.get_response(message)['response']

# NOTE(review): the recorded output shows a warning raised from
# `self.chatbot = Chatbot(` when this interface is built; presumably the
# chat-history format deprecation - confirm against the installed gradio version.
demo = gr.ChatInterface(
    fn=chat,
    examples=[
        "How can I check my loan eligibility?",
        "What is EMI?",
        "Can I prepay my loan?",
        "What documents are required?",
        "How to improve credit score?",
    ],
    title=" BFSI Call Center AI Assistant",
    description="Ask questions about loans, EMI, eligibility, and more!"
)

# With debug=True the launch call blocks until the cell is interrupted, so the
# hint about the URL has to be printed before it; anything printed afterwards
# only appears once the server has already been closed.
# share=True exposes the demo through a temporary public URL.
print("Launching demo... open the public URL printed below once it appears.")
demo.launch(share=True, debug=True)
print("Demo stopped.")



  self.chatbot = Chatbot(


Launching demo...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://da14e83bde6176d755.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://da14e83bde6176d755.gradio.live
Demo ready! Click the URL above.
