In [1]:
!pip install transformers datasets torch



In [2]:
import json
# Load the card dataset. The context manager closes the file handle right
# after parsing; the bare open() call previously left it open.
with open("y.json", encoding="utf-8") as json_file:
  data = json.load(json_file)

In [3]:
import torch
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cuda')

In [4]:
!pip install faiss-gpu-cu12



In [5]:
import faiss
import numpy as np

# GPU resource handle for FAISS; it is passed to the GPU index constructor later on.
gpu_res = faiss.StandardGpuResources()
gpu_res

<faiss.swigfaiss.StandardGpuResources; proxy of <Swig Object of type 'faiss::gpu::StandardGpuResources *' at 0x79d3182410e0> >

In [6]:
!pip install sentence-transformers



In [7]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the corpus texts and the user queries,
# moved to the device selected earlier.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Flatten the nested card data into one retrievable text per fact.
# `texts` feeds the embedding model; `metadata` keeps (card, section, text)
# at the same position so a search hit can be mapped back to its card.
texts = []
metadata = []
for entry in data:
  card_name, section_type, details = (
      entry["card_name"], entry["section_type"], entry["details"])

  for detail in details:
    if "question" in detail and "answer" in detail:
      # FAQ-style detail: keep the question and its answer in a single text.
      detail_texts = [
          f"{card_name} - {section_type}: Question: {detail['question']} Answer: {detail['answer']}"
      ]
    elif isinstance(detail["value"], list):
      # Nested detail: one text per child, prefixed with the parent's type.
      detail_texts = [
          f"{card_name} - {section_type}: {detail['type']} - {nested_detail['type']} : {nested_detail['value']}"
          for nested_detail in detail["value"]
      ]
    else:
      # Plain type/value detail.
      detail_texts = [
          f"{card_name} - {section_type}: {detail['type']} : {detail['value']}"
      ]

    for text in detail_texts:
      texts.append(text)
      metadata.append((card_name, section_type, text))

In [9]:
# Embed every flattened text; convert_to_numpy=True requests a NumPy array,
# which is what gets added to the FAISS index.
embeddings = embedding_model.encode(texts, convert_to_numpy=True)

In [10]:
# Width of each embedding vector (second axis of `embeddings`); the FAISS
# index is created with this size.
dimension = embeddings.shape[1]
dimension

384

In [11]:
# Flat L2-distance index on the GPU, sized to the embedding width.
faiss_index = faiss.GpuIndexFlatL2(gpu_res, dimension)

In [12]:
# Index the corpus. Embeddings follow the order of `texts`, which was built in
# lockstep with `metadata`, so a hit's position is used to index `metadata`.
faiss_index.add(embeddings)

In [13]:
def handle_query_faiss(query, card_name_filter=None, top_k=3):
  """Retrieve the stored texts closest to ``query``.

  Args:
    query: Natural-language question to embed and search for.
    card_name_filter: If given, only hits whose card name equals this value
      are returned.
    top_k: Maximum number of results to return.

  Returns:
    A list of at most ``top_k`` dicts with keys ``distance``, ``card_name``,
    ``section_type`` and ``text``, in the order FAISS returned them
    (nearest first).
  """
  query_embedding = embedding_model.encode([query], convert_to_numpy=True)

  # The card filter is applied after the search, so asking FAISS for only
  # top_k neighbours can leave few or no hits once other cards are dropped.
  # Over-fetch when filtering; GPU FAISS limits k to 2048.
  search_k = top_k
  if card_name_filter:
    search_k = max(top_k, min(faiss_index.ntotal, 2048))
  distances, indices = faiss_index.search(query_embedding, search_k)

  results = []
  for dist, idx in zip(distances[0], indices[0]):
    # FAISS pads with -1 when fewer than k vectors are available; without
    # this guard metadata[-1] would silently return the last entry.
    if idx < 0:
      continue
    card_name, section_type, text = metadata[idx]
    if card_name_filter and card_name != card_name_filter:
      continue
    results.append({
        "distance": dist,
        "card_name": card_name,
        "section_type": section_type,
        "text": text
    })
    # Over-fetching must not change the caller-visible result size.
    if len(results) >= top_k:
      break
  return results

The model below, `hashtag7781/bert-finetuned-squad`, was fine-tuned by us on the SQuAD dataset.

In [14]:
from transformers import pipeline
# Question-answering pipeline; it is later called with `question` and `context`
# keyword arguments and its result is read through the 'answer' key.
question_answerer = pipeline("question-answering", model="hashtag7781/bert-finetuned-squad")

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at hashtag7781/bert-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.
Device set to use 0


In [15]:
def generate_answer(query, identified_card=None, top_k=3):
  """Answer ``query`` from the card texts retrieved for it.

  Args:
    query: The user's question.
    identified_card: Card name used to restrict retrieval, or ``None`` for
      no restriction.
    top_k: Maximum number of retrieved texts to use as context.

  Returns:
    The answer string produced by the QA pipeline, or ``"No answer found"``
    when retrieval yields no context.
  """
  retrieved_contexts = handle_query_faiss(query, identified_card, top_k)
  context_texts = [context["text"] for context in retrieved_contexts]

  # The QA pipeline rejects an empty context, so report the absence of an
  # answer instead of letting the call raise.
  if not context_texts:
    return "No answer found"

  return question_answerer(question=query, context=' '.join(context_texts))['answer']

In [16]:
import re

# Card names that can be recognised in a user query. The same list is printed
# back to the user when no card name is found in the query.
card_names = [
    "Active Cash Visa Card",
    "Autograph Journey Visa Card",
    "Autograph Visa Card",
    "Reflect Visa Card",
    "Attune World Elite Mastercard",
    "Bilt World Elite Mastercard",
    "One Key Mastercard",
    "One Key+ Mastercard",
    "Choices Privileges World Elite Mastercard",
    "Choices Privileges Select World Elite Mastercard",
    "Signify Business Cash Card"
]

def identify_card_from_query(query):
    """Return the first entry of ``card_names`` mentioned in ``query``.

    Names are tried in list order; matching is case-insensitive and anchored
    on word boundaries. Returns ``None`` when no card name appears.
    """
    mentioned = (
        card
        for card in card_names
        if re.search(rf"\b{re.escape(card)}\b", query, re.IGNORECASE)
    )
    return next(mentioned, None)

def handle_user_query(query, top_k=3):
    """Answer ``query`` if it names a known card; otherwise prompt for one.

    When a card name is detected, a confirmation is printed and the generated
    answer is returned. When none is detected, the list of available card
    names is printed and ``None`` is returned.
    """
    identified_card = identify_card_from_query(query)

    if not identified_card:
        card_list = "\n".join(card_names)
        print(f"Could not identify the card. Please mention the card name along with your query again.\nHere is a list of available card names:\n{card_list}")
        return None

    print(f"Card identified: {identified_card}\nProceeding with your query...")
    return generate_answer(query, identified_card, top_k)

In [17]:
# Demo: top_k=1, so only the single nearest retrieved text is used as context.
query = "What is the minimum credit card limit of Active Cash visa card?"
answer = handle_user_query(query, 1)
answer

Card identified: Active Cash Visa Card
Proceeding with your query...


'$1,000.00'

In [18]:
# Demo: the 'None' displayed for this query is a string returned as the answer
# text, not Python's None (which is what an unidentified card would yield).
query2 = "What is the annual fee of the Active Cash Visa card?"
answer2 = handle_user_query(query2, 1)
answer2

Card identified: Active Cash Visa Card
Proceeding with your query...


'None'

In [19]:
# Demo: open-ended process question, again answered from the top-1 retrieved text only.
query3 = "How do you review my application for the Active Cash Visa card?"
answer3 = handle_user_query(query3, 1)
answer3

Card identified: Active Cash Visa Card
Proceeding with your query...


'You agree that we can review your criteria'

In [20]:
# Demo: the card name is typed in a different case ("visa card"); it is still
# identified because card detection ignores case.
query4 = "How are variable APRs calculated for Active Cash visa card?"
answer4 = handle_user_query(query4, 1)
answer4

Card identified: Active Cash Visa Card
Proceeding with your query...


'we use the U.S. Prime Rate'