# **Prajwal Srivastava**

Week 8 Assignment for Celebal Technologies

In [7]:
!pip install -q pandas faiss-cpu sentence-transformers transformers torch

In [9]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load your dataset
df = pd.read_csv("/content/Training Dataset.csv")
df.fillna("", inplace=True)

# Convert each row to a text document
def row_to_text(row):
    return (
        f"The applicant is a {row['Gender']} who is a {row['Education']} and "
        f"{'is self-employed' if row['Self_Employed'] == 'Yes' else 'is not self-employed'}. "
        f"They have an income of {row['ApplicantIncome']}, a coapplicant income of {row['CoapplicantIncome']}, "
        f"a loan amount of {row['LoanAmount']} and a loan term of {row['Loan_Amount_Term']}. "
        f"The credit history is {row['Credit_History']} and the property area is {row['Property_Area']}. "
        f"The loan status is {row['Loan_Status']}."
    )

docs = df.apply(row_to_text, axis=1).tolist()
print(f"Processed {len(docs)} documents.")

# Embed with SentenceTransformer
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding documents")
doc_embeddings = embed_model.encode(docs, show_progress_bar=True)

dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# Load Flan-T5 base model
print("Loading language model")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
generator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Retrieval and generation functions
def retrieve_docs(query, k=5):
    query_vec = embed_model.encode([query])
    distances, indices = index.search(query_vec, k)
    return [docs[i] for i in indices[0]]

def generate_answer(context, question):
    prompt = (
        f"You are an assistant for analyzing loan applications.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        f"Answer the question based only on the context above."
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = generator_model.generate(
        **inputs,
        max_new_tokens=100,
        repetition_penalty=2.0,
        no_repeat_ngram_size=3
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def answer_question(question, top_k=5):
    context_docs = retrieve_docs(question, k=top_k)
    context = "\n".join(context_docs)
    return generate_answer(context, question)

# Main loop to replicate example behavior
while True:
    question = input("\nAsk a question (or type 'exit' to quit): ")
    if question.lower() == 'exit':
        break
    answer = answer_question(question)
    print(f"\nAnswer: {answer}")

  df.fillna("", inplace=True)


Processed 614 documents.
Embedding documents


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Loading language model


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Ask a question (or type 'exit' to quit): Do graduates have a higher chance of loan approval?

Answer: no

Ask a question (or type 'exit' to quit): How does credit history affect loan approval?

Answer: The credit history is 1.0 and the property area is Urban. The loan status is N

Ask a question (or type 'exit' to quit): How does income level influence the decision?

Answer: The applicant is a Male who is 

Ask a question (or type 'exit' to quit): exit
