In [2]:
!pip install langchain transformers faiss-cpu sentence-transformers pandas langchain-community
# If using OpenAI or another API-based LLM:
!pip install openai

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-se

In [4]:
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
import transformers
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter

# 1. Load and prepare data with chunking
df = pd.read_csv("Training Dataset.csv")

# Convert each row to text and chunk them
text_splitter = CharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separator="\n"
)

# Prepare knowledge base with chunking
documents = []
for _, row in df.iterrows():
    text = str(row.to_dict())
    docs = text_splitter.split_text(text)
    documents.extend(docs)

# 2. Initialize embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'}  # Explicitly set to CPU
)

# 3. Create vector store
vector_store = FAISS.from_texts(documents, embeddings)
vector_store.save_local("loan_approval_faiss_index")

# 4. Initialize LLM with proper tokenizer settings
llm_model = "google/flan-t5-small"
tokenizer = transformers.AutoTokenizer.from_pretrained(
    llm_model,
    model_max_length=512  # Set max length explicitly
)
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(llm_model)
pipe = transformers.pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256  # Limit output length
)
llm = HuggingFacePipeline(pipeline=pipe)

# 5. Create QA chain with updated LangChain syntax
prompt_template = """Answer the question based on the loan approval data.
Focus on the most relevant factors and provide a concise answer.

Context: {context}

Question: {question}
Answer in 2-3 sentences:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(
        search_kwargs={"k": 2}  # Retrieve fewer documents
    ),
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=False
)

# 6. Updated query function
def ask_question(question):
    try:
        # Use the new recommended invoke() method
        result = qa_chain.invoke({"query": question})
        return result["result"]
    except Exception as e:
        return f"Error processing question: {str(e)}"

# Example usage
questions = [
    "What factors typically lead to loan approval?",
    "How does credit history affect loan approval?",
    "What income level is usually required?",
    # Basic Loan Approval Questions
    "What are the main requirements for getting a loan approved?",
    "How important is employment status for loan approval?",
    "Does age affect loan approval decisions?",
    "What documentation is typically required for loan approval?",

    #Financial Factor Questions
    "How does debt-to-income ratio impact loan approval?",
    "What is the minimum credit score needed for loan approval?",
    "How do lenders evaluate income for loan approval?",
    "Does having savings affect loan approval chances?",

      # Demographic Factor Questions
    "How does marital status affect loan approval rates?",
    "Do loan approval rates differ between male and female applicants?",
    "How does the number of dependents influence loan approval?",
    "Are there differences in approval rates between urban and rural applicants?",

    # Loan-Specific Questions
    "What loan amount ranges are most commonly approved?",
    "How does loan term length affect approval chances?",
    "Are secured loans easier to get approved than unsecured loans?",
    "What types of collateral improve loan approval odds?",
]

for q in questions:
    print(f"Q: {q}")
    print(f"A: {ask_question(q)}\n")

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


Q: What factors typically lead to loan approval?
A: Loan_Amount_Term: 360.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_Term: 1.0, Loan_Amount_

Q: How does credit history affect loan approval?
A: Credit_History affects loan approval

Q: What income level is usually required?
A: 113.0

Q: What are the main requirements for getting a loan approved?
A: Loans are required to be approved by the lender.

Q: How important is employment status for loan approval?
A: Y

Q: Does age affect loan approval dec