In [1]:
# ✅ STEP 1: Install Required Libraries
!pip install -q faiss-cpu sentence-transformers transformers

# ✅ STEP 2: Upload ONLY "Training Dataset.csv"
from google.colab import files
uploaded = files.upload()

import pandas as pd

file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

# ✅ STEP 3: Prepare Document Chunks from Data
docs = df.astype(str).apply(lambda row: ' | '.join(row), axis=1).tolist()

# ✅ STEP 4: Create Embeddings and Index with FAISS
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = embed_model.encode(docs, convert_to_numpy=True)

dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# ✅ STEP 5: Load FREE PUBLIC LLM from HuggingFace (no login needed)
from transformers import pipeline

qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

# ✅ STEP 6: Define the RAG-based Q&A Function
def rag_chatbot(question, top_k=5):
    q_embed = embed_model.encode([question])
    _, indices = index.search(q_embed, top_k)
    retrieved_docs = [docs[i] for i in indices[0]]
    context = "\n".join(retrieved_docs)

    prompt = f"Answer the question based on the context:\n\n{context}\n\nQuestion: {question}"
    result = qa_pipeline(prompt, max_new_tokens=150)
    return result[0]["generated_text"].strip()

# ✅ STEP 7: Ask Questions in a Loop
while True:
    question = input("\n💬 Ask a question (type 'exit' to quit): ")
    if question.lower() == 'exit':
        break
    answer = rag_chatbot(question)
    print("🤖 Answer:", answer)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving Training Dataset.csv to Training Dataset.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu



💬 Ask a question (type 'exit' to quit): what is the most common loan status
🤖 Answer: nan

💬 Ask a question (type 'exit' to quit): "what is the most common loan status?"
🤖 Answer: Not Graduate

💬 Ask a question (type 'exit' to quit): "Tell me about loan status trends"
🤖 Answer: LP001050 | nan | Yes | 2 | Not Graduate | No | 3572 | 4114.0 | 152.0 | nan | 0.0 | Rural | N LP001641 | Male | Yes | 1 | Graduate | Yes | 2178 | 0.0 | 66.0 | 300.0 | 0.0 | Rural | N LP001606 | Male | Yes | 0 | Graduate | No | 3497 | 1964.0 | 116.0 | 360.0 | 1.0 | Rural | N

💬 Ask a question (type 'exit' to quit): "Which loan status is most frequent?"
🤖 Answer: Not Graduate

💬 Ask a question (type 'exit' to quit): "What is the minimum loan amount?"
🤖 Answer: 12500

💬 Ask a question (type 'exit' to quit): "Are there more self-employed applicants or salaried ones?"
🤖 Answer: Self-employed

💬 Ask a question (type 'exit' to quit): quit
🤖 Answer: LP002335 | Female | Yes | 0 | Not Graduate | No | 2149 | 3237.0 | 178.0