In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
pip install pandas scikit-learn gradio sentence-transformers faiss-cpu transformers



In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("/content/drive/MyDrive/ragloanbot/Training Dataset.csv")

df_clean = df.dropna().copy()

label_encoders = {}
for col in df_clean.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

X = df_clean.drop("Loan_Status", axis=1)
y = df_clean["Loan_Status"]


In [52]:
df_clean.to_csv("cleaned_loan_data.csv", index=False)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

#sorted feature importances
feature_importances = sorted(zip(X.columns, clf.feature_importances_), key=lambda x: -x[1])
print("Top Features:\n")
for feat, score in feature_importances:
    print(f"{feat}: {score:.3f}")

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.59      0.67        27
           1       0.85      0.93      0.89        69

    accuracy                           0.83        96
   macro avg       0.81      0.76      0.78        96
weighted avg       0.83      0.83      0.83        96

Top Features:

Credit_History: 0.228
ApplicantIncome: 0.161
LoanAmount: 0.154
Loan_ID: 0.152
CoapplicantIncome: 0.091
Loan_Amount_Term: 0.054
Property_Area: 0.043
Dependents: 0.040
Married: 0.023
Gender: 0.019
Education: 0.018
Self_Employed: 0.015


In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np

#Sentence embeddings for retrieval
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

#Load LLM
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
llm = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

#Convert each row into a string document
def row_to_text(row):
    return " | ".join([f"{col}: {row[col]}" for col in df_clean.columns])

documents = df_clean.apply(row_to_text, axis=1).tolist()

#Encode documents and build FAISS index
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

Device set to use cuda:0


In [49]:
#RAG
def retrieve_docs(query, k=2):
    query_vec = embedder.encode([query])
    D, I = index.search(np.array(query_vec), k)
    return [documents[i] for i in I[0]]

#QA generation function
def generate_answer(query, context_docs):
    context = "\n".join(context_docs[:2])

    #Basic hardcoded logic for popular questions
    if "loan amount" in query.lower():
        avg_loan = df_clean["LoanAmount"].mean()
        return f"The typical loan amount is around {round(avg_loan, 2)}."

    elif "credit history" in query.lower():
        approved = df_clean[df_clean["Loan_Status"] == 1]
        has_credit_history = approved["Credit_History"].mean()
        return "Yes, credit history strongly affects loan approval." if has_credit_history > 0.7 else "Credit history has some influence on loan approval."

    elif "gender" in query.lower() or "education" in query.lower():
        counts = df_clean[df_clean["Loan_Status"] == 1].groupby(["Gender", "Education"]).size()
        return f"Approved applicants are mostly from these Gender/Education groups:\n{counts}"

    #Else use LLM
    prompt = f"""
You are a data analyst. Use the dataset info to answer the question.

Dataset snippet:
{context}

Question: {query}
Answer:"""

    result = llm(prompt, max_new_tokens=256)[0]["generated_text"]
    return result.strip()

In [55]:
while True:
    query = input("Enter your question about the loan dataset (or type 'exit'): ")
    if query.lower() in ["exit", "quit"]:
        break
    docs = retrieve_docs(query)
    answer = generate_answer(query, docs)
    print("\nAnswer:\n", answer)

Enter your question about the loan dataset (or type 'exit'): what is the typical loan amount

Answer:
 The typical loan amount is around 144.74.
Enter your question about the loan dataset (or type 'exit'): exit
