In [2]:
# File Path: train_rag_model.ipynb

# Step 1: Install Required Libraries
# pip install faiss-cpu transformers sentence-transformers torch pandas numpy

import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Step 2: Load Cleaned Data
# Load preprocessed merged schemes data
merged_data = pd.read_csv("../dataset/Merged_Schemes_Data.csv")

# Combine relevant text fields for embedding generation.
# Empty CSV cells are read as NaN, and `str + NaN` evaluates to NaN, so a row
# missing any single field would lose its entire text (and put a float into
# the corpus handed to the encoder). Treat missing fields as empty strings;
# rows with all three fields present produce exactly the same text as before.
merged_data['Full_Text'] = (
    merged_data['Description'].fillna("").astype(str) + " " +
    merged_data['Benefits'].fillna("").astype(str) + " " +
    merged_data['Flaws'].fillna("").astype(str)
)

# Step 3: Generate Embeddings
# Encode every combined scheme text with a compact Sentence-BERT checkpoint.
embed_model_name = 'all-MiniLM-L6-v2'
embed_model = SentenceTransformer(embed_model_name)
corpus = merged_data['Full_Text'].to_list()
corpus_embeddings = embed_model.encode(corpus, convert_to_tensor=True)

# FAISS is fed a NumPy array, so copy the tensor to host memory first.
corpus_embeddings_np = corpus_embeddings.cpu().numpy()

# Step 4: Create FAISS Index
# The index is a flat (exhaustive) L2 index sized to the width of one embedding row.
dimension = corpus_embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings_np)

# Persist the populated index so it can be reused without re-encoding.
index_file = "faiss_index.bin"
faiss.write_index(index, index_file)

# Step 5: Load Pre-trained Generative Model
# Tokenizer and seq2seq weights are both fetched from the same checkpoint name
# (a T5 checkpoint here; another seq2seq checkpoint such as BART could be swapped in).
generator_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(generator_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(generator_checkpoint)

# Step 6: Define RAG Pipeline
def generate_response(query, top_k=3, max_input_tokens=512):
    """Answer ``query`` using retrieved corpus texts as context for the generator.

    The query is embedded with ``embed_model``, its nearest neighbours are
    looked up in the FAISS ``index``, the matching entries of ``corpus`` are
    joined into a context string, and the prompt
    ``"Context: <context> Query: <query>"`` is passed to ``model.generate``.

    Args:
        query: Question text to answer.
        top_k: Maximum number of corpus entries to retrieve. It is capped at
            the number of vectors stored in the index.
        max_input_tokens: Upper bound on the tokenized prompt length. When the
            prompt is longer, the retrieved context is shortened so that the
            query, which sits at the end of the prompt, is preserved.

    Returns:
        The decoded text of the first sequence returned by ``model.generate``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Step 6.1: Generate query embedding
    query_embedding = embed_model.encode([query], convert_to_tensor=True).cpu().numpy()

    # Step 6.2: Retrieve top-k results from FAISS
    # FAISS fills slots it cannot satisfy with the label -1. Indexing the corpus
    # with -1 would silently return the LAST document, so never request more
    # neighbours than the index holds and drop any -1 labels that still appear.
    k = min(top_k, index.ntotal)
    retrieved_texts = []
    if k > 0:
        _, indices = index.search(query_embedding, k)
        retrieved_texts = [corpus[int(i)] for i in indices[0] if i >= 0]

    # Step 6.3: Prepare generative model input
    combined_context = " ".join(retrieved_texts)
    query_suffix = f" Query: {query}"
    input_text = f"Context: {combined_context}{query_suffix}"
    if len(tokenizer.encode(input_text)) > max_input_tokens:
        # Plain right-truncation would cut off the query at the end of the
        # prompt, so shorten the context portion instead and re-attach the query.
        suffix_len = len(tokenizer.encode(query_suffix))
        context_budget = max(max_input_tokens - suffix_len, 0)
        context_ids = tokenizer.encode(
            f"Context: {combined_context}", add_special_tokens=False
        )[:context_budget]
        input_text = tokenizer.decode(context_ids, skip_special_tokens=True) + query_suffix
    # truncation=True is a final safety net in case re-tokenizing the trimmed
    # text yields a slightly different token count.
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )

    # Step 6.4: Generate response
    outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example Query
# Run one sample question through the pipeline and show the generated answer.
query = "What are the benefits of schemes for pregnant women under 19 years?"
response = generate_response(query=query)
print("Generated Response:", response)

# Step 7: Save and Load FAISS Index
# Write the index together with the data frame whose rows it was built from,
# so the retrieved row numbers can be mapped back to text later.
saved_index_path = "faiss_index_cpu.bin"
saved_corpus_path = "faiss_corpus_data.csv"
faiss.write_index(index, saved_index_path)
merged_data.to_csv(saved_corpus_path, index=False)

# To reload the index
loaded_index = faiss.read_index(saved_index_path)


KeyboardInterrupt: 