In [None]:
# File Path: train_rag_model.ipynb

# Step 1: Install Required Libraries
# pip install faiss-cpu transformers sentence-transformers torch pandas numpy

import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Step 2: Load Cleaned Data
# Load preprocessed merged schemes data
merged_data = pd.read_csv("../dataset/Merged_Schemes_Data.csv")

# Combine relevant text fields for embedding generation
merged_data['Full_Text'] = (
    merged_data['Description'] + " " +
    merged_data['Benefits'] + " " +
    merged_data['Flaws']
)



In [9]:
# Step 1: Ensure 'Full_Text' Column is Valid
# Replace NaN or invalid entries with empty strings
merged_data['Full_Text'] = merged_data['Full_Text'].fillna("").astype(str)


In [10]:
print(merged_data['Full_Text'].isnull().sum())  # Number of NaN entries


0


In [11]:
print(merged_data['Full_Text'].head())  # Ensure the column contains valid strings


0    promotes institutional deliveries among poor p...
1    ensures at least one anc checkup by specialist...
2    provides free anc services in government facil...
3    focuses on improving quality in labor rooms an...
4    provides anc and delivery support for girl chi...
Name: Full_Text, dtype: object


In [13]:

# Step 2: Re-run Embedding Generation
# Generate embeddings using Sentence-BERT
embed_model = SentenceTransformer('all-MiniLM-L6-v2') 
corpus = merged_data['Full_Text'].tolist()  # List of text strings
corpus_embeddings = embed_model.encode(corpus, convert_to_tensor=True)  # Convert to tensors

# Convert embeddings to NumPy array for FAISS
corpus_embeddings_np = corpus_embeddings.cpu().numpy()


In [14]:
# Step 4: Create FAISS Index
dimension = corpus_embeddings_np.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(dimension)  # L2 distance metric for similarity
index.add(corpus_embeddings_np)  # Add embeddings to the index

# Save FAISS index for reuse
faiss.write_index(index, "faiss_index.bin")

# Step 5: Load Pre-trained Generative Model
# Use a pre-trained model like T5 or BART for generation
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Step 6: Define RAG Pipeline
def generate_response(query, top_k=3):
    # Step 6.1: Generate query embedding
    query_embedding = embed_model.encode([query], convert_to_tensor=True).cpu().numpy()

    # Step 6.2: Retrieve top-k results from FAISS
    distances, indices = index.search(query_embedding, top_k)
    retrieved_texts = [corpus[i] for i in indices[0]]

    # Step 6.3: Prepare generative model input
    combined_context = " ".join(retrieved_texts)
    input_text = f"Context: {combined_context} Query: {query}"
    inputs = tokenizer.encode(input_text, return_tensors="pt")

    # Step 6.4: Generate response
    outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example Query
query = "What are the benefits of schemes for pregnant women under 19 years?"
response = generate_response(query)
print("Generated Response:", response)

# Step 7: Save and Load FAISS Index
# Save index and corresponding data for retrieval
faiss.write_index(index, "faiss_index_cpu.bin")
merged_data.to_csv("faiss_corpus_data.csv", index=False)

# To reload the index
loaded_index = faiss.read_index("faiss_index_cpu.bin")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generated Response: reduces unplanned pregnancies postpartum lack of trained staff for counseling provides contraceptive counseling and methods after delivery reduces unplanned pregnancies postpartum lack of trained staff for counseling Query: What are the benefits of schemes for pregnant women under 19 years?


In [15]:
def generate_response(query, top_k=3):
    # Generate query embedding
    query_embedding = embed_model.encode([query], convert_to_tensor=True).cpu().numpy()
    
    # Retrieve top-k results
    distances, indices = index.search(query_embedding, top_k)
    retrieved_texts = [corpus[i] for i in indices[0]]
    
    # Remove duplicate or repetitive sentences
    unique_texts = list(dict.fromkeys(retrieved_texts))
    
    # Combine into a single context (limit length if necessary)
    combined_context = " ".join(unique_texts[:top_k])  # Limit to top_k unique entries
    input_text = f"Context: {combined_context} Query: {query}"
    
    # Generate response
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example query
query = "What are the benefits of schemes for pregnant women under 19 years?"
response = generate_response(query)
print("Improved Response:", response)


Improved Response: Context: provides contraceptive counseling and methods after delivery reduces unplanned pregnancies postpartum lack of trained staff for counseling Query: What are the benefits of schemes for pregnant women under 19 years?


**Creating Training Dataset**


In [17]:
import pandas as pd

# Example corpus data (use preprocessed Full_Text from Merged_Schemes_Data)
corpus = [
    "Provides prenatal counseling and nutritional support for pregnant women.",
    "Offers financial assistance to families under the poverty line.",
    "Focuses on reducing maternal mortality rates through regular checkups.",
    "Promotes vaccination for newborns to prevent early childhood diseases.",
    "Improves healthcare access in rural areas through mobile clinics."
]

# Example queries
queries = [
    "What benefits are available for pregnant women under government schemes?",
    "How are rural healthcare services being improved?",
    "What is being done to support families below the poverty line?",
    "How are childhood vaccination schemes helping reduce diseases?",
    "What measures are being taken to lower maternal mortality rates?"
]

# Simulated responses based on corpus and queries
responses = [
    "Government schemes provide prenatal counseling, nutritional support, and medical care to pregnant women, especially in underserved regions.",
    "Rural healthcare services are being improved through mobile clinics that provide regular medical checkups and basic treatments.",
    "Families below the poverty line receive financial aid, including free access to essential healthcare and medicines.",
    "Childhood vaccination schemes significantly reduce diseases by ensuring timely immunization for all newborns.",
    "To lower maternal mortality rates, regular medical checkups and emergency care services have been implemented."
]

# Generate dataset
fine_tune_data = []
for query, response, context in zip(queries, responses, corpus):
    input_text = f"Context: {context} Query: {query}"
    target_text = response
    fine_tune_data.append({"input_text": input_text, "target_text": target_text})

# Convert to DataFrame
fine_tune_df = pd.DataFrame(fine_tune_data)

# Save to CSV
fine_tune_df.to_csv("../dataset/fine_tune_dataset.csv", index=False)
print("Fine-tune dataset saved as fine_tune_dataset.csv")


Fine-tune dataset saved as fine_tune_dataset.csv


In [19]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

# Create FAISS index
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_embeddings)

# Step 3: Generate Queries Dynamically
queries = []
for _, row in merged_data.iterrows():
    queries.append(f"What are the benefits of schemes for {row['Category Name']}?")
    queries.append(f"How does the government address flaws in {row['Category Name']} schemes?")
    queries.append(f"What impact do schemes have on {row['Category Name']} trends?")


KeyboardInterrupt: 

In [20]:

# Step 4: Retrieve Context and Generate Responses
fine_tune_data = []
for query in queries:
    # Step 4.1: Retrieve top-3 contexts
    query_embedding = embed_model.encode([query], convert_to_tensor=True).cpu().numpy()
    distances, indices = index.search(query_embedding, k=3)
    retrieved_texts = [corpus[i] for i in indices[0]]

    # Step 4.2: Combine contexts
    combined_context = " ".join(retrieved_texts)

    # Step 4.3: Simulate a response (using Benefits as a proxy)
    response = " ".join(merged_data.loc[indices[0], 'Benefits'].fillna("").tolist())

    # Step 4.4: Prepare input-output pair
    input_text = f"Context: {combined_context} Query: {query}"
    target_text = response.strip() or "No relevant data available."
    fine_tune_data.append({"input_text": input_text, "target_text": target_text})

# Step 5: Save the Fine-Tune Dataset
fine_tune_df = pd.DataFrame(fine_tune_data)
fine_tune_df.to_csv("../dastaset/large_fine_tune_dataset.csv", index=False)

print("Large fine-tune dataset saved as large_fine_tune_dataset.csv")


KeyboardInterrupt: 

In [16]:
# Fine-tune the T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load dataset
# Fine-tuning dataset should have two columns: "input_text" (query + context) and "target_text" (response)
fine_tune_data = pd.read_csv("fine_tune_dataset.csv")

# Tokenize dataset
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess data
train_encodings = tokenizer(fine_tune_data['input_text'].tolist(), truncation=True, padding=True, max_length=512)
train_labels = tokenizer(fine_tune_data['target_text'].tolist(), truncation=True, padding=True, max_length=128)

# Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    weight_decay=0.01
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=train_labels
)

# Train
trainer.train()


FileNotFoundError: [Errno 2] No such file or directory: 'fine_tune_dataset.csv'