In [1]:
import pandas as pd
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

# Load the processed CSV
file_path = "processed_output1.csv"  # Change path if needed
try:
    df = pd.read_csv(file_path)
except pd.errors.EmptyDataError as err:
    # pandas raises this when the file contains no header row at all (for
    # example a zero-byte file). Re-raise the same exception type, but name
    # the offending file so the step that produced it can be re-run.
    raise pd.errors.EmptyDataError(
        f"{file_path} is empty (no header row); regenerate it before running Q&A generation"
    ) from err

# The generation loop reads these columns from every row. Check them here so a
# malformed file fails before the model below is loaded.
required_columns = ("passage", "content", "title")
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{file_path} is missing required column(s): {missing_columns}")

# Load a pre-trained Bengali T5 checkpoint (example model, replace with your chosen model).
# NOTE(review): this checkpoint's name indicates a paraphrase model; confirm it
# is suitable for question generation before relying on the output.
model_name = "csebuetnlp/banglat5_banglaparaphrase"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Initialize Hugging Face pipeline for text generation (Question Generation)
qa_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Function to generate Q&A pairs using LLM
def generate_qa_pairs_llm(text):
    """Generate one question/answer pair for ``text`` with the seq2seq pipeline.

    The module-level ``qa_generator`` pipeline is called twice: first with a
    prompt asking for a question about ``text``, then with a prompt that
    contains that question plus ``text`` to produce the answer.

    Args:
        text: Source passage. Anything that is not a non-blank string
            (``None``, a NaN from a missing CSV cell, ``""``) yields no pairs.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts: a single entry
        for usable input, or an empty list when ``text`` is unusable.
    """
    # A missing CSV cell arrives as float NaN; formatting it into the prompt
    # would make the model generate from the literal string "nan".
    if not isinstance(text, str) or not text.strip():
        return []

    # Step 1: ask the model for a question about the passage.
    # truncation=True asks the tokenizer to clip inputs longer than its
    # configured maximum length instead of feeding them through unbounded.
    question_input = f"Generate a question based on the following text: {text}"
    questions = qa_generator(
        question_input,
        max_length=100,
        num_return_sequences=1,
        truncation=True,
    )
    question = questions[0]['generated_text']

    # Step 2: feed the generated question back together with the passage to
    # obtain an answer. The question comes first so truncation trims the
    # passage tail rather than the question.
    answer_input = f"Question: {question} Answer the question based on the following text: {text}"
    answers = qa_generator(
        answer_input,
        max_length=200,
        num_return_sequences=1,
        truncation=True,
    )
    answer = answers[0]['generated_text']

    return [{"question": question, "answer": answer}]

# Column order of the exported dataset. Passing it to the DataFrame
# constructor guarantees a header row is written even when no Q&A pair was
# generated; a header-less CSV cannot be read back by pd.read_csv.
qa_columns = ["id", "original_title", "source_text", "question", "answer"]

# List to hold final Q&A dataset
qa_data = []

# Iterate through each row to generate Q&A using LLM
for idx, row in df.iterrows():
    # Use passage if it's a string longer than 100 characters, otherwise fall back to content
    passage = row["passage"]
    if isinstance(passage, str) and len(passage) > 100:
        source_text = passage
    else:
        source_text = row["content"]

    # Skip rows with no usable text (missing cells are NaN, not str) rather
    # than asking the model to generate from a placeholder value.
    if not isinstance(source_text, str) or not source_text.strip():
        continue

    # Generate Q&A pairs using LLM and append them to the dataset
    for qa in generate_qa_pairs_llm(source_text):
        qa_data.append({
            "id": idx + 1,  # 1-based id; assumes the default integer index from read_csv
            "original_title": row["title"],
            "source_text": source_text,
            "question": qa["question"],
            "answer": qa["answer"],
        })

# Create final Q&A DataFrame
qa_df = pd.DataFrame(qa_data, columns=qa_columns)

# Save to CSV
output_file = "bengali_history_qa_dataset_llm.csv"
qa_df.to_csv(output_file, index=False)
print(f"✅ Q&A dataset saved to {output_file}")


  from .autonotebook import tqdm as notebook_tqdm


EmptyDataError: No columns to parse from file