In [3]:
import pandas as pd

# Load the processed CSV
file_path = "sData/df_temp.csv"  # Adjust if needed
df = pd.read_csv(file_path)

# Function to generate 5 Bengali Q&A pairs from a given text
def generate_qa_pairs(text):
    questions = [
        "এই লেখার মূল বিষয়বস্তু কী?",
        "এতে কী ঐতিহাসিক ঘটনা বর্ণনা করা হয়েছে?",
        "লেখাটি কোন সময়কাল নিয়ে আলোচনা করে?",
        "এতে উল্লেখযোগ্য ব্যক্তি বা ঘটনাবলি কী কী?",
        "এর প্রেক্ষাপট কী ছিল?"
    ]
    qa_pairs = [{"question": q, "answer": text.strip()} for q in questions]
    return qa_pairs

# List to hold final Q&A dataset
qa_data = []

# Iterate through each row to generate Q&A
for idx, row in df.iterrows():
    source_text = str(row["content"]).strip()
    
    # Skip very short content
    if len(source_text) < 50 or pd.isna(source_text):
        continue

    qa_pairs = generate_qa_pairs(source_text)
    for qa in qa_pairs:
        qa_data.append({
            "id": idx + 1,
            "type": row["type"],
            "content": source_text,
            "question": qa["question"],
            "answer": qa["answer"]
        })

# Create final Q&A DataFrame
qa_df = pd.DataFrame(qa_data)

# Save to CSV
output_file = "processed_bengali_qa.csv"
qa_df.to_csv(output_file, index=False)
print(f"Q&A dataset saved to {output_file}")


Q&A dataset saved to processed_bengali_qa.csv


In [5]:
import pandas as pd
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

# Load the processed CSV
file_path = "sData/temp2.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Load BanglaT5 model for Q&A generation
tokenizer = T5Tokenizer.from_pretrained("csebuetnlp/banglat5_banglaparaphrase")
model = T5ForConditionalGeneration.from_pretrained("csebuetnlp/banglat5_banglaparaphrase")

# Initialize pipeline
qa_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Function to generate a Q&A pair from content
def generate_qa_pairs_llm(text):
    try:
        text = text.strip()
        if len(text) < 50:
            return []

        # Generate a question
        question_input = f"এই পাঠ্যাংশ থেকে একটি প্রশ্ন তৈরি করো: {text}"
        questions = qa_generator(question_input, max_length=100, num_return_sequences=1)

        # Generate answer for the question
        question = questions[0]['generated_text'].strip()
        answer_input = f"প্রশ্ন: {question}। এই লেখার ভিত্তিতে উত্তর দাও: {text}"
        answers = qa_generator(answer_input, max_length=200, num_return_sequences=1)

        answer = answers[0]['generated_text'].strip()

        return [{"question": question, "answer": answer}]
    except Exception as e:
        print(f"Error processing text: {e}")
        return []

# List to collect Q&A data
qa_data = []

# Process each row
for idx, row in df.iterrows():
    content = str(row["content"]).strip()
    if not content or len(content) < 50:
        continue

    qa_pairs = generate_qa_pairs_llm(content)
    for qa in qa_pairs:
        qa_data.append({
            "id": idx + 1,
            "type": row["type"],
            "content": content,
            "question": qa["question"],
            "answer": qa["answer"]
        })

# Create DataFrame and save
qa_df = pd.DataFrame(qa_data)
output_file = "bengali_qa.csv"
qa_df.to_csv(output_file, index=False)
print(f"Q&A dataset saved to {output_file}")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors


Q&A dataset saved to bengali_qa.csv


In [2]:
import pandas as pd
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
import random

# Load the processed CSV (must have 'type' and 'content' columns)
file_path = "sData/temp2.csv"
df = pd.read_csv(file_path)

# Load BanglaT5 model for Q&A generation
model_name = "csebuetnlp/banglat5_banglaparaphrase"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Initialize pipeline
qa_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

# Function to generate Q&A pairs using LLM
def generate_qa_pairs_llm(text, num_questions=3):
    try:
        text = text.strip()
        if len(text) < 50:
            return []

        qa_pairs = []

        for _ in range(num_questions):
            # Randomly alter prompt wording
            prompt_options = [
                f"এই লেখার ভিত্তিতে একটি প্রশ্ন তৈরি করো: {text}",
                f"এই অনুচ্ছেদ থেকে প্রশ্ন তৈরি করো: {text}",
                f"এই পাঠ্যাংশ অনুসারে একটি প্রশ্ন দাও: {text}"
            ]
            question_input = random.choice(prompt_options)

            # Generate a question
            questions = qa_generator(question_input, max_length=50, num_return_sequences=1)
            question = questions[0]['generated_text'].strip()

            if not question or len(question.split()) < 2:
                continue

            # Generate answer
            answer_input = f"প্রশ্ন: {question}। এই লেখার ভিত্তিতে সংক্ষেপে উত্তর দাও: {text}"
            answers = qa_generator(answer_input, max_length=50, num_return_sequences=1)
            answer = answers[0]['generated_text'].strip()

            if not answer or answer.lower() in text.lower():
                continue  # Avoid copying from content

            # Keep short answers
            if len(answer.split()) > 20:
                continue

            qa_pairs.append({"question": question, "answer": answer})

        return qa_pairs
    except Exception as e:
        print(f"Error on text: {e}")
        return []

# Collect generated Q&A data
qa_data = []

for idx, row in df.iterrows():
    content = str(row["content"]).strip()
    if not content:
        continue

    qa_pairs = generate_qa_pairs_llm(content)

    if not qa_pairs:
        # Add a null row if no valid Q&A found
        qa_data.append({
            "id": idx + 1,
            "type": row["type"],
            "content": content,
            "question": None,
            "answer": None
        })
    else:
        for qa in qa_pairs:
            qa_data.append({
                "id": idx + 1,
                "type": row["type"],
                "content": content,
                "question": qa["question"],
                "answer": qa["answer"]
            })

# Save results
qa_df = pd.DataFrame(qa_data)
output_file = "bengali_llm_qa_output.csv"
qa_df.to_csv(output_file, index=False)
print(f"Bengali LLM Q&A dataset saved to: {output_file}")


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Bengali LLM Q&A dataset saved to: bengali_llm_qa_output.csv
