In [None]:
import pandas as pd
import json
import nltk
from transformers import pipeline

# Download the NLTK 'punkt' package.
nltk.download('punkt')

# Load LLM for text generation (example: BanglaBERT)
# NOTE(review): a later cell in this notebook loads the same checkpoint with the
# "fill-mask" task and remarks that it is "not for text-generation" — confirm
# that this text-generation pipeline actually loads and yields usable output.
text_generator = pipeline("text-generation", model="csebuetnlp/banglabert")

def generate_passage(text):
    """Generate a passage from ``text`` with the module-level ``text_generator``.

    Returns ``None`` when ``text`` has fewer than 100 characters. Otherwise the
    pipeline is asked for a single sequence of up to 150 new tokens and its
    ``generated_text`` is returned. When the pipeline returns nothing, or
    raises, the input ``text`` is returned unchanged; the error is printed
    rather than propagated.
    """
    too_short = len(text) < 100
    if too_short:
        return None

    try:
        outputs = text_generator(text, max_new_tokens=150, num_return_sequences=1)
        if not outputs:
            # Nothing came back from the pipeline: fall back to the input.
            return text
        return outputs[0]['generated_text']
    except Exception as e:
        print(f"Error generating passage: {e}")
        return text

def generate_title(text):
    """Return ``text`` as a title, cut to 50 characters plus "..." when longer."""
    if len(text) <= 50:
        return text
    return text[:50] + "..."

def process_data(data):
    """Turn raw entries into records with a generated passage and title.

    Args:
        data: iterable of dict-like entries. Each entry is read through
            ``.get`` for the keys "content" and "type".

    Returns:
        A list of dicts with the keys "type", "content", "title" and
        "passage". Entries are skipped when their content is not a string,
        is shorter than 100 characters after stripping, or when
        ``generate_passage`` returns a falsy value.
    """
    processed = []
    for entry in data:
        content = entry.get("content", "")
        # Rows loaded through pandas can carry NaN (a float) or None for empty
        # cells; skip them instead of crashing on .strip().
        if not isinstance(content, str):
            continue

        content = content.strip()
        if len(content) < 100:
            continue

        passage = generate_passage(content)
        if not passage:
            continue

        processed.append({
            # A missing "type" no longer aborts the whole run with a KeyError.
            "type": entry.get("type", "unknown"),
            "content": content,
            "title": generate_title(passage),
            "passage": passage
        })
    return processed

def process_csv(csv_file, output_file="processed_output.csv"):
    """Run ``process_data`` over a CSV file and write the records to disk.

    Args:
        csv_file: path of the input CSV, read with ``pd.read_csv``.
        output_file: path of the CSV to write. Defaults to
            "processed_output.csv".

    Returns:
        None. The result is written to ``output_file`` and a completion
        message is printed.
    """
    df = pd.read_csv(csv_file)
    data = df.to_dict(orient='records')
    processed = process_data(data)
    # Naming the columns explicitly keeps a header row in the output even when
    # no entry was kept, so the file can still be read back afterwards.
    processed_df = pd.DataFrame(processed, columns=["type", "content", "title", "passage"])
    processed_df.to_csv(output_file, index=False)
    print("CSV Processing Done!")

# Example usage: process "cleaned_dataset.csv"; process_csv writes its result
# to "processed_output.csv".
process_csv("cleaned_dataset.csv")


In [None]:
import pandas as pd 

In [None]:
# Preview the merged dataset. Ending the cell on the expression (instead of
# print) lets Jupyter render the DataFrame as a table.
df = pd.read_csv("merged_data.csv")
df.head()



In [None]:
# Inspect the first 20 processed rows. Ending the cell on the expression
# (instead of print) lets Jupyter render the DataFrame as a table.
data = pd.read_csv("processed_output.csv")
data.head(20)

In [None]:
import pandas as pd
import json
import nltk
from transformers import pipeline
from datasets import Dataset

# Download the NLTK 'punkt' package.
nltk.download('punkt')

# Load LLM for text generation (BanglaBERT or alternative Bangla models)
# NOTE(review): device=0 presumably pins the pipeline to the first GPU, so this
# cell would need a CUDA device — confirm. A later cell also remarks that this
# checkpoint is "not for text-generation"; verify the task choice.
text_generator = pipeline("text-generation", model="csebuetnlp/banglabert", device=0)

def truncate_text(text, max_tokens=512):
    """Return at most the first ``max_tokens`` characters of ``text``.

    Despite the parameter name, the cut is a plain string slice, so the limit
    counts characters rather than tokenizer tokens.
    """
    head = slice(None, max_tokens)
    return text[head]

def generate_passage_batch(batch):
    """Generate one passage per entry of ``batch['content']``.

    Entries shorter than 100 characters map to ``None``. Longer entries are
    cut with ``truncate_text(text, 512)`` and sent to the module-level
    ``text_generator`` for a single sequence of up to 150 new tokens. When the
    pipeline returns nothing, or anything raises, the text as it stood at that
    point is used instead and the error is printed.

    Returns:
        ``{"passage": [...]}`` with one item per input entry, in order.
    """
    def _passage_for(text):
        # Short entries get no passage at all.
        if len(text) < 100:
            return None
        try:
            text = truncate_text(text, 512)  # cut long texts before generation
            outputs = text_generator(text, max_new_tokens=150, num_return_sequences=1)
            return outputs[0]['generated_text'] if outputs else text
        except Exception as e:
            print(f"Error generating passage: {e}")
            return text

    return {"passage": [_passage_for(item) for item in batch['content']]}

def generate_title_batch(batch):
    """Generate one title per entry of ``batch['passage']``.

    A passage longer than 50 characters becomes its first 50 characters plus
    "..."; a shorter passage is used as the title unchanged. A ``None``
    passage yields a ``None`` title.

    Returns:
        ``{"title": [...]}`` with one item per input passage, in order.
    """
    titles = []
    for passage in batch['passage']:
        if passage is None:
            # generate_passage_batch emits None for content under 100
            # characters; carry it through instead of failing on len(None).
            titles.append(None)
        elif len(passage) > 50:
            titles.append(passage[:50] + "...")
        else:
            titles.append(passage)
    return {"title": titles}

def process_csv(csv_file, output_file="processed.csv"):
    """Generate passages and titles for a CSV file using batched dataset maps.

    Args:
        csv_file: path of the input CSV; it must have a "content" column.
        output_file: path of the CSV to write. Defaults to "processed.csv".

    Returns:
        None. The result is written to ``output_file`` and a completion
        message is printed.
    """
    df = pd.read_csv(csv_file)
    long_enough = df['content'].str.len() >= 100  # Filter short content
    # Filtering leaves gaps in the index, which Dataset.from_pandas would store
    # as an extra "__index_level_0__" column in the output; reset it first.
    df = df[long_enough].reset_index(drop=True)
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(generate_passage_batch, batched=True, batch_size=8)
    dataset = dataset.map(generate_title_batch, batched=True, batch_size=8)
    processed_df = dataset.to_pandas()
    processed_df.to_csv(output_file, index=False)
    print("CSV Processing Done!")

# Example usage: process "merged_data.csv"; process_csv writes its result to
# "processed.csv".
process_csv("merged_data.csv")


In [1]:
import pandas as pd
import json
import nltk
from transformers import pipeline

# Download the NLTK 'punkt' package (presumably required by the
# nltk.word_tokenize call in generate_passage below).
nltk.download('punkt')

# Load BanglaBERT through the fill-mask task, since the checkpoint is not a
# text-generation model.
# NOTE(review): the recorded output of this cell warns that some generator head
# weights were newly initialized, so the mask predictions may not be
# meaningful — confirm before relying on the generated passages.
text_generator = pipeline("fill-mask", model="csebuetnlp/banglabert")

def generate_passage(text):
    """Rewrite ``text`` by masking its middle word and filling it back in.

    The text is split with ``nltk.word_tokenize``. Fewer than 20 tokens yields
    ``None``. Otherwise the middle token is replaced by the tokenizer's mask
    token, the tokens are joined with single spaces, and the module-level
    fill-mask ``text_generator`` is run on the result. The mask is then
    replaced by the ``token_str`` of the first prediction. When the pipeline
    returns nothing, or anything raises, the original ``text`` is returned and
    the error is printed.
    """
    try:
        tokens = nltk.word_tokenize(text)
        if len(tokens) < 20:
            return None

        # Swap the middle token for the mask the fill-mask pipeline expects.
        tokens[len(tokens) // 2] = text_generator.tokenizer.mask_token
        masked_text = " ".join(tokens)

        predictions = text_generator(masked_text)
        if not predictions:
            return text

        # Replace mask with top prediction
        best_token = predictions[0]["token_str"]
        return masked_text.replace(text_generator.tokenizer.mask_token, best_token)
    except Exception as e:
        print(f"Error generating passage: {e}")
        return text

def generate_title(text):
    """Build a title: the first 50 characters of ``text`` plus "..." if it is longer, else ``text``."""
    if len(text) > 50:
        head = text[:50]
        return head + "..."
    return text

def process_data(data):
    """Turn raw entries into records with a generated passage and title.

    Args:
        data: iterable of dict-like entries. Each entry is read through
            ``.get`` for the keys "content" and "type".

    Returns:
        A list of dicts with the keys "type" ("unknown" when absent),
        "content", "title" and "passage". Entries are skipped when their
        content is not a string, is shorter than 100 characters after
        stripping, or when ``generate_passage`` returns a falsy value.
    """
    processed = []
    for entry in data:
        content = entry.get("content", "")
        # Rows loaded through pandas can carry NaN (a float) or None for empty
        # cells; skip them instead of crashing on .strip().
        if not isinstance(content, str):
            continue

        content = content.strip()
        if len(content) < 100:
            continue

        passage = generate_passage(content)
        if not passage:
            continue

        processed.append({
            "type": entry.get("type", "unknown"),
            "content": content,
            "title": generate_title(passage),
            "passage": passage
        })
    return processed

def process_csv(csv_file, output_file="processed_output1.csv"):
    """Run ``process_data`` over a CSV file and write the records to disk.

    Args:
        csv_file: path of the input CSV, read with ``pd.read_csv``.
        output_file: path of the CSV to write. Defaults to
            "processed_output1.csv".

    Returns:
        None. The result is written to ``output_file`` and a completion
        message is printed.
    """
    df = pd.read_csv(csv_file)
    data = df.to_dict(orient='records')
    processed = process_data(data)
    # Naming the columns explicitly keeps a header row in the output even when
    # no entry was kept, so the file can still be read back afterwards.
    processed_df = pd.DataFrame(processed, columns=["type", "content", "title", "passage"])
    processed_df.to_csv(output_file, index=False)
    print("✅ CSV Processing Done!")

# Example usage: process "dataset.csv"; process_csv writes its result to
# "processed_output1.csv".
process_csv("dataset.csv")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USERAS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


✅ CSV Processing Done!
