In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm

# Load your data
# Forward slash on purpose: the previous "sData\df_temp.csv" contained the invalid
# escape sequence "\d" (a warning on recent Python) and only resolved on Windows.
df = pd.read_csv("sData/df_temp.csv")  # make sure it has 'type' and 'content' columns

# Device index shared by every pipeline below: 0 = first CUDA GPU, -1 = CPU
device = 0 if torch.cuda.is_available() else -1

# Load BanglaBERT tokenizer & model
# NOTE(review): transformers reports the classifier weights of this checkpoint as
# newly initialized, so the predicted labels are not meaningful until it is fine-tuned.
sentiment_model = AutoModelForSequenceClassification.from_pretrained("csebuetnlp/banglabert")
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

# Setup sentiment pipeline
sentiment_pipe = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=tokenizer, device=device)

# For title/context generation, use a multilingual summarizer (mT5)
summarizer = pipeline("summarization", model="google/mt5-small", tokenizer="google/mt5-small", device=device)

# For content classification (news/story/etc.) – text classification with BanglaBERT
# (not mT5). You can fine-tune later for better accuracy.
# NOTE(review): nothing in this cell calls classifier_pipe; guess_class is keyword based.
classifier_pipe = pipeline("text-classification", model="csebuetnlp/banglabert", tokenizer=tokenizer, device=device)

# Helper functions
def get_sentiment(text):
    """Classify ``text`` as 'positive', 'neutral' or 'negative' via ``sentiment_pipe``.

    Only the first 512 characters are sent to the pipeline. The pipeline label is
    lower-cased; any label outside the three expected values maps to 'neutral'.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) and blank strings are reported as 'neutral' without calling
            the model.

    Returns:
        One of 'positive', 'neutral', 'negative'. 'neutral' is also the
        best-effort fallback when the pipeline raises.
    """
    allowed = ('positive', 'neutral', 'negative')
    if not isinstance(text, str) or not text.strip():
        return 'neutral'
    try:
        # The character slice alone does not bound the token count, so also ask the
        # tokenizer to truncate; otherwise an over-long input raises and is silently
        # turned into 'neutral'. 512 is presumably the encoder's position limit —
        # confirm against the checkpoint config.
        prediction = sentiment_pipe(text[:512], truncation=True, max_length=512)[0]
        label = prediction['label'].lower()
    except Exception:
        # Deliberately not a bare `except:` so Ctrl-C can still stop a long run.
        return 'neutral'
    return label if label in allowed else 'neutral'

def get_title(text):
    """Generate a short title for ``text`` with the module-level ``summarizer``.

    The first 512 characters are summarized with ``max_length=15``,
    ``min_length=5`` and sampling disabled; newlines in the summary become spaces.

    Args:
        text: Document content. ``None``/NaN are treated as empty text; any other
            non-string value is converted with ``str``.

    Returns:
        The stripped summary, the first 30 characters of the text if the
        summarizer raises, or '' when there is no text to summarize.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as None/NaN. Previously the `text[:30]` fallback
        # raised on them inside the except handler and aborted the whole apply.
        missing = text is None or (isinstance(text, float) and text != text)
        text = "" if missing else str(text)
    if not text.strip():
        return ""
    try:
        summary = summarizer(text[:512], max_length=15, min_length=5, do_sample=False)[0]['summary_text']
        return summary.strip().replace("\n", " ")
    except Exception:
        # Best-effort fallback; not a bare `except:` so Ctrl-C still interrupts.
        return text[:30]

def get_context(text):
    """Generate a short context blurb for ``text`` with the module-level ``summarizer``.

    The first 512 characters are summarized with ``max_length=40``,
    ``min_length=10`` and sampling disabled; newlines in the summary become spaces.

    Args:
        text: Document content. ``None``/NaN are treated as empty text; any other
            non-string value is converted with ``str``.

    Returns:
        The stripped summary, the first 60 characters of the text if the
        summarizer raises, or '' when there is no text to summarize.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as None/NaN. Previously the `text[:60]` fallback
        # raised on them inside the except handler and aborted the whole apply.
        missing = text is None or (isinstance(text, float) and text != text)
        text = "" if missing else str(text)
    if not text.strip():
        return ""
    try:
        summary = summarizer(text[:512], max_length=40, min_length=10, do_sample=False)[0]['summary_text']
        return summary.strip().replace("\n", " ")
    except Exception:
        # Best-effort fallback; not a bare `except:` so Ctrl-C still interrupts.
        return text[:60]

def guess_class(text):
    """Guess a coarse content class from Bengali keywords found in ``text``.

    Categories are checked in a fixed order (history, philosophy, news, story,
    text_book) and the first category with a matching keyword wins. Matching is
    plain substring search on the lower-cased text.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) are converted with ``str`` instead of raising AttributeError.

    Returns:
        'history', 'philosophy', 'news', 'story' or 'text_book'; 'news' is the
        default when no keyword matches.
    """
    haystack = str(text).lower()
    # Order matters: 'রাজনীতি' (history) contains 'নীতি' (philosophy), so history
    # must be tested first to keep the original precedence.
    keyword_table = (
        ('history', ('ইতিহাস', 'রাজনীতি', 'যুদ্ধ')),
        ('philosophy', ('দর্শন', 'চিন্তা', 'নীতি')),
        ('news', ('সংবাদ', 'ঘটনা', 'খবর')),
        ('story', ('গল্প', 'উপন্যাস', 'কবিতা')),
        ('text_book', ('পাঠ্যপুস্তক', 'শিক্ষা', 'শিক্ষাবিষয়ক')),
    )
    for label, keywords in keyword_table:
        if any(keyword in haystack for keyword in keywords):
            return label
    return 'news'  # default fallback

# Apply processing
tqdm.pandas(desc="Processing Rows")

# Every derived column is computed row by row from `content`; the list order is
# the order in which the columns are added (title, context, label, class).
for column, builder in [
    ("title", get_title),
    ("context", get_context),
    ("label", get_sentiment),
    ("class", guess_class),
]:
    df[column] = df["content"].progress_apply(builder)

# Final shape: original columns followed by the derived ones
output_columns = ["type", "content", "title", "context", "label", "class"]
df_final = df[output_columns]

# Save to CSV (row index omitted)
df_final.to_csv("preprocessed_bengali_data.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0
Some weights of ElectraForSequenceClassification were not initialize

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
import re

# Load your data
df = pd.read_csv("sData/df_temp.csv")  # Ensure 'type' and 'content' columns exist

# Sentiment model (multilingual, star-based); tokenizer and weights come from the same checkpoint
sentiment_tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Pipeline runs on the first GPU when CUDA is available, otherwise on CPU (-1)
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Bengali character check
def is_bengali(text):
    """Return True if ``text`` contains at least one character in U+0980–U+09FF.

    The argument is converted with ``str`` first, so non-string values are accepted.
    """
    bengali_char = r'[\u0980-\u09FF]'
    hit = re.search(bengali_char, str(text))
    return hit is not None

# Convert star rating to sentiment label
def convert_star_to_label(star):
    """Map a star rating to a sentiment label.

    1 or 2 -> "negative", 3 -> "neutral", any other value -> "positive".
    """
    if star in (1, 2):
        return "negative"
    return "neutral" if star == 3 else "positive"

# Sentiment extraction
def get_sentiment(text):
    """Classify ``text`` as 'positive', 'neutral' or 'negative' via ``sentiment_pipe``.

    The first 512 characters are scored; the first character of the returned
    label is parsed as the star count and mapped with ``convert_star_to_label``.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) and blank strings are reported as "neutral" without calling
            the model.

    Returns:
        "negative", "neutral" or "positive". "neutral" is also the best-effort
        fallback when the pipeline or the label parsing raises.
    """
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        # The character slice alone does not bound the token count, so also ask the
        # tokenizer to truncate; otherwise an over-long input raises and is silently
        # turned into "neutral". 512 is presumably the encoder's position limit —
        # confirm against the checkpoint config.
        result = sentiment_pipe(text[:512], truncation=True, max_length=512)[0]
        stars = int(result['label'][0])
        return convert_star_to_label(stars)
    except Exception:
        # Deliberately not a bare `except:` so Ctrl-C can still stop a long run.
        return "neutral"

# Extract title (first Bengali sentence or fallback)
def extract_title(text):
    """Pick a title: the first Bengali sentence longer than 10 characters.

    The text is converted with ``str``, stripped, and split on the danda or a
    newline. If no fragment qualifies, the first 30 characters of the stripped
    text (stripped again) are returned.
    """
    cleaned = str(text).strip()
    fragments = re.split(r'[।\n]', cleaned)
    first_hit = next(
        (frag.strip() for frag in fragments if is_bengali(frag) and len(frag.strip()) > 10),
        None,
    )
    if first_hit is not None:
        return first_hit
    return cleaned[:30].strip()

# Extract context (first 1-2 Bengali sentences or fallback)
def extract_context(text):
    """Build a context string from the first one or two Bengali sentences.

    The text is converted with ``str``, stripped, and split on the danda or a
    newline; fragments that contain Bengali and are longer than 10 characters
    after stripping are kept. The first two kept fragments are joined with
    "। " and closed with a danda. With no kept fragment, the first 60
    characters of the stripped text are returned.
    """
    cleaned = str(text).strip()
    kept = []
    for fragment in re.split(r'[।\n]', cleaned):
        candidate = fragment.strip()
        if is_bengali(fragment) and len(candidate) > 10:
            kept.append(candidate)
    if not kept:
        return cleaned[:60]
    # Joining a single-element list adds no separator, so this covers both the
    # one-sentence and the two-sentence case.
    return "। ".join(kept[:2]) + "।"

# Guess class from keywords
def guess_class(text):
    """Guess a coarse content class from Bengali keywords found in ``text``.

    Categories are checked in a fixed order (history, philosophy, news, story,
    text_book) and the first category with a matching keyword wins. Matching is
    plain substring search on the lower-cased text.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) are converted with ``str`` instead of raising AttributeError.

    Returns:
        'history', 'philosophy', 'news', 'story' or 'text_book'; 'news' is the
        default when no keyword matches.
    """
    haystack = str(text).lower()
    # Order matters: 'রাজনীতি' (history) contains 'নীতি' (philosophy), so history
    # must be tested first to keep the original precedence.
    keyword_table = (
        ('history', ('ইতিহাস', 'রাজনীতি', 'যুদ্ধ')),
        ('philosophy', ('দর্শন', 'চিন্তা', 'নীতি')),
        ('news', ('সংবাদ', 'ঘটনা', 'খবর')),
        ('story', ('গল্প', 'উপন্যাস', 'কবিতা')),
        ('text_book', ('পাঠ্যপুস্তক', 'শিক্ষা', 'শিক্ষাবিষয়ক')),
    )
    for label, keywords in keyword_table:
        if any(keyword in haystack for keyword in keywords):
            return label
    return 'news'  # fallback default

# Apply all processing
tqdm.pandas(desc="Processing Rows")

# Every derived column is computed row by row from `content`; the list order is
# the order in which the columns are added (title, context, label, class).
for column, builder in [
    ("title", extract_title),
    ("context", extract_context),
    ("label", get_sentiment),
    ("class", guess_class),
]:
    df[column] = df["content"].progress_apply(builder)

# Final DataFrame: original columns followed by the derived ones, written without the row index
output_columns = ["type", "content", "title", "context", "label", "class"]
df_final = df[output_columns]
df_final.to_csv("preprocessed_data.csv", index=False)


Device set to use cuda:0
Processing Rows: 100%|██████████| 4350/4350 [00:00<00:00, 345959.70it/s]
Processing Rows: 100%|██████████| 4350/4350 [00:00<00:00, 210815.32it/s]
Processing Rows: 100%|██████████| 4350/4350 [00:27<00:00, 159.31it/s]
Processing Rows: 100%|██████████| 4350/4350 [00:00<00:00, 243487.15it/s]


In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
import re

# Load your data
df = pd.read_csv("cleaned_data.csv")  # Ensure 'type' and 'content' columns exist

# Sentiment model (multilingual, star-based); tokenizer and weights come from the same checkpoint
sentiment_tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Pipeline runs on the first GPU when CUDA is available, otherwise on CPU (-1)
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Bengali character check
def is_bengali(text):
    """Return True if ``text`` contains at least one character in U+0980–U+09FF.

    The argument is converted with ``str`` first, so non-string values are accepted.
    """
    bengali_char = r'[\u0980-\u09FF]'
    hit = re.search(bengali_char, str(text))
    return hit is not None

# Convert star rating to sentiment label
def convert_star_to_label(star):
    """Map a star rating to a sentiment label.

    1 or 2 -> "negative", 3 -> "neutral", any other value -> "positive".
    """
    if star in (1, 2):
        return "negative"
    return "neutral" if star == 3 else "positive"

# Sentiment extraction
def get_sentiment(text):
    """Classify ``text`` as 'positive', 'neutral' or 'negative' via ``sentiment_pipe``.

    The first 512 characters are scored; the first character of the returned
    label is parsed as the star count and mapped with ``convert_star_to_label``.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) and blank strings are reported as "neutral" without calling
            the model.

    Returns:
        "negative", "neutral" or "positive". "neutral" is also the best-effort
        fallback when the pipeline or the label parsing raises.
    """
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    try:
        # The character slice alone does not bound the token count, so also ask the
        # tokenizer to truncate; otherwise an over-long input raises and is silently
        # turned into "neutral". 512 is presumably the encoder's position limit —
        # confirm against the checkpoint config.
        result = sentiment_pipe(text[:512], truncation=True, max_length=512)[0]
        stars = int(result['label'][0])
        return convert_star_to_label(stars)
    except Exception:
        # Deliberately not a bare `except:` so Ctrl-C can still stop a long run.
        return "neutral"

# Extract title (first Bengali sentence or fallback)
def extract_title(text):
    """Pick a title: the first Bengali sentence longer than 10 characters.

    The text is converted with ``str``, stripped, and split on the danda or a
    newline. If no fragment qualifies, the first 30 characters of the stripped
    text (stripped again) are returned.
    """
    cleaned = str(text).strip()
    fragments = re.split(r'[।\n]', cleaned)
    first_hit = next(
        (frag.strip() for frag in fragments if is_bengali(frag) and len(frag.strip()) > 10),
        None,
    )
    if first_hit is not None:
        return first_hit
    return cleaned[:30].strip()

# Extract context (first 1-2 Bengali sentences or fallback)
def extract_context(text):
    """Build a context string from the first one or two Bengali sentences.

    The text is converted with ``str``, stripped, and split on the danda or a
    newline; fragments that contain Bengali and are longer than 10 characters
    after stripping are kept. The first two kept fragments are joined with
    "। " and closed with a danda. With no kept fragment, the first 60
    characters of the stripped text are returned.
    """
    cleaned = str(text).strip()
    kept = []
    for fragment in re.split(r'[।\n]', cleaned):
        candidate = fragment.strip()
        if is_bengali(fragment) and len(candidate) > 10:
            kept.append(candidate)
    if not kept:
        return cleaned[:60]
    # Joining a single-element list adds no separator, so this covers both the
    # one-sentence and the two-sentence case.
    return "। ".join(kept[:2]) + "।"

# Guess class from keywords
def guess_class(text):
    """Guess a coarse content class from Bengali keywords found in ``text``.

    Categories are checked in a fixed order (history, philosophy, news, story,
    text_book) and the first category with a matching keyword wins. Matching is
    plain substring search on the lower-cased text.

    Args:
        text: Document content. Non-string values (e.g. NaN from an empty CSV
            cell) are converted with ``str`` instead of raising AttributeError.

    Returns:
        'history', 'philosophy', 'news', 'story' or 'text_book'; 'news' is the
        default when no keyword matches.
    """
    haystack = str(text).lower()
    # Order matters: 'রাজনীতি' (history) contains 'নীতি' (philosophy), so history
    # must be tested first to keep the original precedence.
    keyword_table = (
        ('history', ('ইতিহাস', 'রাজনীতি', 'যুদ্ধ')),
        ('philosophy', ('দর্শন', 'চিন্তা', 'নীতি')),
        ('news', ('সংবাদ', 'ঘটনা', 'খবর')),
        ('story', ('গল্প', 'উপন্যাস', 'কবিতা')),
        ('text_book', ('পাঠ্যপুস্তক', 'শিক্ষা', 'শিক্ষাবিষয়ক')),
    )
    for label, keywords in keyword_table:
        if any(keyword in haystack for keyword in keywords):
            return label
    return 'news'  # fallback default

# Apply all processing
tqdm.pandas(desc="Processing Rows")

# Every derived column is computed row by row from `content`; the list order is
# the order in which the columns are added (title, context, label, class).
for column, builder in [
    ("title", extract_title),
    ("context", extract_context),
    ("label", get_sentiment),
    ("class", guess_class),
]:
    df[column] = df["content"].progress_apply(builder)

# Final DataFrame: original columns followed by the derived ones, written without the row index
output_columns = ["type", "content", "title", "context", "label", "class"]
df_final = df[output_columns]
df_final.to_csv("final_data.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0
Processing Rows: 100%|██████████| 144995/144995 [00:00<00:00, 375866.49it/s]
Processing Rows: 100%|██████████| 144995/144995 [00:00<00:00, 218077.17it/s]
Processing Rows:   0%|          | 2/144995 [00:00<2:39:41, 15.13it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Rows: 100%|██████████| 144995/144995 [15:14<00:00, 158.60it/s]
Processing Rows: 100%|██████████| 144995/144995 [00:00<00:00, 262730.75it/s]
