## Deep learning Model to detect political bias in news articles

#### Library imports

In [27]:
import os
import torch
import pandas as pd
from sklearn.utils import resample
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer,
    TrainingArguments, pipeline
)
#from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from functools import lru_cache
import numpy as np
from typing import List, Union
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import torch.nn as nn
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from parellel import parallel_combine_text
from functools import partial
from multiprocessing import Pool


#### Load Data
* Load folder and read files

In [3]:
## combine all files to single file for trainging model
folder_path = "./data_biasingmodel"
# ---------------- Config ----------------
File_path = "./data_biasingmodel/combined_data.xlsx"
LOGGING_DIR = "./data_biasingmodel/logs"
RESULTS_PATH = "./data_biasingmodel/predicted_bias_results.csv"
all_data = []

# Loop through all .xlsx files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx"):
        file_path = os.path.join(folder_path, filename)
        try:
            df = pd.read_excel(file_path)
            all_data.append(df)
        except Exception as e:
            print(f"Failed to read {filename}: {e}")

# Combine all the DataFrames
combined_df = pd.concat(all_data, ignore_index=True)

# Save to a single Excel or CSV file
combined_df.to_excel(os.path.join(folder_path, "combined_data.xlsx"), index=False)
combined_df.head()

  warn(msg)


Unnamed: 0,text,news_link,outlet,topic,type,group_id,num_sent,label_bias,label_opinion,article,biased_words,Label_bias_0-1,annotator_id,df_id
0,YouTube is making clear there will be no “birt...,https://eu.usatoday.com/story/tech/2020/02/03/...,usa-today,elections-2020,center,1.0,1.0,Biased,Somewhat factual but also opinionated,YouTube says no ‘deepfakes’ or ‘birther’ video...,"['belated', 'birtherism']",,,
1,So while there may be a humanitarian crisis dr...,https://www.alternet.org/2019/01/here-are-5-of...,alternet,immigration,left,1.0,1.0,Biased,Expresses writer’s opinion,Speaking to the country for the first time fro...,['crisis'],,,
2,"Looking around the United States, there is nev...",https://thefederalist.com/2020/03/11/woman-who...,federalist,abortion,right,1.0,1.0,Biased,Somewhat factual but also opinionated,The left has a thing for taking babies hostage...,"['killing', 'never', 'developing', 'humans', '...",,,
3,The Republican president assumed he was helpin...,http://www.msnbc.com/rachel-maddow-show/auto-i...,msnbc,environment,left,1.0,1.0,Biased,Expresses writer’s opinion,"In Barack Obama’s first term, the administrati...","['rejects', 'happy', 'assumed']",,,
4,The explosion of the Hispanic population has l...,https://www.breitbart.com/politics/2015/02/26/...,breitbart,student-debt,right,1.0,1.0,Biased,No agreement,"Republicans should stop fighting amnesty, Pres...",['explosion'],,,


In [25]:
LABEL_MAP = {
    "left": 0,
    "center": 1,
    "right": 2
}
NUM_PROCESSES = 10

In [20]:
def parallel_combine_text(row_chunk, text_cols):
    for col in text_cols:
        row_chunk[col] = row_chunk[col].fillna("")
    row_chunk["text"] = row_chunk[text_cols].agg(" ".join, axis=1).str.strip()
    return row_chunk

### Data Preprocessing and Modelling

In [28]:
# ---------------- Custom Model with Additional Self-Attention ----------------
class CustomModelWithAttention(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        # Load base transformer model
        self.base_model = AutoModel.from_pretrained(model_name)
        self.config = self.base_model.config
        self.num_labels = num_labels
        
        # Additional self-attention layer
        self.extra_attention = nn.MultiheadAttention(
            embed_dim=self.config.hidden_size,
            num_heads=8,
            dropout=0.1,
            batch_first=True
        )
        
        # Classifier
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        
        # Layer norm and dropout
        self.layer_norm = nn.LayerNorm(self.config.hidden_size)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        
        # Initialize weights
        self.init_weights()
    
    def init_weights(self):
        for name, param in self.extra_attention.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)
    
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Base model forward pass
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        sequence_output = outputs.last_hidden_state
        
        # Additional self-attention
        attn_output, _ = self.extra_attention(
            query=sequence_output,
            key=sequence_output,
            value=sequence_output,
            key_padding_mask=~attention_mask.bool() if attention_mask is not None else None
        )
        
        # Residual connection + layer norm
        sequence_output = self.layer_norm(sequence_output + attn_output)
        
        # Take [CLS] token representation for classification
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return {'loss': loss, 'logits': logits}

# ---------------- Parallel Processing Functions ----------------
def parallel_preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def parallel_combine_text(df_chunk, text_cols):
    for col in text_cols:
        df_chunk[col] = df_chunk[col].apply(parallel_preprocess_text) if col in df_chunk else ""
    df_chunk["combined_input"] = df_chunk[text_cols].fillna("").agg(" ".join, axis=1)
    return df_chunk

def parallel_tokenize(batch, tokenizer):
    return tokenizer(batch["combined_input"], padding="max_length", truncation=True, max_length=512)

    # ---------------- Load & Rebalance (Parallel) ----------------
def load_and_rebalance(df_combined):
    # Use the combined DataFrame directly
    df = df_combined[df_combined["type"].isin(LABEL_MAP.keys())].copy()
    
    print("Original distribution:")
    print(df["type"].value_counts())
    print(f"Total samples: {len(df)}")
    
    # Calculate target counts based on your dataset size
    total_samples = len(df)
    # Option 2: Custom balancing 
    df_balanced = rebalance_three_categories_fixed_length(
        df,
        bias_factor=0.35,      # Center bias (35%)
        target_type='center',   # Focus on center
        final_length=len(df['type'])      # Larger final dataset size
    )
    
    print("\nBalanced distribution:")
    print(df_balanced["type"].value_counts())
    print(f"Total balanced samples: {len(df_balanced)}")
    
    # Parallel text processing for large dataset
    text_cols = ["text", "topic", "article", "biased_words"]
    chunk_size = len(df_balanced) // NUM_PROCESSES
    
    # Process in parallel chunks
    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        chunks = [df_balanced.iloc[i:i + chunk_size] for i in range(0, len(df_balanced), chunk_size)]
        processed_chunks = list(executor.map(
            partial(parallel_combine_text, text_cols=text_cols),
            chunks
        ))
    
    df_processed = pd.concat(processed_chunks)
    df_processed["label"] = df_processed["type"].map(LABEL_MAP)
    return df_processed


# ---------------- Main Pipeline ----------------
def main():
    # Load and prepare data (parallel)
    df = load_and_rebalance(df_combined)
    
    # Rest of your pipeline remains the same...
    # Train-Test Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
    
    # Convert to Dataset objects
    train_ds = Dataset.from_pandas(train_df[["combined_input", "label"]].reset_index(drop=True))
    eval_ds = Dataset.from_pandas(eval_df[["combined_input", "label"]].reset_index(drop=True))


# ---------------- Rebalancing Function ----------------
def rebalance_three_categories_fixed_length(df, bias_factor=0.35, target_type='center', final_length=None):
    if final_length is None:
        final_length = len(df)

    # Separate target and other types
    df_target = df[df['type'] == target_type]
    df_other = df[df['type'] != target_type]

    # Compute desired target count
    n_target = int(final_length * bias_factor)
    n_other_total = final_length - n_target
    n_per_other = n_other_total // (df['type'].nunique() - 1)

    # Resample the target type
    if len(df_target) >= n_target:
        df_target_balanced = resample(df_target, n_samples=n_target, replace=False, random_state=42)
    else:
        df_target_balanced = resample(df_target, n_samples=n_target, replace=True, random_state=42)

    # Resample each other class
    other_types = [t for t in df['type'].unique() if t != target_type]
    df_others_balanced = []
    
    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for t in other_types:
            df_t = df[df['type'] == t]
            futures.append(executor.submit(
                resample, df_t, 
                n_samples=n_per_other, 
                replace=len(df_t) < n_per_other,
                random_state=42
            ))
        
        for future in futures:
            df_others_balanced.append(future.result())

    # Combine all and shuffle
    df_final = pd.concat([df_target_balanced] + df_others_balanced)
    df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
    return df_final

# ---------------- Load & Rebalance (Parallel) ----------------
def load_and_rebalance(file_path):
    # Parallel CSV reading
    df = pd.read_excel(file_path, engine='openpyxl')
    df = df[df["type"].isin(LABEL_MAP.keys())]
    
    print("Original distribution:")
    print(df["type"].value_counts())
    
    # Parallel rebalancing
    df_balanced = rebalance_three_categories_fixed_length(
        df,
        bias_factor=0.35,
        target_type='center',
        final_length=len(df['type'])
    )
    
    print("\nBalanced distribution:")
    print(df_balanced["type"].value_counts())
    
    # Parallel text processing
    text_cols = ["text", "topic", "article", "biased_words"]
    chunk_size = len(df_balanced) // NUM_PROCESSES
    chunks = [df_balanced.iloc[i:i + chunk_size] for i in range(0, len(df_balanced), chunk_size)]
    
    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        processed_chunks = list(executor.map(
            partial(parallel_combine_text, text_cols=text_cols),
            chunks
        ))
    
    df_processed = pd.concat(processed_chunks)
    df_processed["label"] = df_processed["type"].map(LABEL_MAP)
    return df_processed

# ---------------- Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline ----------------
def main():
    # Load and prepare data (parallel)
    df = load_and_rebalance(file_path)
    
    # Train-Test Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
    
    # Convert to Dataset objects
    train_ds = Dataset.from_pandas(train_df[["combined_input", "label"]].reset_index(drop=True))
    eval_ds = Dataset.from_pandas(eval_df[["combined_input", "label"]].reset_index(drop=True))

    # Load Tokenizer and Custom Model
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = CustomModelWithAttention(MODEL_NAME, num_labels=3)
    except:
        print(f"Model {MODEL_NAME} not found, using fallback model")
        fallback = "facebook/roberta-hate-speech-dynabench-r4-target"
        tokenizer = AutoTokenizer.from_pretrained(fallback)
        model = CustomModelWithAttention(fallback, num_labels=3)

    # Parallel tokenization
    tokenize_fn = partial(parallel_tokenize, tokenizer=tokenizer)
    train_ds = train_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    eval_ds = eval_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Training arguments with multiprocessing
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=500,
        evaluation_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        remove_unused_columns=False,
        dataloader_num_workers=NUM_PROCESSES,  # Parallel data loading
        fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU available
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=default_data_collator,
    )

    # Train and save
    trainer.train()
    model.save_pretrained(folder_path)
    tokenizer.save_pretrained(folder_path)

    # Parallel prediction
    full_ds = Dataset.from_pandas(df[["combined_input"]].reset_index(drop=True))
    full_ds = full_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    full_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
    
    preds = trainer.predict(full_ds)
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in torch.argmax(torch.tensor(preds.predictions), dim=1).numpy()]
    df.to_csv(RESULTS_PATH, index=False)

if __name__ == "__main__":
    # Set multiprocessing start method for Unix systems
    pool = Pool()
    # # your code here
    # multiprocessing.set_start_method('spawn', force=True)
    main()

Original distribution:
type
right     4965
left      4953
center    3464
Name: count, dtype: int64

Balanced distribution:
type
center    4683
left      4349
right     4349
Name: count, dtype: int64


Process SpawnProcess-39:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.11/concurrent/futures/process.py", line 249, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'parallel_combine_text' on <module '__main__' (built-in)>
Process SpawnProcess-40:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target

OSError: handle is closed

In [None]:
# ---------------- Config ----------------

MODEL_NAME = "launch/POLITICS"  # fallback model will be handled
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}

# ---------------- Load & Rebalance ----------------
def load_and_rebalance(data):
    df = data
    df = df[df["type"].isin(LABEL_MAP.keys())]

    # Rebalance to equal left, center, right
    min_count = min(df["type"].value_counts()[label] for label in LABEL_MAP)
    df_list = [resample(df[df["type"] == label], n_samples=min_count, random_state=42) for label in LABEL_MAP]
    return pd.concat(df_list).sample(frac=1, random_state=42).reset_index(drop=True)

# ---------------- Preprocessing ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text_columns(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

In [None]:


# ---------------- Tokenization ----------------
def tokenize_dataset(dataset, tokenizer):
    return dataset.map(lambda x: tokenizer(x["combined_input"], padding="max_length", truncation=True, max_length=512), batched=True)

# ---------------- Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline ----------------
def main():
    df = load_and_rebalance(FILE_PATH)
    df = combine_text_columns(df, ["text", "topic", "article", "biased_words"])
    df["label"] = df["type"].map(LABEL_MAP)

    # Train-Test Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
    train_ds = Dataset.from_pandas(train_df[["combined_input", "label"]].reset_index(drop=True))
    eval_ds = Dataset.from_pandas(eval_df[["combined_input", "label"]].reset_index(drop=True))

    # Load Model & Tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    except:
        fallback = "facebook/roberta-hate-speech-dynabench-r4-target"
        tokenizer = AutoTokenizer.from_pretrained(fallback)
        model = AutoModelForSequenceClassification.from_pretrained(fallback, num_labels=3)

    train_ds = tokenize_dataset(train_ds, tokenizer)
    eval_ds = tokenize_dataset(eval_ds, tokenizer)
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        remove_unused_columns=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # Predict on full dataset
    full_ds = Dataset.from_pandas(df[["combined_input"]].reset_index(drop=True))
    full_ds = tokenize_dataset(full_ds, tokenizer)
    full_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
    preds = trainer.predict(full_ds)
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in torch.argmax(torch.tensor(preds.predictions), dim=1).numpy()]
    df.to_csv(RESULTS_PATH, index=False)

if __name__ == "__main__":
    main()


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from datasets import Dataset, load_dataset
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer,
    TrainingArguments, default_data_collator
)
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from functools import partial

# ---------------- Config ----------------
FILE_PATH = "/content/drive/MyDrive/internship_vaali_infotech/final_labels_MBIC.xlsx"
MODEL_NAME = "launch/POLITICS"  # fallback model will be handled
OUTPUT_DIR = "/content/drive/MyDrive/internship_vaali_infotech/finetuned_politics_3class"
LOGGING_DIR = "/content/drive/MyDrive/internship_vaali_infotech/logs"
RESULTS_PATH = "/content/drive/MyDrive/internship_vaali_infotech/predicted_bias_results.csv"
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
NUM_PROCESSES = multiprocessing.cpu_count()  # Use all available cores

# ---------------- Custom Model with Additional Self-Attention ----------------
class CustomModelWithAttention(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        # Load base transformer model
        self.base_model = AutoModel.from_pretrained(model_name)
        self.config = self.base_model.config
        self.num_labels = num_labels
        
        # Additional self-attention layer
        self.extra_attention = nn.MultiheadAttention(
            embed_dim=self.config.hidden_size,
            num_heads=8,
            dropout=0.1,
            batch_first=True
        )
        
        # Classifier
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        
        # Layer norm and dropout
        self.layer_norm = nn.LayerNorm(self.config.hidden_size)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        
        # Initialize weights
        self.init_weights()
    
    def init_weights(self):
        for name, param in self.extra_attention.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)
    
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Base model forward pass
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        sequence_output = outputs.last_hidden_state
        
        # Additional self-attention
        attn_output, _ = self.extra_attention(
            query=sequence_output,
            key=sequence_output,
            value=sequence_output,
            key_padding_mask=~attention_mask.bool() if attention_mask is not None else None
        )
        
        # Residual connection + layer norm
        sequence_output = self.layer_norm(sequence_output + attn_output)
        
        # Take [CLS] token representation for classification
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return {'loss': loss, 'logits': logits}

# ---------------- Parallel Processing Functions ----------------
def parallel_preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def parallel_combine_text(df_chunk, text_cols):
    for col in text_cols:
        df_chunk[col] = df_chunk[col].apply(parallel_preprocess_text) if col in df_chunk else ""
    df_chunk["combined_input"] = df_chunk[text_cols].fillna("").agg(" ".join, axis=1)
    return df_chunk

def parallel_tokenize(batch, tokenizer):
    return tokenizer(batch["combined_input"], padding="max_length", truncation=True, max_length=512)

# ---------------- Rebalancing Function ----------------
def rebalance_three_categories_fixed_length(df, bias_factor=0.35, target_type='center', final_length=None):
    if final_length is None:
        final_length = len(df)

    # Separate target and other types
    df_target = df[df['type'] == target_type]
    df_other = df[df['type'] != target_type]

    # Compute desired target count
    n_target = int(final_length * bias_factor)
    n_other_total = final_length - n_target
    n_per_other = n_other_total // (df['type'].nunique() - 1)

    # Resample the target type
    if len(df_target) >= n_target:
        df_target_balanced = resample(df_target, n_samples=n_target, replace=False, random_state=42)
    else:
        df_target_balanced = resample(df_target, n_samples=n_target, replace=True, random_state=42)

    # Resample each other class
    other_types = [t for t in df['type'].unique() if t != target_type]
    df_others_balanced = []
    
    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for t in other_types:
            df_t = df[df['type'] == t]
            futures.append(executor.submit(
                resample, df_t, 
                n_samples=n_per_other, 
                replace=len(df_t) < n_per_other,
                random_state=42
            ))
        
        for future in futures:
            df_others_balanced.append(future.result())

    # Combine all and shuffle
    df_final = pd.concat([df_target_balanced] + df_others_balanced)
    df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
    return df_final

# ---------------- Load & Rebalance (Parallel) ----------------
def load_and_rebalance(file_path):
    # Parallel CSV reading
    df = pd.read_excel(file_path, engine='openpyxl')
    df = df[df["type"].isin(LABEL_MAP.keys())]
    
    print("Original distribution:")
    print(df["type"].value_counts())
    
    # Parallel rebalancing
    df_balanced = rebalance_three_categories_fixed_length(
        df,
        bias_factor=0.35,
        target_type='center',
        final_length=1700
    )
    
    print("\nBalanced distribution:")
    print(df_balanced["type"].value_counts())
    
    # Parallel text processing
    text_cols = ["text", "topic", "article", "biased_words"]
    chunk_size = len(df_balanced) // NUM_PROCESSES
    chunks = [df_balanced.iloc[i:i + chunk_size] for i in range(0, len(df_balanced), chunk_size)]
    
    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        processed_chunks = list(executor.map(
            partial(parallel_combine_text, text_cols=text_cols),
            chunks
        ))
    
    df_processed = pd.concat(processed_chunks)
    df_processed["label"] = df_processed["type"].map(LABEL_MAP)
    return df_processed

# ---------------- Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline ----------------
def main():
    # Load and prepare data (parallel)
    df = load_and_rebalance(FILE_PATH)
    
    # Train-Test Split
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
    
    # Convert to Dataset objects
    train_ds = Dataset.from_pandas(train_df[["combined_input", "label"]].reset_index(drop=True))
    eval_ds = Dataset.from_pandas(eval_df[["combined_input", "label"]].reset_index(drop=True))

    # Load Tokenizer and Custom Model
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = CustomModelWithAttention(MODEL_NAME, num_labels=3)
    except:
        print(f"Model {MODEL_NAME} not found, using fallback model")
        fallback = "facebook/roberta-hate-speech-dynabench-r4-target"
        tokenizer = AutoTokenizer.from_pretrained(fallback)
        model = CustomModelWithAttention(fallback, num_labels=3)

    # Parallel tokenization
    tokenize_fn = partial(parallel_tokenize, tokenizer=tokenizer)
    train_ds = train_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    eval_ds = eval_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Training arguments with multiprocessing
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=500,
        evaluation_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        remove_unused_columns=False,
        dataloader_num_workers=NUM_PROCESSES,  # Parallel data loading
        fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU available
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=default_data_collator,
    )

    # Train and save
    trainer.train()
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # Parallel prediction
    full_ds = Dataset.from_pandas(df[["combined_input"]].reset_index(drop=True))
    full_ds = full_ds.map(tokenize_fn, batched=True, batch_size=1000, num_proc=NUM_PROCESSES)
    full_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
    
    preds = trainer.predict(full_ds)
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in torch.argmax(torch.tensor(preds.predictions), dim=1).numpy()]
    df.to_csv(RESULTS_PATH, index=False)

if __name__ == "__main__":
    # Set multiprocessing start method for Unix systems
    multiprocessing.set_start_method('spawn', force=True)
    main()