In [6]:
# Mount Google Drive at /content/drive; the data, cache and output paths used by
# the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os
import sys
import logging
import numpy as np
import pandas as pd
import torch
from functools import lru_cache
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset

In [5]:
# NOTE(review): this cell carries execution count In [5] but sits below In [6]/In [7],
# so it was run out of order. It removes torch, which the import cell needs, and no
# reinstall cell is visible in this notebook — confirm it is still wanted.
# Without `-y`, pip stops for the interactive "Proceed (Y/n)?" prompt shown in the output.
!pip uninstall torch torchvision torchaudio

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Would remove:
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.11/dist-packages/functorch/*
    /usr/local/lib/python3.11/dist-packages/torch-2.6.0+cu124.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torch/*
    /usr/local/lib/python3.11/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/torchvision-0.21.0+cu124.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libcudart.41118559.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libjpeg.1c1c4b09.so.8
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libnvjpeg.02b6d700.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libpng16.0364a1db.so.16
    /usr/local/l

In [8]:
# Set up logging: INFO level, timestamped "<time> - <level> - <message>" records
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
MODEL_NAME = "launch/politics"  # Original specialized political bias model
# Class name -> integer id used as the training label, and the inverse for decoding predictions
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
# Trainer checkpoints and the final model/tokenizer are written to OUTPUT_DIR
OUTPUT_DIR = "/content/drive/MyDrive/soumya/results"
LOGGING_DIR = "/content/drive/MyDrive/soumya/logs"
RESULTS_PATH = "/content/drive/MyDrive/soumya/results/predictions.csv"
CACHE_DIR = "/content/drive/MyDrive/soumya/results/cache"  # Add cache directory to prevent redownloading
# NOTE(review): this path says "result" while every other path says "results" — confirm that is intended.
DATASET_CACHE_DIR = "/content/drive/MyDrive/soumya/result/dataset_cache"  # Cache processed datasets

# Ensure directories exist (exist_ok=True makes re-running the cell harmless)
for directory in [OUTPUT_DIR, LOGGING_DIR, CACHE_DIR, DATASET_CACHE_DIR]:
    os.makedirs(directory, exist_ok=True)

# Set device and optimize memory usage: use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Optimize memory usage based on available hardware
def optimize_memory():
    """Configure memory optimizations based on available hardware.

    Returns:
        dict with the keys ``batch_size``, ``eval_batch_size``, ``num_workers``,
        ``gradient_accumulation_steps`` and ``mixed_precision``. The defaults are
        used on CPU; with CUDA the batch settings are scaled by total GPU memory
        and mixed precision is switched on.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # treat that as a single core so no worker processes are requested.
    cpu_total = os.cpu_count() or 1

    config = {
        "batch_size": 8,
        "eval_batch_size": 16,
        # Leave one core for the main process, and never use more than 4 workers
        "num_workers": min(cpu_total - 1, 4) if cpu_total > 1 else 0,
        "gradient_accumulation_steps": 2,
        "mixed_precision": False
    }

    if torch.cuda.is_available():
        # Print GPU info
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"GPU Memory: {total_memory:.2f} GB")

        # Empty GPU cache
        torch.cuda.empty_cache()

        # Adjust settings based on available GPU memory (thresholds in GB)
        if total_memory > 10:  # High-end GPU
            config["batch_size"] = 16
            config["eval_batch_size"] = 32
            config["gradient_accumulation_steps"] = 1
        elif total_memory < 4:  # Low memory GPU
            config["batch_size"] = 4
            config["eval_batch_size"] = 8
            config["gradient_accumulation_steps"] = 4

        config["mixed_precision"] = True

    return config

memory_config = optimize_memory()

# ---------------- Text Processing Functions ----------------
@lru_cache(maxsize=1024, typed=True)  # typed=True: equal values of different types (1.0 vs True) get separate entries
def _preprocess_scalar(text):
    """Lower-case and strip one hashable value; None/NaN/NA become the empty string."""
    if text is None or pd.isna(text):
        return ""
    return str(text).lower().strip()


def preprocess_text(text):
    """Clean and preprocess text.

    Args:
        text: a single value (str, number, None, NaN, ...) or a list/tuple of values.

    Returns:
        str: the lower-cased, stripped text. Missing values give "". For a
        list/tuple, the str items are cleaned and joined with single spaces and
        non-str items are dropped.
    """
    # Sequences must be handled before the cache: lru_cache hashes its argument,
    # and lists are unhashable, so they can never reach a cached function body.
    if isinstance(text, (list, tuple)):
        return ' '.join(str(t).lower().strip() for t in text if isinstance(t, str))
    return _preprocess_scalar(text)


# Keep the lru_cache introspection helpers reachable under the public name.
preprocess_text.cache_info = _preprocess_scalar.cache_info
preprocess_text.cache_clear = _preprocess_scalar.cache_clear

def combine_text(df, text_cols):
    """Combine multiple text columns into one space-separated Series.

    Args:
        df: DataFrame holding some or all of the text columns.
        text_cols: column names in concatenation order. A name that is not a
            column of ``df`` contributes an empty string for every row.

    Returns:
        pd.Series of str that shares ``df.index``, so it can be assigned straight
        back onto ``df`` even after rows have been filtered out.
    """
    def _blank_column():
        # Placeholders must carry df's own index: a default RangeIndex would not
        # align with a filtered frame when used in the boolean masks below.
        return pd.Series([""] * len(df), index=df.index, dtype=object)

    if not text_cols:
        return _blank_column()

    # Process each column once
    processed_cols = {}
    for col in text_cols:
        if col in df.columns:
            processed_cols[col] = df[col].apply(preprocess_text)
        else:
            processed_cols[col] = _blank_column()

    # Start from the first column and append the others in order
    combined_series = processed_cols[text_cols[0]].copy()
    for col in text_cols[1:]:
        # Only concatenate non-empty strings, so no trailing separator is added
        mask = processed_cols[col] != ""
        combined_series[mask] = combined_series[mask] + " " + processed_cols[col][mask]

    return combined_series

def create_data_from_batch(batch_df, tokenizer, max_length=128):
    """Turn a batch DataFrame into the tensors the model consumes.

    Args:
        batch_df: DataFrame with a ``combined_input`` column and, optionally, a
            ``label`` column.
        tokenizer: callable invoked with the list of texts plus the padding,
            truncation, max_length and return_tensors keyword arguments.
        max_length: value forwarded to the tokenizer as ``max_length``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` taken from the tokenizer
        output, and ``labels`` (a tensor, or None when there is no label column).
    """
    # padding='max_length' asks for fixed-size rows, not the longest row in the batch
    tokenized = tokenizer(
        batch_df["combined_input"].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Labels are optional so the same function serves unlabeled prediction batches
    if "label" in batch_df:
        label_tensor = torch.tensor(batch_df["label"].tolist())
    else:
        label_tensor = None

    features = {}
    features["input_ids"] = tokenized["input_ids"]
    features["attention_mask"] = tokenized["attention_mask"]
    features["labels"] = label_tensor
    return features

# ---------------- Load and Process Data ----------------
def load_data(file_path):
    """Load and perform initial data processing.

    Args:
        file_path: path to a ``.csv`` or ``.xlsx`` file that has a ``type``
            column; the optional text columns are ``text``, ``topic``,
            ``article`` and ``biased_words``.

    Returns:
        DataFrame limited to rows whose ``type`` is a key of LABEL_MAP, with two
        added columns: ``combined_input`` (the merged text) and ``label`` (the
        integer class id).

    Exits the process via ``sys.exit(1)`` when the file cannot be read or has an
    unsupported extension.
    """
    cache_file = os.path.join(DATASET_CACHE_DIR, f"{os.path.basename(file_path)}.processed.pkl")

    # Check if processed data exists in cache.
    # NOTE(review): the cache key is only the file's basename, so the cache is
    # reused even if the source file or the preprocessing code has changed —
    # delete the pickle to force reprocessing. Unpickling is only safe because
    # this function wrote the file itself.
    if os.path.exists(cache_file):
        logger.info(f"Loading processed data from cache: {cache_file}")
        return pd.read_pickle(cache_file)

    logger.info("Loading and processing data")
    try:
        if file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path, engine='openpyxl')
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        sys.exit(1)

    # Keep only rows with a known class. reset_index(drop=True) yields a fresh
    # frame with a contiguous index, so the column assignments below neither
    # write into a filtered view (SettingWithCopyWarning) nor meet index gaps.
    df = df[df["type"].isin(LABEL_MAP.keys())].reset_index(drop=True)

    logger.info("Class distribution:")
    logger.info(df["type"].value_counts())

    # Merge the text columns into one model input and map class names to ids
    text_cols = ["text", "topic", "article", "biased_words"]
    df["combined_input"] = combine_text(df, text_cols)
    df["label"] = df["type"].map(LABEL_MAP)

    # Cache the processed data
    df.to_pickle(cache_file)
    logger.info(f"Processed data cached to: {cache_file}")

    return df

# ---------------- Dataset Classes ----------------
class PoliticalBiasDataset(torch.utils.data.Dataset):
    """Map-style dataset over a DataFrame of raw text.

    Items are plain dicts of untokenized values; tokenization happens per batch
    in ``collate_fn``.
    """
    def __init__(self, df, tokenizer, max_length=128):
        # Labels are optional: a frame without a "label" column yields text-only items
        self.has_labels = "label" in df.columns
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # idx is positional (iloc), independent of the frame's index labels
        row = self.df.iloc[idx]
        if not self.has_labels:
            return {"combined_input": row["combined_input"]}
        return {"combined_input": row["combined_input"], "label": row["label"]}

    def collate_fn(self, batch):
        """Assemble a list of item dicts into one tokenized batch."""
        frame = pd.DataFrame(batch)
        return create_data_from_batch(frame, self.tokenizer, self.max_length)

# ---------------- Metrics ----------------
def compute_metrics(eval_pred):
    """Compute evaluation metrics.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        dict with weighted ``accuracy``, ``f1``, ``precision`` and ``recall``,
        plus one ``f1_<class>`` entry per class in LABEL_MAP.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # Pair class names with their ids explicitly. Without `labels=`,
    # classification_report infers the classes from the data and raises when an
    # evaluation set lacks one of them (target_names length mismatch).
    class_names = sorted(LABEL_MAP, key=LABEL_MAP.get)
    class_ids = [LABEL_MAP[name] for name in class_names]

    # zero_division=0 keeps the default numeric result but silences the warning
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # Detailed report
    class_report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    metrics = {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

    # Add class-specific metrics
    for cls in class_names:
        metrics[f"f1_{cls}"] = class_report[cls]['f1-score']

    return metrics

# ---------------- Main Pipeline ----------------
def main(file_path):
    """Main training and evaluation pipeline.

    Loads the data, makes a stratified train/validation/test split, fine-tunes
    MODEL_NAME with the Hugging Face Trainer, saves the model and tokenizer to
    OUTPUT_DIR, evaluates on the test split, and writes a prediction for every
    row of the loaded frame to RESULTS_PATH.

    Args:
        file_path: path handed to load_data().

    Returns:
        The metrics dict returned by trainer.evaluate() on the test split.
    """
    logger.info("Starting main pipeline")

    # Load and prepare data
    df = load_data(file_path)

    # Train-Test Split: 70% train, then the remaining 30% is halved into
    # validation and test (15% each); both splits are stratified on the label.
    logger.info("Splitting data")
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["label"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

    logger.info(f"Train set: {len(train_df)} samples")
    logger.info(f"Validation set: {len(val_df)} samples")
    logger.info(f"Test set: {len(test_df)} samples")

    # Load Tokenizer and Model
    logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        # The captured run output reports the classifier head weights as newly
        # initialized, i.e. the classification layer is trained from scratch here.
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_MAP),
            cache_dir=CACHE_DIR,
            ignore_mismatched_sizes=True
        )
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as e:
        # NOTE(review): any failure above (including transient download errors)
        # silently swaps in a different base model, so results from such a run
        # are not comparable to a MODEL_NAME run — confirm this fallback is wanted.
        logger.error(f"Error loading model {MODEL_NAME}: {e}")
        logger.info("Attempting to use fallback model")
        fallback_model = "distilbert-base-uncased"
        tokenizer = AutoTokenizer.from_pretrained(fallback_model, cache_dir=CACHE_DIR)
        model = AutoModelForSequenceClassification.from_pretrained(
            fallback_model,
            num_labels=len(LABEL_MAP),
            cache_dir=CACHE_DIR
        )
        model.config.pad_token_id = tokenizer.pad_token_id
        logger.info(f"Fallback model {fallback_model} loaded successfully")

    # Move model to device
    model.to(device)

    # Create custom datasets for memory efficiency (all use the default max_length of 128)
    train_dataset = PoliticalBiasDataset(train_df, tokenizer)
    val_dataset = PoliticalBiasDataset(val_df, tokenizer)
    test_dataset = PoliticalBiasDataset(test_df, tokenizer)

    # Training Arguments
    logger.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        # Evaluate and checkpoint on the same 100-step cadence
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        per_device_train_batch_size=memory_config["batch_size"],
        per_device_eval_batch_size=memory_config["eval_batch_size"],
        gradient_accumulation_steps=memory_config["gradient_accumulation_steps"],
        num_train_epochs=5,
        learning_rate=2e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        # Best checkpoint is selected by the "f1" value reported by compute_metrics
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to="none",
        fp16=memory_config["mixed_precision"],
        dataloader_num_workers=memory_config["num_workers"],
        remove_unused_columns=False,  # We handle column selection in our dataset
        disable_tqdm=False  # Show progress bar
    )

    # Custom data collator using our dataset's collate function.
    # train_dataset's collate_fn is used for every split; that is equivalent here
    # because all three datasets share the same tokenizer and max_length.
    def collate_fn(batch):
        return train_dataset.collate_fn(batch)

    # Initialize Trainer
    logger.info("Initializing Trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=collate_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train
    logger.info("Starting training")
    trainer.train()

    # Save model and tokenizer (into the same directory that holds the checkpoints)
    logger.info("Saving model and tokenizer")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info(f"Model and tokenizer saved to {OUTPUT_DIR}")

    # Evaluate on test set
    logger.info("Evaluating on test set")
    test_results = trainer.evaluate(test_dataset)
    logger.info(f"Test results: {test_results}")

    # Make predictions in batches for memory efficiency.
    # NOTE(review): this scores every row of df, including the rows the model was
    # trained on, so the saved predictions are not a held-out estimate.
    logger.info("Making predictions on all data")
    full_dataset = PoliticalBiasDataset(df[["combined_input"]], tokenizer)

    # Create a DataLoader for batch processing predictions; shuffle=False keeps
    # the prediction order identical to the row order of df.
    prediction_dataloader = torch.utils.data.DataLoader(
        full_dataset,
        batch_size=memory_config["eval_batch_size"],
        collate_fn=full_dataset.collate_fn,
        num_workers=memory_config["num_workers"],
        shuffle=False
    )

    # Make predictions in batches
    all_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(prediction_dataloader, desc="Predicting"):
            # Only the two input tensors are used; batch["labels"] is None for this dataset
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_predictions.extend(batch_preds)

    # Save predictions (adds a column to df in place, then writes the whole frame)
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in all_predictions]
    df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Predictions saved to {RESULTS_PATH}")

    # Clean up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return test_results

# Entry point: run the full pipeline on the CSV at the path below and print the
# test-set metrics that main() returns.
if __name__ == "__main__":
    input_file_path = "/content/drive/MyDrive/soumya/complete_balanced_data.csv"
    results = main(input_file_path)
    print(f"Final results: {results}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at launch/politics and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Left,F1 Center,F1 Right
100,1.0435,0.924291,0.572584,0.548423,0.599522,0.572584,0.384361,0.699527,0.561381
200,0.7832,0.713777,0.689655,0.684144,0.701784,0.689655,0.56483,0.801613,0.685988
300,0.614,0.51983,0.797361,0.797069,0.797081,0.797361,0.754173,0.870588,0.766445
400,0.5426,0.451299,0.832354,0.828858,0.842737,0.832354,0.781986,0.86802,0.83657
500,0.42,0.354838,0.873904,0.871937,0.880865,0.873904,0.836094,0.908455,0.871263
600,0.3315,0.275846,0.907876,0.907486,0.908493,0.907876,0.889524,0.928124,0.904812
700,0.3589,0.278915,0.907961,0.906955,0.914575,0.907961,0.874141,0.94931,0.897415
800,0.2449,0.215478,0.934951,0.934743,0.935652,0.934951,0.928516,0.94668,0.929034
900,0.2282,0.151189,0.955981,0.955877,0.956363,0.955981,0.948852,0.96338,0.955399
1000,0.2435,0.145883,0.962112,0.962121,0.962167,0.962112,0.957074,0.967791,0.961499


Predicting: 100%|██████████| 2447/2447 [02:09<00:00, 18.95it/s]


Final results: {'eval_loss': 0.05405449494719505, 'eval_accuracy': 0.9891868880374628, 'eval_f1': 0.9891894577044392, 'eval_precision': 0.9892285941578098, 'eval_recall': 0.9891868880374628, 'eval_f1_left': 0.9905346635968278, 'eval_f1_center': 0.9880650076180802, 'eval_f1_right': 0.9889687018984095, 'eval_runtime': 18.7003, 'eval_samples_per_second': 628.065, 'eval_steps_per_second': 19.679, 'epoch': 0.5545826036193812}


### Test the model

In [9]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

In [11]:

model_path = "/content/drive/MyDrive/soumya/results"
# Load the fine-tuned model and tokenizer that the training cell saved to this directory.
tokenizer = AutoTokenizer.from_pretrained(model_path,local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# top_k=None returns the score of every class; it replaces the deprecated
# return_all_scores=True and yields the same nested result for list inputs.
test_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None)

samples = [
["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
The time for delay is over. The time to act is now."""
],
["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit.
"""],
["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington.
"""],
["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
 ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country.
"""]
]

# Class ids of the fine-tuned head mapped back to readable names (same ids as the
# LABEL_MAP used for training). Built once instead of on every iteration.
label_map = {
0: "left", 1: "center", 2: "right"}

for text in samples:
    # Each sample is a one-element list, so output[0] is the list of per-class
    # scores for that single text.
    # NOTE(review): training tokenized with max_length=128 but inference allows 512 — confirm this is intended.
    output = test_pipeline(text, truncation=True, max_length=512)
    sorted_output = sorted(output[0], key=lambda x: x["score"], reverse=True)
    top_label = sorted_output[0]

    # Extract the numeric part of the label like 'LABEL_1' -> 1
    label_index = int(top_label['label'].split('_')[-1])
    readable_label = label_map[label_index]

    # Print the text itself rather than the repr of its one-element list wrapper
    display_text = text[0] if isinstance(text, list) else text
    print(f"\nText: {display_text}")
    print(f"Predicted Bias: {readable_label} \n confidence score:({top_label['score']:.4f})")

Device set to use cuda:0



Text: ['As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.\nIn 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.\nEven more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly b

In [None]:
import logging
import os
import sys
import re
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from functools import lru_cache
from nltk.stem import WordNetLemmatizer

# Set up logging: INFO level, timestamped "<time> - <level> - <message>" records
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
# NOTE(review): this cell redefines the names set up by the earlier training cell
# and writes to the same output and cache directories.
MODEL_NAME = "launch/politics"  # POLITICS model
# Class name -> integer id used as the training label, and the inverse for decoding predictions
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/soumya/results"
LOGGING_DIR = "/content/drive/MyDrive/soumya/logs"
RESULTS_PATH = "/content/drive/MyDrive/soumya/results/predictions.csv"
CACHE_DIR = "/content/drive/MyDrive/soumya/results/cache"
# NOTE(review): this path says "result" while every other path says "results" — confirm that is intended.
DATASET_CACHE_DIR = "/content/drive/MyDrive/soumya/result/dataset_cache"
# Default token length used by the tokenization helper and the dataset class below
MAX_LENGTH = 128

# Ensure directories exist (exist_ok=True makes re-running the cell harmless)
for directory in [OUTPUT_DIR, LOGGING_DIR, CACHE_DIR, DATASET_CACHE_DIR]:
    os.makedirs(directory, exist_ok=True)

# Set device: use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Optimize memory usage
def optimize_memory():
    """Choose batch sizes and loader settings for the hardware that is present.

    Returns:
        dict with the keys ``batch_size``, ``eval_batch_size``, ``num_workers``,
        ``gradient_accumulation_steps`` and ``mixed_precision``. The defaults are
        used on CPU; with CUDA the batch settings are scaled by total GPU memory
        and mixed precision is switched on.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # treat that as a single core so no worker processes are requested.
    cpu_total = os.cpu_count() or 1
    config = {
        "batch_size": 8,
        "eval_batch_size": 16,
        # Leave one core for the main process, and never use more than 4 workers
        "num_workers": min(cpu_total - 1, 4) if cpu_total > 1 else 0,
        "gradient_accumulation_steps": 2,
        "mixed_precision": False
    }
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"GPU Memory: {total_memory:.2f} GB")
        torch.cuda.empty_cache()
        # Thresholds are in GB of total device memory
        if total_memory > 10:
            config["batch_size"] = 16
            config["eval_batch_size"] = 32
            config["gradient_accumulation_steps"] = 1
        elif total_memory < 4:
            config["batch_size"] = 4
            config["eval_batch_size"] = 8
            config["gradient_accumulation_steps"] = 4
        config["mixed_precision"] = True
    return config

memory_config = optimize_memory()

# Text Processing Functions
# Shared lemmatizer instance used by preprocess_text.
# NOTE(review): WordNetLemmatizer needs the NLTK "wordnet" corpus, and no
# nltk.download(...) call is visible in this notebook — confirm the corpus is
# available before running.
lemmatizer = WordNetLemmatizer()

@lru_cache(maxsize=1024)
def preprocess_text(text):
    """Normalize one text value for the model.

    Missing values (None/NaN) give "". Otherwise the value is converted to str,
    lower-cased and stripped, URLs and punctuation are removed, and each
    remaining whitespace-separated word is lemmatized before the words are
    re-joined with single spaces.
    """
    if text is None or pd.isna(text):
        return ""
    cleaned = str(text).lower().strip()
    # Remove URLs
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    # Remove punctuation (anything that is neither a word character nor whitespace)
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    return ' '.join(lemmatizer.lemmatize(token) for token in cleaned.split())

def combine_text(df, text_cols):
    """Combine multiple text columns into one space-separated Series.

    Args:
        df: DataFrame holding some or all of the text columns.
        text_cols: column names in concatenation order. A name that is not a
            column of ``df`` contributes an empty string for every row.

    Returns:
        pd.Series of str that shares ``df.index``.

    Exits the process via ``sys.exit(1)`` when none of ``text_cols`` is a
    column of ``df``.
    """
    available_cols = [col for col in text_cols if col in df.columns]
    if not available_cols:
        logger.error("No valid text columns found")
        sys.exit(1)

    processed_cols = {}
    for col in text_cols:
        if col in df.columns:
            processed_cols[col] = df[col].apply(preprocess_text)
        else:
            # The placeholder must carry df's own index: a default RangeIndex
            # would not align with a filtered frame in the boolean masks below.
            processed_cols[col] = pd.Series([""] * len(df), index=df.index, dtype=object)

    combined_series = processed_cols[text_cols[0]].copy()
    for col in text_cols[1:]:
        # Append only where this column has content, so no trailing separator is added
        mask = processed_cols[col] != ""
        combined_series[mask] = combined_series[mask] + " " + processed_cols[col][mask]
    return combined_series

def create_data_from_batch(batch_df, tokenizer, max_length=MAX_LENGTH):
    """Turn a batch DataFrame into the tensors the model consumes.

    Args:
        batch_df: DataFrame with a ``combined_input`` column and, optionally, a
            ``label`` column.
        tokenizer: callable invoked with the list of texts plus the padding,
            truncation, max_length and return_tensors keyword arguments.
        max_length: value forwarded to the tokenizer as ``max_length``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` taken from the tokenizer
        output, and ``labels`` (a tensor, or None when there is no label column).
    """
    # padding='max_length' asks for fixed-size rows, not the longest row in the batch
    tokenized = tokenizer(
        batch_df["combined_input"].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Labels are optional so the same function serves unlabeled prediction batches
    if "label" in batch_df:
        label_tensor = torch.tensor(batch_df["label"].tolist())
    else:
        label_tensor = None

    features = {}
    features["input_ids"] = tokenized["input_ids"]
    features["attention_mask"] = tokenized["attention_mask"]
    features["labels"] = label_tensor
    return features

# Load and Process Data
def load_data(file_path):
    """Load the labelled data, build model inputs, and cache the processed frame.

    Args:
        file_path: path to a ``.csv`` or ``.xlsx`` file that has ``text`` and
            ``type`` columns; ``topic``, ``article`` and ``biased_words`` are
            merged into the input when present.

    Returns:
        DataFrame limited to rows whose ``type`` is a key of LABEL_MAP, with two
        added columns: ``combined_input`` (the merged, preprocessed text) and
        ``label`` (the integer class id).

    Exits the process via ``sys.exit(1)`` when the file is missing, unreadable,
    of an unsupported format, or lacks a required column.
    """
    cache_file = os.path.join(DATASET_CACHE_DIR, f"{os.path.basename(file_path)}.processed.pkl")
    # NOTE(review): the cache key is only the file's basename, so a pickle written
    # by an earlier run (possibly with different preprocessing) is reused as-is —
    # delete it to force reprocessing. Unpickling is only safe because this
    # notebook wrote the file itself.
    if os.path.exists(cache_file):
        logger.info(f"Loading processed data from cache: {cache_file}")
        return pd.read_pickle(cache_file)

    logger.info("Loading and processing data")
    if not os.path.exists(file_path):
        logger.error(f"Input file not found: {file_path}")
        sys.exit(1)

    try:
        if file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path, engine='openpyxl')
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        sys.exit(1)

    # Report the columns that are actually absent, not the whole required list
    required_cols = ["text", "type"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        logger.error(f"Missing required columns: {missing_cols}")
        sys.exit(1)

    # Keep only rows with a known class. reset_index(drop=True) yields a fresh
    # frame with a contiguous index, so the column assignments below neither
    # write into a filtered view (SettingWithCopyWarning) nor meet index gaps.
    df = df[df["type"].isin(LABEL_MAP.keys())].reset_index(drop=True)
    logger.info("Class distribution:")
    logger.info(df["type"].value_counts())

    text_cols = ["text", "topic", "article", "biased_words"]
    df["combined_input"] = combine_text(df, text_cols)
    df["label"] = df["type"].map(LABEL_MAP)

    logger.info(f"Sample data:\n{df[['combined_input', 'type']].head(5).to_string()}")
    df.to_pickle(cache_file)
    logger.info(f"Processed data cached to: {cache_file}")

    return df

# Dataset Class
class PoliticalBiasDataset(Dataset):
    """Map-style dataset over a DataFrame of raw text.

    Items are plain dicts of untokenized values; tokenization happens per batch
    in ``collate_fn``.
    """
    def __init__(self, df, tokenizer, max_length=MAX_LENGTH):
        # Labels are optional: a frame without a "label" column yields text-only items
        self.has_labels = "label" in df.columns
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # idx is positional (iloc), independent of the frame's index labels
        row = self.df.iloc[idx]
        if not self.has_labels:
            return {"combined_input": row["combined_input"]}
        return {"combined_input": row["combined_input"], "label": row["label"]}

    def collate_fn(self, batch):
        """Assemble a list of item dicts into one tokenized batch."""
        frame = pd.DataFrame(batch)
        return create_data_from_batch(frame, self.tokenizer, self.max_length)

# Metrics
def compute_metrics(eval_pred):
    """Compute evaluation metrics and save a confusion-matrix image.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        dict with weighted ``accuracy``, ``f1``, ``precision`` and ``recall``,
        plus one ``f1_<class>`` entry per class in LABEL_MAP.

    Side effects:
        Overwrites ``confusion_matrix.png`` in OUTPUT_DIR on every call and logs
        the text classification report.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # Pair class names with their ids explicitly. Without `labels=`, sklearn
    # infers the classes from the data: classification_report then raises when a
    # class is absent, and the confusion matrix shrinks so it no longer matches
    # the three tick labels.
    class_names = sorted(LABEL_MAP, key=LABEL_MAP.get)
    class_ids = [LABEL_MAP[name] for name in class_names]

    # zero_division=0 keeps the default numeric result but silences the warning
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    class_report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    # Confusion matrix (rows = true class, columns = predicted class)
    cm = confusion_matrix(labels, preds, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    cm_path = os.path.join(OUTPUT_DIR, 'confusion_matrix.png')
    plt.savefig(cm_path)
    plt.close()  # release the figure; this function runs at every evaluation step
    logger.info(f"Confusion matrix saved to {cm_path}")

    metrics = {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
    for cls in class_names:
        metrics[f"f1_{cls}"] = class_report[cls]['f1-score']

    text_report = classification_report(
        labels, preds, labels=class_ids, target_names=class_names, zero_division=0
    )
    logger.info(f"Classification Report:\n{text_report}")
    return metrics

# Custom Trainer with Class Weights
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        class_weights: per-class weight tensor handed to
            ``torch.nn.CrossEntropyLoss`` (None gives the unweighted loss).
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        ``num_items_in_batch`` is accepted (and ignored) because recent Trainer
        versions pass it as a keyword; without the parameter the call raises
        TypeError. Older versions simply never supply it.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Main Pipeline
def main(file_path):
    """Run the whole pipeline: load, split, fine-tune, evaluate, predict.

    Args:
        file_path: Path handed to ``load_data``. The resulting frame is
            accessed below through its ``label`` and ``combined_input``
            columns.

    Returns:
        The metrics dict returned by ``trainer.evaluate`` on the test split.

    Raises:
        Exception: Re-raised unchanged when the tokenizer or model cannot be
            loaded.

    Side effects:
        Writes the model, tokenizer, training metrics and
        ``misclassified_examples.csv`` under ``OUTPUT_DIR`` and the
        predictions CSV to ``RESULTS_PATH``. A ``predicted_bias_category``
        column is added to the loaded frame.
    """
    logger.info("Starting main pipeline")

    # Load and prepare data
    df = load_data(file_path)

    # Train-Test Split (70 / 15 / 15, stratified on the label)
    logger.info("Splitting data")
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["label"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

    logger.info(f"Train set: {len(train_df)} samples")
    logger.info(f"Validation set: {len(val_df)} samples")
    logger.info(f"Test set: {len(test_df)} samples")

    # Load Tokenizer and Model
    logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_MAP),
            cache_dir=CACHE_DIR,
            ignore_mismatched_sizes=True
        )
        model.config.pad_token_id = tokenizer.pad_token_id
        logger.info(f"Successfully loaded model: {MODEL_NAME}")
    except Exception as e:
        logger.error(f"Failed to load model {MODEL_NAME}: {e}")
        raise

    model.to(device)

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_df["label"]), y=train_df["label"])
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
    logger.info(f"Class weights: {class_weights.tolist()}")

    # Create datasets
    train_dataset = PoliticalBiasDataset(train_df, tokenizer)
    val_dataset = PoliticalBiasDataset(val_df, tokenizer)
    test_dataset = PoliticalBiasDataset(test_df, tokenizer)

    # Training Arguments
    logger.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        per_device_train_batch_size=memory_config["batch_size"],
        per_device_eval_batch_size=memory_config["eval_batch_size"],
        gradient_accumulation_steps=memory_config["gradient_accumulation_steps"],
        num_train_epochs=10,
        learning_rate=2e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to="none",
        fp16=memory_config["mixed_precision"],
        dataloader_num_workers=memory_config["num_workers"],
        remove_unused_columns=False,
        disable_tqdm=False,
        gradient_checkpointing=True,
        lr_scheduler_type="linear"
    )

    # Initialize Trainer
    logger.info("Initializing Trainer")
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=train_dataset.collate_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        class_weights=class_weights
    )

    # Train
    logger.info("Starting training")
    # Trainer has no ``metrics`` attribute; the training metrics are carried
    # by the object returned from ``train()``.
    train_result = trainer.train()
    trainer.save_metrics("train", train_result.metrics)
    # save_metrics writes under the Trainer's output_dir, i.e. OUTPUT_DIR.
    logger.info(f"Training metrics saved to {OUTPUT_DIR}")

    # Save model and tokenizer
    logger.info("Saving model and tokenizer")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info(f"Model and tokenizer saved to {OUTPUT_DIR}")

    # Evaluate on test set
    logger.info("Evaluating on test set")
    test_results = trainer.evaluate(test_dataset)
    logger.info(f"Test results: {test_results}")
    for metric, value in test_results.items():
        try:
            logger.info(f"{metric}: {value:.4f}")
        except (TypeError, ValueError):
            # A non-numeric entry must not abort the run after training.
            logger.info(f"{metric}: {value}")

    # Make predictions
    logger.info("Making predictions on all data")
    full_dataset = PoliticalBiasDataset(df[["combined_input"]], tokenizer)
    prediction_dataloader = DataLoader(
        full_dataset,
        batch_size=memory_config["eval_batch_size"],
        collate_fn=full_dataset.collate_fn,
        num_workers=memory_config["num_workers"],
        shuffle=False  # keep row order so predictions align with df
    )

    all_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(prediction_dataloader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_predictions.extend(batch_preds)

    # Save predictions
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[int(i)] for i in all_predictions]
    df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Predictions saved to {RESULTS_PATH}")

    # Save misclassified examples (covers every row, training rows included)
    misclassified = df[df["label"] != df["predicted_bias_category"].map(LABEL_MAP)]
    misclassified_path = os.path.join(OUTPUT_DIR, "misclassified_examples.csv")
    misclassified.to_csv(misclassified_path, index=False)
    logger.info(f"Misclassified examples saved to {misclassified_path}")

    # Clean up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return test_results

# Entry point: run the pipeline on the dataset stored in the mounted Drive.
if __name__ == "__main__":
    # Absolute Colab path; only resolvable after drive.mount('/content/drive').
    input_file_path = "/content/drive/MyDrive/soumya/complete_balanced_data.csv"
    # main() returns the test-set evaluation results from trainer.evaluate.
    results = main(input_file_path)
    print(f"Final results: {results}")