In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import logging
import os
import sys
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from tqdm import tqdm
from functools import lru_cache
from sklearn.utils.class_weight import compute_class_weight
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
MODEL_NAME = "roberta-base"  # Upgraded to a larger model
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/coding/results"
LOGGING_DIR = "/content/drive/MyDrive/coding/logs"
RESULTS_PATH = "/content/drive/MyDrive/coding/results/predictions.csv"
CACHE_DIR = "/content/drive/MyDrive/coding/results/cache"
DATASET_CACHE_DIR = "/content/drive/MyDrive/coding/result/dataset_cache"
# Ensure directories exist
for directory in [OUTPUT_DIR, LOGGING_DIR, CACHE_DIR, DATASET_CACHE_DIR]:
    os.makedirs(directory, exist_ok=True)

# Set device and optimize memory usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Optimize memory usage based on available hardware
def optimize_memory():
    config = {
        "batch_size": 8,
        "eval_batch_size": 16,
        "num_workers": min(os.cpu_count() - 1, 4) if os.cpu_count() > 1 else 0,
        "gradient_accumulation_steps": 2,
        "mixed_precision": False
    }

    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"GPU Memory: {total_memory:.2f} GB")
        torch.cuda.empty_cache()

        if total_memory > 10:
            config["batch_size"] = 16
            config["eval_batch_size"] = 32
            config["gradient_accumulation_steps"] = 1
        elif total_memory < 4:
            config["batch_size"] = 4
            config["eval_batch_size"] = 8
            config["gradient_accumulation_steps"] = 4

        config["mixed_precision"] = True

    return config

memory_config = optimize_memory()

# ---------------- Text Processing Functions ----------------
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

@lru_cache(maxsize=1024)
def preprocess_text(text):
    if pd.isna(text) or text is None:
        return ""
    if isinstance(text, list):
        text = ' '.join(str(t).lower().strip() for t in text if isinstance(t, str))
    text = str(text).lower().strip()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

def combine_text(df, text_cols, weights=None):
    if weights is None:
        weights = {"text": 1.0, "topic": 0.5, "article": 0.8, "biased_words": 2.0}
    processed_cols = {}
    for col in text_cols:
        if col in df.columns:
            processed_cols[col] = df[col].apply(preprocess_text)
        else:
            processed_cols[col] = pd.Series([""] * len(df))

    combined_series = processed_cols[text_cols[0]].copy()
    for col in text_cols[1:]:
        mask = processed_cols[col] != ""
        weighted_text = processed_cols[col][mask].apply(lambda x: (x + " ") * int(weights[col])).str.strip()
        combined_series[mask] = combined_series[mask] + " " + weighted_text

    return combined_series

def create_data_from_batch(batch_df, tokenizer, max_length=256):
    texts = batch_df["combined_input"].tolist()
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    labels = None
    if "label" in batch_df:
        labels = torch.tensor(batch_df["label"].tolist())
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels
    }

# ---------------- Load and Process Data ----------------
def load_data(file_path):
    cache_file = os.path.join(DATASET_CACHE_DIR, f"{os.path.basename(file_path)}.processed.pkl")
    if os.path.exists(cache_file):
        logger.info(f"Loading processed data from cache: {cache_file}")
        return pd.read_pickle(cache_file)

    logger.info("Loading and processing data")
    try:
        if file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path, engine='openpyxl')
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        sys.exit(1)

    df = df[df["type"].isin(LABEL_MAP.keys())]
    logger.info("Class distribution:")
    logger.info(df["type"].value_counts())

    text_cols = ["text", "topic", "article", "biased_words"]
    df["combined_input"] = combine_text(df, text_cols)
    df["label"] = df["type"].map(LABEL_MAP)

    df.to_pickle(cache_file)
    logger.info(f"Processed data cached to: {cache_file}")

    return df

# ---------------- Dataset Classes ----------------
class PoliticalBiasDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.has_labels = "label" in df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = {"combined_input": self.df.iloc[idx]["combined_input"]}
        if self.has_labels:
            item["label"] = self.df.iloc[idx]["label"]
        return item

    def collate_fn(self, batch):
        batch_df = pd.DataFrame(batch)
        return create_data_from_batch(batch_df, self.tokenizer, self.max_length)

# ---------------- Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    class_report = classification_report(labels, preds, target_names=LABEL_MAP.keys(), output_dict=True)
    metrics = {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
    for cls in LABEL_MAP.keys():
        metrics[f"f1_{cls}"] = class_report[cls]['f1-score']
    return metrics

# ---------------- Main Pipeline ----------------
def main(file_path):
    logger.info("Starting main pipeline")
    df = load_data(file_path)

    logger.info("Splitting data")
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["label"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

    logger.info(f"Train set: {len(train_df)} samples")
    logger.info(f"Validation set: {len(val_df)} samples")
    logger.info(f"Test set: {len(test_df)} samples")

    logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_MAP),
            cache_dir=CACHE_DIR,
            ignore_mismatched_sizes=True
        )
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as e:
        logger.error(f"Error loading model {MODEL_NAME}: {e}")
        sys.exit(1)

    model.to(device)

    train_dataset = PoliticalBiasDataset(train_df, tokenizer, max_length=256)
    val_dataset = PoliticalBiasDataset(val_df, tokenizer, max_length=256)
    test_dataset = PoliticalBiasDataset(test_df, tokenizer, max_length=256)

    # Compute class weights for imbalanced data
    class_weights = compute_class_weight("balanced", classes=np.array([0, 1, 2]), y=train_df["label"])
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    logger.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        logging_dir=LOGGING_DIR,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        per_device_train_batch_size=memory_config["batch_size"],
        per_device_eval_batch_size=memory_config["eval_batch_size"],
        gradient_accumulation_steps=memory_config["gradient_accumulation_steps"],
        num_train_epochs=10,
        learning_rate=3e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to="none",
        fp16=memory_config["mixed_precision"],
        dataloader_num_workers=memory_config["num_workers"],
        remove_unused_columns=False,
        disable_tqdm=False
    )

    def collate_fn(batch):
        return train_dataset.collate_fn(batch)

    # Custom Trainer with class weights, updated to handle num_items_in_batch
    class WeightedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss

    logger.info("Initializing Trainer")
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=collate_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    logger.info("Starting training")
    trainer.train()

    logger.info("Saving model and tokenizer")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info(f"Model and tokenizer saved to {OUTPUT_DIR}")

    logger.info("Evaluating on test set")
    test_results = trainer.evaluate(test_dataset)
    logger.info(f"Test results: {test_results}")

    logger.info("Making predictions on all data")
    full_dataset = PoliticalBiasDataset(df[["combined_input"]], tokenizer, max_length=256)
    prediction_dataloader = torch.utils.data.DataLoader(
        full_dataset,
        batch_size=memory_config["eval_batch_size"],
        collate_fn=full_dataset.collate_fn,
        num_workers=memory_config["num_workers"],
        shuffle=False
    )

    all_predictions = []
    all_confidences = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(prediction_dataloader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            permission_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=permission_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            batch_confidences, batch_preds = torch.max(probabilities, dim=1)
            all_predictions.extend(batch_preds.cpu().numpy())
            all_confidences.extend(batch_confidences.cpu().numpy())

    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in all_predictions]
    df["confidence_score"] = all_confidences
    df["is_confident"] = df["confidence_score"] >= 0.95
    df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Predictions with confidence scores saved to {RESULTS_PATH}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return test_results

if __name__ == "__main__":
    input_file_path = "/content/drive/MyDrive/coding/complete_balanced_data.csv"
    results = main(input_file_path)
    print(f"Final results: {results}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Left,F1 Center,F1 Right
100,1.0293,0.973672,0.481822,0.378991,0.510414,0.481822,0.006098,0.63854,0.492334
200,0.9099,0.842196,0.573776,0.532541,0.557697,0.573776,0.236244,0.753892,0.607487
300,0.8229,0.746254,0.631928,0.583247,0.642335,0.631928,0.280103,0.8076,0.662037
400,0.8613,0.678825,0.71324,0.705823,0.715577,0.71324,0.56806,0.845971,0.703439
500,0.6611,0.623676,0.741677,0.731955,0.741323,0.741677,0.625165,0.835651,0.735049
600,0.5271,0.582055,0.781609,0.775785,0.782022,0.781609,0.692275,0.858733,0.776346
700,0.541,0.550824,0.783397,0.771011,0.802817,0.783397,0.648311,0.832081,0.83264
800,0.3918,0.356931,0.865475,0.863388,0.866439,0.865475,0.799172,0.935624,0.855369
900,0.356,0.340595,0.884632,0.883449,0.885966,0.884632,0.840873,0.921381,0.888093
1000,0.3805,0.281326,0.902597,0.901385,0.90324,0.902597,0.861767,0.933252,0.909137


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Left,F1 Center,F1 Right
100,1.0293,0.973672,0.481822,0.378991,0.510414,0.481822,0.006098,0.63854,0.492334
200,0.9099,0.842196,0.573776,0.532541,0.557697,0.573776,0.236244,0.753892,0.607487
300,0.8229,0.746254,0.631928,0.583247,0.642335,0.631928,0.280103,0.8076,0.662037
400,0.8613,0.678825,0.71324,0.705823,0.715577,0.71324,0.56806,0.845971,0.703439
500,0.6611,0.623676,0.741677,0.731955,0.741323,0.741677,0.625165,0.835651,0.735049
600,0.5271,0.582055,0.781609,0.775785,0.782022,0.781609,0.692275,0.858733,0.776346
700,0.541,0.550824,0.783397,0.771011,0.802817,0.783397,0.648311,0.832081,0.83264
800,0.3918,0.356931,0.865475,0.863388,0.866439,0.865475,0.799172,0.935624,0.855369
900,0.356,0.340595,0.884632,0.883449,0.885966,0.884632,0.840873,0.921381,0.888093
1000,0.3805,0.281326,0.902597,0.901385,0.90324,0.902597,0.861767,0.933252,0.909137


Predicting: 100%|██████████| 2447/2447 [04:12<00:00,  9.68it/s]


Final results: {'eval_loss': 0.05242487043142319, 'eval_accuracy': 0.9873988931460196, 'eval_f1': 0.987404381923188, 'eval_precision': 0.9874708848884522, 'eval_recall': 0.9873988931460196, 'eval_f1_left': 0.9878064433320498, 'eval_f1_center': 0.9859333417817767, 'eval_f1_right': 0.9884733606557377, 'eval_runtime': 38.7448, 'eval_samples_per_second': 303.137, 'eval_steps_per_second': 9.498, 'epoch': 0.99241097489784}


### Test roberta model

In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
model_path = "/content/drive/MyDrive/coding/results"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
test_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

samples = [
["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
The time for delay is over. The time to act is now."""
],
["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit.
"""],
["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington.
"""],
["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
 ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country.
"""]
]

for text in samples:
    output = test_pipeline(text, truncation=True, max_length=512)
    sorted_output = sorted(output[0], key=lambda x: x["score"], reverse=True)
    top_label = sorted_output[0]

    label_map = {
    0: "left", 1: "center", 2: "right"}
    # Extract the numeric part of the label like 'LABEL_1' -> 1
    label_index = int(top_label['label'].split('_')[-1])
    readable_label = label_map[label_index]

    print(f"\nText: {text}")
    print(f"Predicted Bias: {readable_label} \n confidence score:({top_label['score']:.4f})")

Device set to use cuda:0



Text: ['As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.\nIn 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.\nEven more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly b

## Improved version with openNLP


In [6]:
import logging
import os
import sys
import re
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from functools import lru_cache
import spacy
from torch import nn
import optuna



In [5]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [7]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
MODEL_NAME = "launch/politics"
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/coding/bias_results"
LOGGING_DIR = "/content/drive/MyDrive/coding/bia_logs"
RESULTS_PATH = "/content/drive/MyDrive/coding/bias_results/predictions.csv"
CACHE_DIR = "/content/drive/MyDrive/coding/bias_results/cache"
DATASET_CACHE_DIR = "/content/drive/MyDrive/coding/bias_result/dataset_cache"
MAX_LENGTH = 128
PRELEMMATIZE = True
RUN_HYPERPARAMETER_TUNING = True
N_OPTUNA_TRIALS = 10

# Ensure directories exist
for directory in [OUTPUT_DIR, LOGGING_DIR, CACHE_DIR, DATASET_CACHE_DIR]:
    os.makedirs(directory, exist_ok=True)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Initialize spaCy
try:
    nlp_spacy = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    logger.info("spaCy initialized with en_core_web_sm")
except Exception as e:
    logger.error(f"Failed to load spaCy model: {e}. Install with: python -m spacy download en_core_web_sm")
    sys.exit(1)

# Optimize memory usage
def optimize_memory():
    config = {
        "batch_size": 8,
        "eval_batch_size": 16,
        "num_workers": min(os.cpu_count() - 1, 4) if os.cpu_count() > 1 else 0,
        "gradient_accumulation_steps": 2,
        "mixed_precision": False
    }
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"GPU Memory: {total_memory:.2f} GB")
        torch.cuda.empty_cache()
        if total_memory > 10:
            config["batch_size"] = 16
            config["eval_batch_size"] = 32
            config["gradient_accumulation_steps"] = 1
        elif total_memory < 4:
            config["batch_size"] = 4
            config["eval_batch_size"] = 8
            config["gradient_accumulation_steps"] = 4
        config["mixed_precision"] = True
    return config

memory_config = optimize_memory()

# Text Processing Functions
@lru_cache(maxsize=1024)
def preprocess_text(text):
    if pd.isna(text) or text is None:
        return ""
    text = str(text).lower().strip()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)

    try:
        doc = nlp_spacy(text)
        lemmas = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
        return ' '.join(lemmas)
    except Exception as e:
        logger.warning(f"spaCy error: {e}. Returning cleaned text.")
        return text

def combine_text(df, text_cols):
    available_cols = [col for col in text_cols if col in df.columns]
    if not available_cols:
        logger.error("No valid text columns found")
        sys.exit(1)
    processed_cols = {col: df[col].apply(preprocess_text) if col in df.columns else pd.Series([""] * len(df)) for col in text_cols}
    combined_series = processed_cols[text_cols[0]].copy()
    for col in text_cols[1:]:
        mask = processed_cols[col] != ""
        combined_series[mask] = combined_series[mask] + " " + processed_cols[col][mask]
    return combined_series

def create_data_from_batch(batch_df, tokenizer, max_length=MAX_LENGTH):
    texts = batch_df["combined_input"].tolist()
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    labels = None
    if "label" in batch_df:
        labels = torch.tensor(batch_df["label"].tolist())
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels
    }

# Load and Process Data
def load_data(file_path):
    cache_file = os.path.join(DATASET_CACHE_DIR, f"{os.path.basename(file_path)}.processed.pkl")
    if os.path.exists(cache_file):
        logger.info(f"Loading processed data from cache: {cache_file}")
        return pd.read_pickle(cache_file)

    logger.info("Loading and processing data")
    if not os.path.exists(file_path):
        logger.error(f"Input file not found: {file_path}")
        sys.exit(1)

    try:
        if file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path, engine='openpyxl')
        elif file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        sys.exit(1)

    required_cols = ["text", "type"]
    if not all(col in df.columns for col in required_cols):
        logger.error(f"Missing required columns: {required_cols}")
        sys.exit(1)

    df = df[df["type"].isin(LABEL_MAP.keys())]
    logger.info("Class distribution:")
    logger.info(df["type"].value_counts())

    text_cols = ["text", "topic", "article", "biased_words"]
    if PRELEMMATIZE:
        logger.info("Pre-lemmatizing dataset")
        for col in text_cols:
            if col in df.columns:
                df[f"{col}_preprocessed"] = df[col].apply(preprocess_text)
        df["combined_input"] = combine_text(df, [f"{col}_preprocessed" for col in text_cols])
    else:
        df["combined_input"] = combine_text(df, text_cols)

    df["label"] = df["type"].map(LABEL_MAP)

    logger.info(f"Sample data:\n{df[['combined_input', 'type']].head(5).to_string()}")
    df.to_pickle(cache_file)
    logger.info(f"Processed data cached to: {cache_file}")

    return df

# Dataset Class
class PoliticalBiasDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=MAX_LENGTH):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.has_labels = "label" in df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = {"combined_input": self.df.iloc[idx]["combined_input"]}
        if self.has_labels:
            item["label"] = self.df.iloc[idx]["label"]
        return item

    def collate_fn(self, batch):
        batch_df = pd.DataFrame(batch)
        return create_data_from_batch(batch_df, self.tokenizer, self.max_length)

# Custom Model with Additional Dense Layer
class CustomRobertaForSequenceClassification(AutoModelForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, 0, :]
        sequence_output = self.dropout(sequence_output)
        sequence_output = self.dense(sequence_output)
        sequence_output = self.relu(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    class_report = classification_report(labels, preds, target_names=LABEL_MAP.keys(), output_dict=True)

    # Confusion matrix
    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=LABEL_MAP.keys(), yticklabels=LABEL_MAP.keys())
    plt.xlabel('Predicted')
    plt.ylabel('True')
    cm_path = os.path.join(OUTPUT_DIR, 'confusion_matrix.png')
    plt.savefig(cm_path)
    plt.close()
    logger.info(f"Confusion matrix saved to {cm_path}")

    metrics = {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
    for cls in LABEL_MAP.keys():
        metrics[f"f1_{cls}"] = class_report[cls]['f1-score']

    logger.info(f"Classification Report:\n{classification_report(labels, preds, target_names=LABEL_MAP.keys())}")
    return metrics

# Custom Trainer with Class Weights
class CustomTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs["logits"]
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Hyperparameter Tuning with Optuna
def run_hyperparameter_tuning(model, tokenizer, train_dataset, val_dataset, class_weights, output_dir, logging_dir, n_trials=N_OPTUNA_TRIALS):
    def objective(trial):
        training_args = TrainingArguments(
            output_dir=f"{output_dir}/trial_{trial.number}",
            logging_dir=logging_dir,
            do_train=True,
            do_eval=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            per_device_train_batch_size=trial.suggest_categorical("batch_size", [4, 8, 16]),
            per_device_eval_batch_size=16,
            gradient_accumulation_steps=trial.suggest_int("gradient_accumulation_steps", 1, 4),
            num_train_epochs=trial.suggest_int("num_epochs", 5, 15),
            learning_rate=trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            warmup_steps=trial.suggest_int("warmup_steps", 50, 200),
            weight_decay=trial.suggest_float("weight_decay", 0.0, 0.1),
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=min(os.cpu_count() - 1, 4) if os.cpu_count() > 1 else 0,
            remove_unused_columns=False,
            disable_tqdm=False,
            gradient_checkpointing=True,
            lr_scheduler_type="linear"
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=train_dataset.collate_fn,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
            class_weights=class_weights
        )

        trainer.train()
        eval_results = trainer.evaluate()

        if trial.number == 0 or eval_results["eval_f1"] > trial.study.best_value:
            trainer.save_model(f"{output_dir}/best_model")
            tokenizer.save_pretrained(f"{output_dir}/best_model")
            logger.info(f"Saved best model from trial {trial.number} with F1: {eval_results['eval_f1']:.4f}")

        return eval_results["eval_f1"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    logger.info(f"Best hyperparameters: {study.best_params}")
    logger.info(f"Best F1-score: {study.best_value:.4f}")
    return study.best_params

# Main Pipeline
def main(file_path):
    logger.info("Starting main pipeline")

    # Load and prepare data
    df = load_data(file_path)

    # Train-Test Split
    logger.info("Splitting data")
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df["label"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

    logger.info(f"Train set: {len(train_df)} samples")
    logger.info(f"Validation set: {len(val_df)} samples")
    logger.info(f"Test set: {len(test_df)} samples")

    # Load Tokenizer and Model
    logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
        model = CustomRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(LABEL_MAP),
            cache_dir=CACHE_DIR,
            ignore_mismatched_sizes=True
        )
        model.config.pad_token_id = tokenizer.pad_token_id
        logger.info(f"Successfully loaded model: {MODEL_NAME}")
    except Exception as e:
        logger.error(f"Failed to load model {MODEL_NAME}: {e}")
        raise

    model.to(device)

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_df["label"]), y=train_df["label"])
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
    logger.info(f"Class weights: {class_weights.tolist()}")

    # Create datasets
    train_dataset = PoliticalBiasDataset(train_df, tokenizer)
    val_dataset = PoliticalBiasDataset(val_df, tokenizer)
    test_dataset = PoliticalBiasDataset(test_df, tokenizer)

    # Run hyperparameter tuning or default training
    if RUN_HYPERPARAMETER_TUNING:
        logger.info("Running hyperparameter tuning with Optuna")
        best_params = run_hyperparameter_tuning(
            model, tokenizer, train_dataset, val_dataset, class_weights, OUTPUT_DIR, LOGGING_DIR
        )

        # Update training arguments with best hyperparameters
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            logging_dir=LOGGING_DIR,
            do_train=True,
            do_eval=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            per_device_train_batch_size=best_params["batch_size"],
            per_device_eval_batch_size=16,
            gradient_accumulation_steps=best_params["gradient_accumulation_steps"],
            num_train_epochs=best_params["num_epochs"],
            learning_rate=best_params["learning_rate"],
            warmup_steps=best_params["warmup_steps"],
            weight_decay=best_params["weight_decay"],
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=min(os.cpu_count() - 1, 4) if os.cpu_count() > 1 else 0,
            remove_unused_columns=False,
            disable_tqdm=False,
            gradient_checkpointing=True,
            lr_scheduler_type="linear"
        )
    else:
        logger.info("Using default training arguments")
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            logging_dir=LOGGING_DIR,
            do_train=True,
            do_eval=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            per_device_train_batch_size=memory_config["batch_size"],
            per_device_eval_batch_size=memory_config["eval_batch_size"],
            gradient_accumulation_steps=memory_config["gradient_accumulation_steps"],
            num_train_epochs=10,
            learning_rate=2e-5,
            warmup_steps=100,
            weight_decay=0.01,
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            report_to="none",
            fp16=memory_config["mixed_precision"],
            dataloader_num_workers=memory_config["num_workers"],
            remove_unused_columns=False,
            disable_tqdm=False,
            gradient_checkpointing=True,
            lr_scheduler_type="linear"
        )

    # Initialize Trainer
    logger.info("Initializing Trainer")
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=train_dataset.collate_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        class_weights=class_weights
    )

    # Train
    logger.info("Starting training")
    trainer.train()
    trainer.save_metrics("all", trainer.metrics)
    logger.info(f"Training metrics saved to {LOGGING_DIR}")

    # Save model and tokenizer
    logger.info("Saving model and tokenizer")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info(f"Model and tokenizer saved to {OUTPUT_DIR}")

    # Evaluate on test set
    logger.info("Evaluating on test set")
    test_results = trainer.evaluate(test_dataset)
    logger.info(f"Test results: {test_results}")
    for metric, value in test_results.items():
        logger.info(f"{metric}: {value:.4f}")

    # Make predictions
    logger.info("Making predictions on all data")
    full_dataset = PoliticalBiasDataset(df[["combined_input"]], tokenizer)
    prediction_dataloader = DataLoader(
        full_dataset,
        batch_size=memory_config["eval_batch_size"],
        collate_fn=full_dataset.collate_fn,
        num_workers=memory_config["num_workers"],
        shuffle=False
    )

    all_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(prediction_dataloader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs["logits"]
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_predictions.extend(batch_preds)

    # Save predictions
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in all_predictions]
    df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Predictions saved to {RESULTS_PATH}")

    # Save misclassified examples
    misclassified = df[df["label"] != df["predicted_bias_category"].map(LABEL_MAP)]
    misclassified_path = os.path.join(OUTPUT_DIR, "misclassified_examples.csv")
    misclassified.to_csv(misclassified_path, index=False)
    logger.info(f"Misclassified examples saved to {misclassified_path}")

    # Clean up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return test_results

if __name__ == "__main__":
    input_file_path = "/content/drive/MyDrive/coding/complete_balanced_data.csv"
    results = main(input_file_path)
    print(f"Final results: {results}")

KeyboardInterrupt: 