In [10]:
import logging
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from datasets import Dataset
from joblib import load,dump
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "./results"
LOGGING_DIR = "./logs"
RESULTS_PATH = "./results/predictions.csv"
SAVE_PATH = OUTPUT_DIR
CHUNK_SIZE = 1000  # Adjust based on your memory capacity

# Ensure directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

# Text Processing Functions
def preprocess_text(text):
    """Preprocess text by converting to lowercase and handling non-string inputs."""
    if isinstance(text, list):
        return ' '.join(str(t).lower() for t in text if isinstance(t, str))
    return str(text).lower()

def combine_text(df, text_cols):
    """Combine text columns into a single 'combined_input' column."""
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# Metrics
def compute_metrics(y_true, y_pred):
    """Compute evaluation metrics."""
    logger.info("Computing metrics")
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Chunked Data Processing
def process_chunks(file_path, process_fn):
    """Process data in chunks and collect results."""
    results = []
    for chunk in pd.read_csv(file_path,chunksize=CHUNK_SIZE):
        processed = process_fn(chunk)
        results.append(processed)
    return pd.concat(results)

def preprocess_chunk(chunk):
    """Preprocess a single chunk of data."""
    chunk = chunk[chunk["type"].isin(LABEL_MAP.keys())]
    text_cols = ["text", "topic", "article", "biased_words"]
    chunk_processed = combine_text(chunk, text_cols)
    chunk_processed["label"] = chunk_processed["type"].map(LABEL_MAP)
    return chunk_processed

# Main Training Function
def train_model(X_train, y_train, X_eval, y_eval):
    """Train the Naive Bayes model with given datasets."""
    logger.info("Setting up Naive Bayes pipeline")
    
    # Create pipeline with TF-IDF vectorizer and Naive Bayes
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
        ('nb', MultinomialNB())
    ])
    
    # Train model
    logger.info("Training Naive Bayes model")
    pipeline.fit(X_train, y_train)
    
    # Evaluate model
    y_pred = pipeline.predict(X_eval)
    metrics = compute_metrics(y_eval, y_pred)
    logger.info(f"Evaluation metrics: {metrics}")
    
    return pipeline

# Prediction Function
def predict_in_chunks(model, full_df, batch_size=1000):
    """Make predictions in batches to handle large datasets."""
    all_preds = []
    for i in range(0, len(full_df), batch_size):
        logger.info(f"Predicting batch {i//batch_size + 1}/{(len(full_df)//batch_size)+1}")
        chunk = full_df.iloc[i:i+batch_size]
        
        # Predict
        preds = model.predict(chunk["combined_input"])
        all_preds.extend(preds)
    
    return all_preds

# Main Pipeline
def main(file_path):
    logger.info("Starting chunked Naive Bayes pipeline")
    
    # 1. Load and preprocess data in chunks
    logger.info("Loading and preprocessing data in chunks")
    full_df = process_chunks(file_path, preprocess_chunk)
    
    # 2. Train-test split
    logger.info("Splitting data into train and eval sets")
    train_df, eval_df = train_test_split(
        full_df, 
        test_size=0.2, 
        stratify=full_df["label"], 
        random_state=42
    )
    
    # 3. Prepare data
    logger.info("Preparing data")
    X_train = train_df["combined_input"]
    y_train = train_df["label"]
    X_eval = eval_df["combined_input"]
    y_eval = eval_df["label"]
    
    # 4. Train model
    logger.info("Training model")
    model = train_model(X_train, y_train, X_eval, y_eval)
    
    # 5. Save model (using joblib for scikit-learn models)
    logger.info("Saving model")
    from joblib import dump
    os.makedirs(SAVE_PATH, exist_ok=True)
    dump(model, os.path.join(SAVE_PATH, "naive_bayes_model.joblib"))
    
    # 6. Make predictions
    logger.info("Making predictions")
    predictions = predict_in_chunks(model, full_df)
    full_df["predicted_bias_category"] = [REVERSE_LABEL_MAP[p] for p in predictions]
    
    # 7. Save results
    full_df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Results saved to {RESULTS_PATH}")
    
    return full_df

if __name__ == "__main__":
    input_file_path = "complete_balanced_data.csv"
    results_df = main(input_file_path)
    logger.info("completed successfully")

2025-05-07 15:05:43,864 - INFO - Starting chunked Naive Bayes pipeline
2025-05-07 15:05:43,865 - INFO - Loading and preprocessing data in chunks
2025-05-07 15:05:46,801 - INFO - Splitting data into train and eval sets
2025-05-07 15:05:46,865 - INFO - Preparing data
2025-05-07 15:05:46,866 - INFO - Training model
2025-05-07 15:05:46,867 - INFO - Setting up Naive Bayes pipeline
2025-05-07 15:05:46,868 - INFO - Training Naive Bayes model
2025-05-07 15:05:53,714 - INFO - Computing metrics
2025-05-07 15:05:53,723 - INFO - Evaluation metrics: {'accuracy': 0.9028097062579821, 'f1': 0.9027908482128976, 'precision': 0.9027999218746129, 'recall': 0.9028097062579821}
2025-05-07 15:05:53,724 - INFO - Saving model
2025-05-07 15:05:53,888 - INFO - Making predictions
2025-05-07 15:05:53,889 - INFO - Predicting batch 1/79
2025-05-07 15:05:53,985 - INFO - Predicting batch 2/79
2025-05-07 15:05:54,076 - INFO - Predicting batch 3/79
2025-05-07 15:05:54,171 - INFO - Predicting batch 4/79
2025-05-07 15:05:

OSError: Cannot save file into a non-existent directory: '/results'

In [9]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
MODEL_PATH = "./results/naive_bayes_model.joblib"
OUTPUT_PATH = "./results/sample_predictions.csv"

# Text Processing Function
def preprocess_text(text):
    """Preprocess text by converting to lowercase and handling non-string inputs."""
    if isinstance(text, list):
        return ' '.join(str(t).lower() for t in text if isinstance(t, str))
    return str(text).lower()

# Main Testing Function
def test_samples(samples):
    logger.info("Starting sample prediction pipeline with confidence scores")
    
    # 1. Prepare samples as DataFrame
    logger.info("Preprocessing samples")
    sample_df = pd.DataFrame({
        "combined_input": [preprocess_text(sample[0]) for sample in samples]
    })
    
    # 2. Load the trained model
    logger.info("Loading trained Naive Bayes model")
    if not os.path.exists(MODEL_PATH):
        logger.error(f"Model file not found at {MODEL_PATH}")
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
    
    model = load(MODEL_PATH)
    
    # 3. Make predictions and get confidence scores
    logger.info("Making predictions on samples")
    predictions = model.predict(sample_df["combined_input"])
    probabilities = model.predict_proba(sample_df["combined_input"])
    
    # Extract confidence scores for predicted classes
    confidence_scores = [prob[pred] for prob, pred in zip(probabilities, predictions)]
    
    # Add predictions and confidence scores to DataFrame
    sample_df["predicted_bias_category"] = [REVERSE_LABEL_MAP[p] for p in predictions]
    sample_df["confidence_score"] = confidence_scores
    
    # 4. Save results
    sample_df.to_csv(OUTPUT_PATH, index=False)
    logger.info(f"Sample predictions saved to {OUTPUT_PATH}")
    
    # 5. Log predictions with confidence scores
    for i, (text, pred, score) in enumerate(zip(sample_df["combined_input"], sample_df["predicted_bias_category"], sample_df["confidence_score"])):
        logger.info(f"Sample {i+1}:")
        logger.info(f"Predicted Bias: {pred} \n confidence score: ({score:.4f})")
        logger.info(f"Sample text (truncated): {text[:100]}...")
    
    return sample_df

if __name__ == "__main__":
    # Provided samples
    samples = [
        ["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
        In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
        Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
        Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
        We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
        The time for delay is over. The time to act is now."""],
        ["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
        From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
        America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
        We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit."""],
        ["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
        A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
        Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
        Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington."""],
        ["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
        The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
        In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
        This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
        ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
        Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
        Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
        The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country."""]
    ]
    
    # Run the prediction
    results_df = test_samples(samples)
    logger.info("Sample prediction completed successfully")

2025-05-07 15:16:42,230 - INFO - Starting sample prediction pipeline with confidence scores
2025-05-07 15:16:42,232 - INFO - Preprocessing samples
2025-05-07 15:16:42,233 - INFO - Loading trained Naive Bayes model
2025-05-07 15:16:42,304 - INFO - Making predictions on samples
2025-05-07 15:16:42,315 - INFO - Sample predictions saved to ./results/sample_predictions.csv
2025-05-07 15:16:42,316 - INFO - Sample 1:
2025-05-07 15:16:42,316 - INFO - Predicted Bias: left 
 confidence score: (0.8114)
2025-05-07 15:16:42,318 - INFO - Sample text (truncated): as wildfires rage across california, floods displace thousands in the midwest, and heatwaves scorch ...
2025-05-07 15:16:42,318 - INFO - Sample 2:
2025-05-07 15:16:42,319 - INFO - Predicted Bias: right 
 confidence score: (0.6545)
2025-05-07 15:16:42,320 - INFO - Sample text (truncated): the united states thrives when government steps back and lets free enterprise lead. in recent years,...
2025-05-07 15:16:42,320 - INFO - Sample 3:
2025-05-0

### Modifications to the model

In [11]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "./opt_results"
LOGGING_DIR = "./opt_logs"
MODELS_DIR = "./opt_models"
RESULTS_PATH = os.path.join(OUTPUT_DIR, "predictions.csv")
ERROR_ANALYSIS_PATH = os.path.join(OUTPUT_DIR, "error_analysis.csv")
CHUNK_SIZE = 2000  # Chunk size for memory efficiency

# Ensure directories exist
for directory in [OUTPUT_DIR, LOGGING_DIR, MODELS_DIR]:
    os.makedirs(directory, exist_ok=True)

# Custom Tokenizer for Production
def custom_tokenizer(text):
    """Efficient regex-based tokenizer for production use."""
    # Clean text: remove URLs, numbers, special characters
    text = re.sub(r'https?://\S+|www\.\S+|[^\w\s]|\d+', ' ', text.lower())
    # Tokenize by whitespace and filter tokens
    tokens = [token for token in text.split() if len(token) > 2 and token not in ENGLISH_STOP_WORDS]
    return tokens

# Text Processing Functions
def clean_text(text):
    """Efficient text cleaning function."""
    if not isinstance(text, str):
        text = str(text) if text else ""
    
    # Clean with regex
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+|<.*?>|[^\w\s]|\d+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def combine_text(df, text_cols):
    """Combine text columns into a single 'combined_input' column."""
    # Ensure all text columns exist
    available_cols = [col for col in text_cols if col in df]
    for col in set(text_cols) - set(available_cols):
        df[col] = ""
    
    # Apply cleaning to text columns
    for col in available_cols:
        df[col] = df[col].apply(clean_text)
    
    # Combine text columns with weights
    weights = {'text': 2, 'article': 2, 'topic': 1, 'biased_words': 1.5}
    def weighted_combine(row):
        combined = ""
        for col in available_cols:
            weight = int(weights.get(col, 1))
            combined += (" " + str(row[col]) + " ") * weight
        return combined.strip()
    
    df["combined_input"] = df[available_cols].apply(weighted_combine, axis=1)
    
    return df

# Metrics
def compute_metrics(y_true, y_pred):
    """Compute evaluation metrics."""
    logger.info("Computing metrics")
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    
    # Compute per-class metrics
    class_precision, class_recall, class_f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    
    # Create confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    metrics = {
        "accuracy": acc, 
        "f1": f1, 
        "precision": precision, 
        "recall": recall,
        "class_precision": {REVERSE_LABEL_MAP[i]: p for i, p in enumerate(class_precision)},
        "class_recall": {REVERSE_LABEL_MAP[i]: r for i, r in enumerate(class_recall)},
        "class_f1": {REVERSE_LABEL_MAP[i]: f for i, f in enumerate(class_f1)},
        "confusion_matrix": cm
    }
    
    # Log metrics
    logger.info(f"Overall accuracy: {acc:.4f}")
    logger.info(f"Overall F1 score: {f1:.4f}")
    for i, label in REVERSE_LABEL_MAP.items():
        logger.info(f"Class '{label}' - Precision: {class_precision[i]:.4f}, "
                   f"Recall: {class_recall[i]:.4f}, F1: {class_f1[i]:.4f}")
    
    return metrics

def plot_confusion_matrix(cm, class_names):
    """Plot and save confusion matrix."""
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'))
    plt.close()

# Chunked Data Processing
def process_chunks(file_path, process_fn):
    """Process data in chunks and collect results."""
    results = []
    for chunk in pd.read_csv(file_path, chunksize=CHUNK_SIZE):
        processed = process_fn(chunk)
        results.append(processed)
    return pd.concat(results)

def preprocess_chunk(chunk):
    """Preprocess a single chunk of data."""
    chunk = chunk[chunk["type"].isin(LABEL_MAP.keys())].copy()
    text_cols = ["text", "topic", "article", "biased_words"]
    chunk_processed = combine_text(chunk, text_cols)
    chunk_processed["label"] = chunk_processed["type"].map(LABEL_MAP)
    return chunk_processed

# Error Analysis
def perform_error_analysis(model, eval_df, X_eval, y_eval):
    """Analyze prediction errors."""
    logger.info("Performing error analysis")
    
    y_pred = model.predict(X_eval)
    y_proba = model.predict_proba(X_eval)
    
    error_df = eval_df.copy()
    error_df['true_label'] = error_df['label'].map(REVERSE_LABEL_MAP)
    error_df['predicted_label'] = [REVERSE_LABEL_MAP[p] for p in y_pred]
    error_df['correct'] = error_df['label'] == y_pred
    
    # Add confidence scores
    for i, label in REVERSE_LABEL_MAP.items():
        error_df[f'confidence_{label}'] = y_proba[:, i]
    error_df['confidence'] = [y_proba[i, pred] for i, pred in enumerate(y_pred)]
    
    # Save errors
    errors_only = error_df[~error_df['correct']].sort_values('confidence')
    errors_only.to_csv(ERROR_ANALYSIS_PATH, index=False)
    
    error_count = (~error_df['correct']).sum()
    error_rate = error_count / len(error_df)
    logger.info(f"Error rate: {error_rate:.4f} ({error_count}/{len(error_df)})")
    logger.info(f"Error analysis saved to {ERROR_ANALYSIS_PATH}")
    
    return errors_only

# Prediction Function
def predict_in_chunks(model, full_df, batch_size=2000):
    """Make predictions in batches with confidence scores."""
    all_preds = []
    all_confidence = []
    
    for i in range(0, len(full_df), batch_size):
        logger.info(f"Predicting batch {i//batch_size + 1}/{(len(full_df)//batch_size)+1}")
        chunk = full_df.iloc[i:i+batch_size]
        
        proba = model.predict_proba(chunk["combined_input"])
        pred_class = np.argmax(proba, axis=1)
        confidence = np.max(proba, axis=1)
        
        preds = [REVERSE_LABEL_MAP[p] for p in pred_class]
        
        all_preds.extend(preds)
        all_confidence.extend(confidence)
    
    return all_preds, all_confidence

# Main Pipeline
def main(file_path, vectorizer_type='count', classifier_type='naive_bayes', grid_search=True):
    """Main pipeline for Naive Bayes classification with optimized vectorizer."""
    logger.info(f"Starting Naive Bayes pipeline with {vectorizer_type} vectorizer")
    
    # 1. Load and preprocess data in chunks
    logger.info("Loading and preprocessing data in chunks")
    full_df = process_chunks(file_path, preprocess_chunk)
    
    # Save processed data
    processed_data_path = os.path.join(OUTPUT_DIR, "processed_data.parquet")
    full_df.to_parquet(processed_data_path, index=False)
    logger.info(f"Saved processed data to {processed_data_path}")
    
    # 2. Train-test split
    logger.info("Splitting data into train and eval sets")
    train_df, eval_df = train_test_split(
        full_df, 
        test_size=0.2, 
        stratify=full_df["label"], 
        random_state=42
    )
    
    # 3. Prepare data
    logger.info("Preparing data")
    X_train = train_df["combined_input"]
    y_train = train_df["label"].values
    X_eval = eval_df["combined_input"]
    y_eval = eval_df["label"].values
    
    # 4. Build pipeline
    logger.info("Setting up Naive Bayes pipeline")
    vectorizer = CountVectorizer(
        max_features=10000,
        min_df=3,
        max_df=0.9,
        ngram_range=(1, 2),
        tokenizer=custom_tokenizer,
        stop_words=None  # Stop words handled in custom tokenizer
    )
    
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('clf', MultinomialNB())
    ])
    
    # 5. Hyperparameter tuning
    if grid_search:
        logger.info("Performing hyperparameter tuning")
        param_grid = {
            'vectorizer__max_features': [5000, 10000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'clf__alpha': [0.1, 0.5, 1.0]
        }
        
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,
            scoring='f1_weighted',
            n_jobs=-1,
            verbose=1
        )
        
        logger.info("Training with GridSearchCV")
        grid_search.fit(X_train, y_train)
        pipeline = grid_search.best_estimator_
        logger.info(f"Best parameters: {grid_search.best_params_}")
    else:
        logger.info("Training without hyperparameter tuning")
        pipeline.fit(X_train, y_train)
    
    # 6. Evaluate model
    logger.info("Evaluating model")
    y_pred = pipeline.predict(X_eval)
    metrics = compute_metrics(y_eval, y_pred)
    plot_confusion_matrix(metrics["confusion_matrix"], list(LABEL_MAP.keys()))
    
    # 7. Save model
    logger.info("Saving model")
    model_path = os.path.join(MODELS_DIR, "naive_bayes_model.joblib")
    dump(pipeline, model_path)
    logger.info(f"Model saved to {model_path}")
    
    # 8. Error analysis
    error_analysis = perform_error_analysis(pipeline, eval_df, X_eval, y_eval)
    
    # 9. Make predictions
    logger.info("Making predictions")
    predictions, confidence_scores = predict_in_chunks(pipeline, full_df)
    full_df["predicted_bias_category"] = predictions
    full_df["confidence_score"] = confidence_scores
    
    # 10. Save results
    full_df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Results saved to {RESULTS_PATH}")
    
    # 11. Compute confidence statistics
    confidence_stats = {
        "mean_confidence": np.mean(confidence_scores),
        "std_confidence": np.std(confidence_scores),
        "min_confidence": np.min(confidence_scores),
        "max_confidence": np.max(confidence_scores)
    }
    logger.info(f"Confidence stats: {confidence_stats}")
    
    return full_df, confidence_stats, error_analysis

if __name__ == "__main__":
    input_file_path = "complete_balanced_data.csv"
    results_df, confidence_stats, error_analysis = main(
        input_file_path, 
        vectorizer_type='count',
        classifier_type='naive_bayes',
        grid_search=True
    )
    logger.info("Pipeline completed successfully")

2025-05-07 15:37:47,186 - INFO - Starting Naive Bayes pipeline with count vectorizer
2025-05-07 15:37:47,187 - INFO - Loading and preprocessing data in chunks
2025-05-07 15:37:59,114 - INFO - Saved processed data to ./opt_results/processed_data.parquet
2025-05-07 15:37:59,116 - INFO - Splitting data into train and eval sets
2025-05-07 15:37:59,167 - INFO - Preparing data
2025-05-07 15:37:59,169 - INFO - Setting up Naive Bayes pipeline
2025-05-07 15:37:59,169 - INFO - Performing hyperparameter tuning
2025-05-07 15:37:59,170 - INFO - Training with GridSearchCV


Fitting 3 folds for each of 12 candidates, totalling 36 fits


2025-05-07 15:41:33,944 - INFO - Best parameters: {'clf__alpha': 0.1, 'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2)}
2025-05-07 15:41:33,945 - INFO - Evaluating model
2025-05-07 15:41:38,611 - INFO - Computing metrics
2025-05-07 15:41:38,629 - INFO - Overall accuracy: 0.8528
2025-05-07 15:41:38,629 - INFO - Overall F1 score: 0.8506
2025-05-07 15:41:38,630 - INFO - Class 'left' - Precision: 0.7820, Recall: 0.9351, F1: 0.8517
2025-05-07 15:41:38,631 - INFO - Class 'center' - Precision: 0.9548, Recall: 0.7048, F1: 0.8110
2025-05-07 15:41:38,632 - INFO - Class 'right' - Precision: 0.8616, Recall: 0.9186, F1: 0.8892
2025-05-07 15:41:39,084 - INFO - Saving model
2025-05-07 15:41:39,434 - INFO - Model saved to ./opt_models/naive_bayes_model.joblib
2025-05-07 15:41:39,435 - INFO - Performing error analysis
2025-05-07 15:41:50,407 - INFO - Error rate: 0.1472 (2305/15660)
2025-05-07 15:41:50,408 - INFO - Error analysis saved to ./opt_results/error_analysis.csv
2025-05-07 1

### Test Model

In [13]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
MODEL_PATH = "./opt_models/naive_bayes_model.joblib"
OUTPUT_PATH = "./opt_results/sample_predictions.csv"

# Text Processing Function
def preprocess_text(text):
    """Preprocess text by converting to lowercase and handling non-string inputs."""
    if isinstance(text, list):
        return ' '.join(str(t).lower() for t in text if isinstance(t, str))
    return str(text).lower()

# Main Testing Function
def test_samples(samples):
    logger.info("Starting sample prediction pipeline with confidence scores")
    
    # 1. Prepare samples as DataFrame
    logger.info("Preprocessing samples")
    sample_df = pd.DataFrame({
        "combined_input": [preprocess_text(sample[0]) for sample in samples]
    })
    
    # 2. Load the trained model
    logger.info("Loading trained Naive Bayes model")
    if not os.path.exists(MODEL_PATH):
        logger.error(f"Model file not found at {MODEL_PATH}")
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
    
    model = load(MODEL_PATH)
    
    # 3. Make predictions and get confidence scores
    logger.info("Making predictions on samples")
    predictions = model.predict(sample_df["combined_input"])
    probabilities = model.predict_proba(sample_df["combined_input"])
    
    # Extract confidence scores for predicted classes
    confidence_scores = [prob[pred] for prob, pred in zip(probabilities, predictions)]
    
    # Add predictions and confidence scores to DataFrame
    sample_df["predicted_bias_category"] = [REVERSE_LABEL_MAP[p] for p in predictions]
    sample_df["confidence_score"] = confidence_scores
    
    # 4. Save results
    sample_df.to_csv(OUTPUT_PATH, index=False)
    logger.info(f"Sample predictions saved to {OUTPUT_PATH}")
    
    # 5. Log predictions with confidence scores
    for i, (text, pred, score) in enumerate(zip(sample_df["combined_input"], sample_df["predicted_bias_category"], sample_df["confidence_score"])):
        logger.info(f"Sample {i+1}:")
        logger.info(f"Predicted Bias: {pred} \n confidence score: ({score:.4f})")
        logger.info(f"Sample text (truncated): {text[:100]}...")
    
    return sample_df

if __name__ == "__main__":
    # Provided samples
    samples = [
        ["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
        In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
        Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
        Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
        We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
        The time for delay is over. The time to act is now."""],
        ["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
        From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
        America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
        We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit."""],
        ["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
        A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
        Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
        Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington."""],
        ["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
        The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
        In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
        This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
        ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
        Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
        Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
        The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country."""]
    ]
    
    # Run the prediction
    results_df = test_samples(samples)
    logger.info("Sample prediction completed successfully")

2025-05-07 16:30:40,128 - INFO - Starting sample prediction pipeline with confidence scores
2025-05-07 16:30:40,129 - INFO - Preprocessing samples
2025-05-07 16:30:40,131 - INFO - Loading trained Naive Bayes model


2025-05-07 16:30:40,326 - INFO - Making predictions on samples
2025-05-07 16:30:40,334 - INFO - Sample predictions saved to ./opt_results/sample_predictions.csv
2025-05-07 16:30:40,335 - INFO - Sample 1:
2025-05-07 16:30:40,336 - INFO - Predicted Bias: left 
 confidence score: (1.0000)
2025-05-07 16:30:40,337 - INFO - Sample text (truncated): as wildfires rage across california, floods displace thousands in the midwest, and heatwaves scorch ...
2025-05-07 16:30:40,338 - INFO - Sample 2:
2025-05-07 16:30:40,339 - INFO - Predicted Bias: right 
 confidence score: (1.0000)
2025-05-07 16:30:40,339 - INFO - Sample text (truncated): the united states thrives when government steps back and lets free enterprise lead. in recent years,...
2025-05-07 16:30:40,341 - INFO - Sample 3:
2025-05-07 16:30:40,341 - INFO - Predicted Bias: center 
 confidence score: (0.8471)
2025-05-07 16:30:40,342 - INFO - Sample text (truncated): as artificial intelligence tools become increasingly integrated into everyda