In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import logging
import pickle

### Ensemble model - logistic,rf,xgb,lgbm

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results"
LOGGING_DIR = "/content/drive/MyDrive/Colab Notebooks/logs"
RESULTS_PATH = "/content/drive/MyDrive/Colab Notebooks/predictions.csv"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model.pkl")

# Ensure directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load & Preprocess ----------------
def load_and_preprocess(file_path):
    logger.info("Loading and preprocessing data")
    df = pd.read_excel(file_path, engine='openpyxl')
    df = df[df["type"].isin(LABEL_MAP.keys())]

    logger.info("Original distribution:")
    logger.info(df["type"].value_counts())

    text_cols = ["text", "topic", "article", "biased_words"]
    df_processed = combine_text(df, text_cols)
    df_processed["label"] = df_processed["type"].map(LABEL_MAP)

    return df_processed

# ---------------- Metrics ----------------
def compute_metrics(labels, preds):
    logger.info("Computing metrics")
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline ----------------
def main(file_path):
    logger.info("Starting main pipeline")
    # Load and prepare data
    df = load_and_preprocess(file_path)

    # Train-Test Split
    logger.info("Splitting data")
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

    # Create a pipeline with TF-IDF
    logger.info("Setting up pipeline with TF-IDF")
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

    # Transform text to TF-IDF features
    logger.info("Transforming training data to TF-IDF")
    X_train_tfidf = tfidf.fit_transform(train_df["combined_input"])
    X_eval_tfidf = tfidf.transform(eval_df["combined_input"])

    # Apply SMOTE to balance the training data
    logger.info("Applying SMOTE for balancing")
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, train_df["label"])

    logger.info("Original train distribution:")
    logger.info(train_df["type"].value_counts())
    logger.info("Balanced train distribution:")
    logger.info(pd.Series(y_train_res).map(REVERSE_LABEL_MAP).value_counts())

    # Define individual models
    logger.info("Setting up ensemble models")
    logistic = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    lgbm = LGBMClassifier(random_state=42, verbose=-1)

    # Create the ensemble with soft voting
    ensemble = VotingClassifier(
        estimators=[
            ('logistic', logistic),
            ('rf', rf),
            ('xgb', xgb),
            ('lgbm', lgbm)
        ],
        voting='soft'  # Use soft voting to average probabilities
    )

    # Train the ensemble model
    logger.info("Training ensemble model")
    ensemble.fit(X_train_res, y_train_res)

    # Evaluate on validation set
    logger.info("Evaluating model")
    eval_preds = ensemble.predict(X_eval_tfidf)
    metrics = compute_metrics(eval_df["label"], eval_preds)
    logger.info("Evaluation Metrics:")
    logger.info(metrics)

    # Save the model and TF-IDF vectorizer
    logger.info("Saving model")
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump({'ensemble': ensemble, 'tfidf': tfidf}, f)
    logger.info(f"Model saved to {MODEL_PATH}")

    # Prediction on full dataset
    logger.info("Making predictions")
    X_full_tfidf = tfidf.transform(df["combined_input"])
    pred_labels = ensemble.predict(X_full_tfidf)
    df["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in pred_labels]
    df.to_csv(RESULTS_PATH, index=False)
    logger.info(f"Predictions saved to {RESULTS_PATH}")

if __name__ == "__main__":
    input_file_path = "/content/drive/MyDrive/Colab Notebooks/combined_data.xlsx"
    main(input_file_path)

Parameters: { "use_label_encoder" } are not used.



### Test the data

In [None]:
import pandas as pd
import pickle
import logging
import os
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model.pkl")

# Mount Google Drive (for Colab)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure the directory exists
except ImportError:
    logger.info("Not running in Colab; skipping drive mount.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create directory if not in Colab

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load Model and TF-IDF ----------------
def load_model():
    logger.info(f"Attempting to load model from {MODEL_PATH}")
    if not os.path.exists(MODEL_PATH):
        logger.error(f"Model file not found at {MODEL_PATH}. Please upload the file 'ensemble_model.pkl' to this directory.")
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}. Please ensure the file is uploaded to {OUTPUT_DIR}.")
    with open(MODEL_PATH, 'rb') as f:
        saved_data = pickle.load(f)
    return saved_data['ensemble'], saved_data['tfidf']

# ---------------- Test with Sample Data ----------------
def test_model():
    # Create sample data with true labels
    sample_data = pd.DataFrame({
        "text": [
            "The government must provide universal healthcare to ensure equality for all citizens.",
            "The new policy aims to balance economic growth with environmental sustainability.",
            "Lower taxes and deregulation are key to boosting economic freedom and growth."
        ],
        "topic": ["Healthcare", "Policy", "Economy"],
        "article": ["", "", ""],
        "biased_words": ["equality", "balance", "freedom"],
        "true_bias": ["left", "center", "right"]  # Hypothetical true labels
    })

    logger.info("Sample data created:")
    logger.info(sample_data)

    # Preprocess sample data
    text_cols = ["text", "topic", "article", "biased_words"]
    sample_data_processed = combine_text(sample_data, text_cols)

    # Load model and TF-IDF vectorizer
    ensemble, tfidf = load_model()

    # Transform sample data to TF-IDF features
    logger.info("Transforming sample data to TF-IDF features")
    X_sample_tfidf = tfidf.transform(sample_data_processed["combined_input"])

    # Make predictions and get probabilities
    logger.info("Making predictions")
    pred_labels = ensemble.predict(X_sample_tfidf)
    pred_probs = ensemble.predict_proba(X_sample_tfidf)
    confidences = np.max(pred_probs, axis=1)  # Take the maximum probability as confidence

    # Add predictions and confidences to the DataFrame
    sample_data["predicted_bias_category"] = [REVERSE_LABEL_MAP[i] for i in pred_labels]
    sample_data["confidence"] = confidences

    # Print formatted output for each sample
    for index, row in sample_data.iterrows():
        text = row["text"]
        true_label = row["true_bias"]
        readable_label = row["predicted_bias_category"]
        confidence = row["confidence"]
        print(f"\nText: {text[:100]}...")
        print(f"True Bias: {true_label} | Predicted: {readable_label} ({confidence:.2f})")

    return sample_data

if __name__ == "__main__":
    result = test_model()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Text: the government must provide universal healthcare to ensure equality for all citizens....
True Bias: left | Predicted: right (0.54)

Text: the new policy aims to balance economic growth with environmental sustainability....
True Bias: center | Predicted: right (0.46)

Text: lower taxes and deregulation are key to boosting economic freedom and growth....
True Bias: right | Predicted: right (0.47)




In [None]:
import pandas as pd
import pickle
import logging
import os
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model.pkl")
TEST_FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/test_sample.xlsx"

# Mount Google Drive (for Colab)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure the directory exists
except ImportError:
    logger.info("Not running in Colab; skipping drive mount.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create directory if not in Colab

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load Model and TF-IDF ----------------
def load_model():
    logger.info(f"Attempting to load model from {MODEL_PATH}")
    if not os.path.exists(MODEL_PATH):
        logger.error(f"Model file not found at {MODEL_PATH}. Please upload the file 'ensemble_model.pkl' to this directory.")
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}. Please ensure the file is uploaded to {OUTPUT_DIR}.")
    with open(MODEL_PATH, 'rb') as f:
        saved_data = pickle.load(f)
    return saved_data['ensemble'], saved_data['tfidf']

# ---------------- Test with New Data ----------------
def test_new_data():
    # Load the test file
    logger.info(f"Loading test data from {TEST_FILE_PATH}")
    df_test = pd.read_excel(TEST_FILE_PATH, engine='openpyxl')

    # Preview columns
    logger.info("Columns in the test dataset:")
    logger.info(df_test.columns)

    # Use first 30 rows for testing
    sample_rows = df_test.head(30)

    # Ensure required columns exist, fill with empty strings if missing
    required_cols = ["sentence", "topic", "article", "biased_words"]
    for col in required_cols:
        if col not in sample_rows.columns:
            sample_rows[col] = ""
    if 'type' not in sample_rows.columns:
        sample_rows['type'] = "Unknown"

    # Preprocess sample data
    text_cols = ["sentence", "topic", "article", "biased_words"]
    sample_data_processed = combine_text(sample_rows, text_cols)

    # Load model and TF-IDF vectorizer
    ensemble, tfidf = load_model()

    # Transform sample data to TF-IDF features
    logger.info("Transforming sample data to TF-IDF features")
    X_sample_tfidf = tfidf.transform(sample_data_processed["combined_input"])

    # Make predictions and get probabilities
    logger.info("Making predictions")
    pred_labels = ensemble.predict(X_sample_tfidf)
    pred_probs = ensemble.predict_proba(X_sample_tfidf)
    confidences = np.max(pred_probs, axis=1)  # Take the maximum probability as confidence

    # Print formatted output for each sample
    for idx, (index, row) in enumerate(sample_rows.iterrows()):
        text = row["sentence"]
        true_label = row["type"]
        readable_label = REVERSE_LABEL_MAP[pred_labels[idx]]
        confidence = confidences[idx]
        print(f"\nText: {text[:100]}...")
        print(f"True Bias: {true_label} | Predicted: {readable_label} ({confidence:.2f})")

if __name__ == "__main__":
    test_new_data()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_rows[col] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(preprocess_text) if col in df else ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)



Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: right (0.84)

Text: "orange is the new black" star yael stone is renouncing her u.s. gree



### With optimization anf hyperparameter tuning methods

In [None]:
import pandas as pd
import pickle
import logging
import os
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results_opt"
LOGGING_DIR = "/content/drive/MyDrive/Colab Notebooks/logs_opt"
RESULTS_PATH = "/content/drive/MyDrive/Colab Notebooks/predictions_bias.csv"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model_bias.pkl")

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load & Preprocess ----------------
def load_and_preprocess(file_path):
    logger.info("Loading and preprocessing data")
    df = pd.read_excel(file_path, engine='openpyxl')
    df = df[df["type"].isin(LABEL_MAP.keys())]

    logger.info("Original distribution:")
    logger.info(df["type"].value_counts())

    text_cols = ["text", "topic", "article", "biased_words"]
    df_processed = combine_text(df, text_cols)
    df_processed["label"] = df_processed["type"].map(LABEL_MAP)

    return df_processed

# ---------------- Metrics ----------------
def compute_metrics(labels, preds):
    logger.info("Computing metrics")
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline with Optimization ----------------
def main(file_path):
    logger.info("Starting main pipeline")
    # Load and prepare data
    df = load_and_preprocess(file_path)

    # Train-Test Split
    logger.info("Splitting data")
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

    # Create a pipeline with TF-IDF with optimized features
    logger.info("Setting up pipeline with TF-IDF")
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), min_df=5)

    # Transform text to TF-IDF features
    logger.info("Transforming training data to TF-IDF")
    X_train_tfidf = tfidf.fit_transform(train_df["combined_input"])
    X_eval_tfidf = tfidf.transform(eval_df["combined_input"])

    # Scale features for gradient descent-based models
    scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
    X_train_scaled = scaler.fit_transform(X_train_tfidf)
    X_eval_scaled = scaler.transform(X_eval_tfidf)

    # Apply SMOTE to balance the training data
    logger.info("Applying SMOTE for balancing")
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, train_df["label"])

    logger.info("Original train distribution:")
    logger.info(train_df["type"].value_counts())
    logger.info("Balanced train distribution:")
    logger.info(pd.Series(y_train_res).map(REVERSE_LABEL_MAP).value_counts())

    # Define individual models with hyperparameter tuning
    logger.info("Setting up ensemble models with gradient descent optimization")

    # Logistic Regression
    logistic = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
    logistic_params = {
        'C': [0.01, 0.1, 1, 10],
        'max_iter': [1000, 2000, 3000],
    }
    logistic_search = RandomizedSearchCV(logistic, logistic_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

    # Random Forest
    rf = RandomForestClassifier(random_state=42)
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    rf_search = RandomizedSearchCV(rf, rf_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

    # XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42,tree_method='gpu_hist')
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    xgb_search = RandomizedSearchCV(xgb, xgb_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

    # Train all models with hyperparameter tuning
    logger.info("Training Logistic Regression")
    logistic_search.fit(X_train_res, y_train_res)
    logger.info(f"Best Logistic Regression params: {logistic_search.best_params_}")

    logger.info("Training Random Forest")
    rf_search.fit(X_train_res, y_train_res)
    logger.info(f"Best Random Forest params: {rf_search.best_params_}")

    logger.info("Training XGBoost")
    xgb_search.fit(X_train_res, y_train_res)
    logger.info(f"Best XGBoost params: {xgb_search.best_params_}")

    # Create ensemble model
    logger.info("Creating ensemble model")
    ensemble = VotingClassifier(
        estimators=[
            ('logistic', logistic_search.best_estimator_),
            ('random_forest', rf_search.best_estimator_),
            ('xgboost', xgb_search.best_estimator_)
        ],
        voting='soft'  # Use soft voting for probability-based weighting
    )
    ensemble.fit(X_train_res, y_train_res)

    # Evaluate on validation set
    logger.info("Evaluating on validation set")
    val_preds = ensemble.predict(X_eval_scaled)
    metrics = compute_metrics(eval_df["label"], val_preds)
    logger.info(f"Validation metrics: {metrics}")

    # Save predictions
    logger.info("Saving predictions")
    eval_df["prediction"] = val_preds
    eval_df["predicted_type"] = eval_df["prediction"].map(REVERSE_LABEL_MAP)
    eval_df.to_csv(RESULTS_PATH, index=False)

    # Save model
    logger.info(f"Saving model to {MODEL_PATH}")
    joblib.dump({
        'model': ensemble,
        'tfidf': tfidf,
        'scaler': scaler,
        'label_map': LABEL_MAP
    }, MODEL_PATH)

    logger.info("Pipeline completed successfully")
    return metrics

if __name__ == "__main__":
    # Example usage (replace with your actual file path)
    file_path = "/content/drive/MyDrive/Colab Notebooks/balanced_data.xlsx"
    main(file_path)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [None]:
import pandas as pd
import pickle
import logging
import os
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model.pkl")

### Test the model

In [None]:
import pandas as pd
import pickle
import logging
import os
import numpy as np
import joblib

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results_opt"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model_opt.pkl")
TFIDF_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")
TEST_FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/test_sample.xlsx"

# Mount Google Drive (for Colab)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    os.makedirs(OUTPUT_DIR, exist_ok=True)
except ImportError:
    logger.info("Not running in Colab; skipping drive mount.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load Model and TF-IDF ----------------
def load_model():
    logger.info(f"Attempting to load model from {MODEL_PATH}")
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")

    saved_data = joblib.load(MODEL_PATH)

    if isinstance(saved_data, dict):
        logger.info("Loaded dictionary format.")
        model = saved_data.get("ensemble") or saved_data.get("model")
        tfidf = saved_data.get("tfidf") or saved_data.get("vectorizer")
        if model is None or tfidf is None:
            raise KeyError("Missing 'ensemble'/'model' or 'tfidf'/'vectorizer' in saved dictionary.")
    else:
        logger.info("Loaded model object directly. Trying separate TF-IDF.")
        model = saved_data
        if not os.path.exists(TFIDF_PATH):
            raise FileNotFoundError(f"TF-IDF vectorizer file not found at {TFIDF_PATH}")
        tfidf = joblib.load(TFIDF_PATH)

    return model, tfidf

# ---------------- Test with New Data ----------------
def test_new_data():
    logger.info(f"Loading test data from {TEST_FILE_PATH}")
    df_test = pd.read_excel(TEST_FILE_PATH, engine='openpyxl')

    logger.info("Columns in the test dataset:")
    logger.info(df_test.columns)

    sample_rows = df_test.head(30)

    required_cols = ["sentence", "topic", "article", "biased_words"]
    for col in required_cols:
        if col not in sample_rows.columns:
            sample_rows[col] = ""
    if 'type' not in sample_rows.columns:
        sample_rows['type'] = "Unknown"

    text_cols = ["sentence", "topic", "article", "biased_words"]
    sample_data_processed = combine_text(sample_rows, text_cols)

    ensemble, tfidf = load_model()

    logger.info("Transforming sample data to TF-IDF features")
    X_sample_tfidf = tfidf.transform(sample_data_processed["combined_input"])

    logger.info("Making predictions")
    pred_labels = ensemble.predict(X_sample_tfidf)
    pred_probs = ensemble.predict_proba(X_sample_tfidf)
    confidences = np.max(pred_probs, axis=1)

    for idx, (index, row) in enumerate(sample_rows.iterrows()):
        text = row["sentence"]
        true_label = row["type"]
        readable_label = REVERSE_LABEL_MAP[pred_labels[idx]]
        confidence = confidences[idx]
        print(f"\nText: {text[:100]}...")
        print(f"True Bias: {true_label} | Predicted: {readable_label} ({confidence:.2f})")

if __name__ == "__main__":
    test_new_data()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: left (0.45)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: left (0.45)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: left (0.45)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: left (0.45)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native ...
True Bias: right | Predicted: left (0.45)

Text: "orange is the new black" star yael stone is renouncing her u.s. green card to return to her native

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_rows[col] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(preprocess_text) if col in df else ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)


### Chunking method

In [None]:
import logging
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from xgboost import XGBClassifier
import torch


In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check GPU availability
if torch.cuda.is_available():
    logger.info(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    logger.warning("No GPU detected. Falling back to CPU.")

# Constants
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results_opt"
LOGGING_DIR = "/content/drive/MyDrive/Colab Notebooks/logs_opt"
RESULTS_PATH = "/content/drive/MyDrive/Colab Notebooks/predictions_bias.csv"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model_bias.pkl")
CHUNK_SIZE = 10000  # Adjust based on memory constraints

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

# ---------------- Text Processing Functions ----------------
def preprocess_text(text):
    return ' '.join(str(t).lower() for t in text if isinstance(t, str)) if isinstance(text, list) else str(text).lower()

def combine_text(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(preprocess_text) if col in df else ""
    df["combined_input"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

# ---------------- Load & Preprocess in Chunks ----------------
def load_and_preprocess_chunked(file_path, chunk_size=CHUNK_SIZE):
    logger.info("Loading and preprocessing data in chunks")
    text_cols = ["text", "topic", "article", "biased_words"]
    chunks = []

    reader = pd.read_csv(file_path,chunksize=chunk_size)
    for i, chunk in enumerate(reader):
        logger.info(f"Processing chunk {i+1}")
        chunk = chunk[chunk["type"].isin(LABEL_MAP.keys())]
        chunk_processed = combine_text(chunk, text_cols)
        chunk_processed["label"] = chunk_processed["type"].map(LABEL_MAP)
        chunks.append(chunk_processed)

    df_processed = pd.concat(chunks, ignore_index=True)

    logger.info("Data distribution:")
    logger.info(df_processed["type"].value_counts())

    return df_processed

# ---------------- Metrics ----------------
def compute_metrics(labels, preds):
    logger.info("Computing metrics")
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Main Pipeline with Optimization ----------------
def main(file_path):
    logger.info("Starting main pipeline with chunked processing and GPU support")
    # Load and prepare data
    df = load_and_preprocess_chunked(file_path)

    # Train-Test Split
    logger.info("Splitting data")
    train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

    # Create a pipeline with TF-IDF with optimized features
    logger.info("Setting up pipeline with TF-IDF")
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), min_df=5)

    # Transform text to TF-IDF features
    logger.info("Transforming training data to TF-IDF")
    X_train_tfidf = tfidf.fit_transform(train_df["combined_input"])
    X_eval_tfidf = tfidf.transform(eval_df["combined_input"])

    # Scale features for gradient descent-based models
    scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
    X_train_scaled = scaler.fit_transform(X_train_tfidf)
    X_eval_scaled = scaler.transform(X_eval_tfidf)

    # Use original balanced data for training
    logger.info("Training distribution:")
    logger.info(train_df["type"].value_counts())

    # Define individual models with hyperparameter tuning
    logger.info("Setting up ensemble models with gradient descent optimization")

    # Logistic Regression
    logistic = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
    logistic_params = {
        'C': [0.01, 0.1, 1, 10],
        'max_iter': [1000, 2000, 3000],
    }
    logistic_search = RandomizedSearchCV(logistic, logistic_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

    # Random Forest
    rf = RandomForestClassifier(random_state=42)
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    rf_search = RandomizedSearchCV(rf, rf_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

    # XGBoost with GPU support
    xgb = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
        tree_method='gpu_hist',  # Explicitly use GPU
        predictor='gpu_predictor'
    )
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    xgb_search = RandomizedSearchCV(xgb, xgb_params, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=1)

    # Train all models with hyperparameter tuning
    logger.info("Training Logistic Regression")
    logistic_search.fit(X_train_scaled, train_df["label"])
    logger.info(f"Best Logistic Regression params: {logistic_search.best_params_}")

    logger.info("Training Random Forest")
    rf_search.fit(X_train_scaled, train_df["label"])
    logger.info(f"Best Random Forest params: {rf_search.best_params_}")

    logger.info("Training XGBoost with GPU")
    xgb_search.fit(X_train_scaled, train_df["label"])
    logger.info(f"Best XGBoost params: {xgb_search.best_params_}")

    # Create ensemble model
    logger.info("Creating ensemble model")
    ensemble = VotingClassifier(
        estimators=[
            ('logistic', logistic_search.best_estimator_),
            ('random_forest', rf_search.best_estimator_),
            ('xgboost', xgb_search.best_estimator_)
        ],
        voting='soft'  # Use soft voting for probability-based weighting
    )
    ensemble.fit(X_train_scaled, train_df["label"])

    # Evaluate on validation set
    logger.info("Evaluating on validation set")
    val_preds = ensemble.predict(X_eval_scaled)
    metrics = compute_metrics(eval_df["label"], val_preds)
    logger.info(f"Validation metrics: {metrics}")

    # Save predictions
    logger.info("Saving predictions")
    eval_df["prediction"] = val_preds
    eval_df["predicted_type"] = eval_df["prediction"].map(REVERSE_LABEL_MAP)
    eval_df.to_csv(RESULTS_PATH, index=False)

    # Save final model
    logger.info(f"Saving final model to {MODEL_PATH}")
    joblib.dump({
        'model': ensemble,
        'tfidf': tfidf,
        'scaler': scaler,
        'label_map': LABEL_MAP
    }, MODEL_PATH)

    logger.info("Pipeline completed successfully")
    return metrics

if __name__ == "__main__":
    # Example usage (replace with your actual file path)
    file_path = "/content/drive/MyDrive/Colab Notebooks/balanced_data.csv"
    main(file_path)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", dev

In [None]:
import joblib
import logging
import os
import numpy as np

# ---------------- Set Up Logging ----------------
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ---------------- Constants ----------------
LABEL_MAP = {"left": 0, "center": 1, "right": 2}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/results_opt"
MODEL_PATH = os.path.join(OUTPUT_DIR, "ensemble_model_bias.pkl")

# ---------------- Mount Google Drive (Colab) ----------------
try:
    from google.colab import drive
    drive.mount('/content/drive')
    os.makedirs(OUTPUT_DIR, exist_ok=True)
except ImportError:
    logger.info("Not running in Colab; skipping drive mount.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- Text Preprocessing ----------------
def preprocess_text(text):
    return str(text).lower()

# ---------------- Load Model ----------------
def load_model():
    logger.info(f"Attempting to load model and TF-IDF from {MODEL_PATH}")
    if not os.path.exists(MODEL_PATH):
        logger.error(f"Model file not found at {MODEL_PATH}. Please upload the file.")
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}.")

    saved_data = joblib.load(MODEL_PATH)

    ensemble = saved_data.get('model') or saved_data.get('ensemble')
    tfidf = saved_data['tfidf']

    return ensemble, tfidf

# ---------------- Test Model with Raw Text ----------------
def test_model():
    sample_texts= [
        ["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
        In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
        Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
        Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
        We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
        The time for delay is over. The time to act is now."""
        ],
        ["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
        From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
        America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
        We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit.
        """],
        ["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
        A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
        Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
        Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington.
        """],
        ["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
        The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
        In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
        This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
        ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
        Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
        Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
        The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country.
        """]
        ]

    # Load model and TF-IDF
    ensemble, tfidf = load_model()

    # Process and predict
    for text in sample_texts:
        processed_text = preprocess_text(text)
        tfidf_input = tfidf.transform([processed_text])

        predicted_label = ensemble.predict(tfidf_input)[0]
        predicted_proba = ensemble.predict_proba(tfidf_input)[0]
        confidence = np.max(predicted_proba)

        readable_label = REVERSE_LABEL_MAP[predicted_label]

        print(f"\nText: {text[:100]}...")
        print(f"Predicted Bias: {readable_label} (Confidence: {confidence:.2f})")

# ---------------- Main Execution ----------------
if __name__ == "__main__":
    test_model()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.



Text: ['As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.\n        In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.\n        Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress h