<a href="https://colab.research.google.com/github/samvass/capstone-llm-news-detector/blob/master/models/multi-modal/xgboost_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference (XGBoost binary classification in Python): https://forecastegy.com/posts/xgboost-binary-classification-python/

# Config ⚙️

In [2]:
import pandas as pd
import numpy as np
import torch
import torchvision.transforms as torchvision_transforms
from torchvision.models import resnet18
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
import os
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from joblib import dump

In [3]:
# Settings shared by the cells below; edit here rather than inside functions.
config = {
    "epochs": 2,
    "learning_rate": 0.0001,
    # Mini-batch size handed to the DataLoader.
    "batch_size": 16,
    # Use the GPU when PyTorch can see one, otherwise stay on the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    # CSV file(s) read as the AI-side dataset.
    "ai_dataset_path": ["./newsgpt_dataset.csv", "./rewritten_AI_data.csv"],
    # CSV file(s) read as the real-side dataset.
    "real_dataset_path": ["./cnn_dataset.csv"],
    # Image folders; the dataset classes pick one per row based on the label.
    "ai_img_dir": "./newsgpt_images",
    "real_img_dir": "./cnn_images",
}

# Feature Extractor 🤖

In [4]:
class MultiModalModelFeatureExtractor(nn.Module):
    """Frozen ResNet-18 + BERT encoder that emits one feature vector per sample.

    Both backbones are loaded with pretrained weights, have every parameter
    frozen (``requires_grad = False``) and are kept permanently in eval mode,
    so the module behaves as a deterministic feature extractor.

    NOTE(review): the stock ``resnet18`` head is left in place, so the image
    part of the vector is presumably the classifier output rather than pooled
    convolutional features -- confirm this is intended before changing it,
    since any model fitted on these features depends on the current width.
    """

    def __init__(self):
        super(MultiModalModelFeatureExtractor, self).__init__()
        self.resnet = resnet18(pretrained=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Freeze the ResNet parameters
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Freeze the BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False

        # Freezing weights does not disable train-time layer behaviour:
        # BatchNorm would still normalise with per-batch statistics (and
        # update its running averages) and dropout would still be active,
        # making the "fixed" features depend on batch composition.
        self._freeze_backbone_modes()

    def _freeze_backbone_modes(self):
        """Put both pretrained backbones into eval mode."""
        self.resnet.eval()
        self.bert.eval()

    def train(self, mode=True):
        """Switch the wrapper's mode while keeping the frozen backbones in eval.

        Args:
            mode: same meaning as in ``nn.Module.train``.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        self._freeze_backbone_modes()
        return self

    def forward(self, image_inputs, text_inputs):
        """Encode a batch of images and tokenized texts into joint features.

        Args:
            image_inputs: image batch passed straight to the ResNet.
            text_inputs: mapping of keyword arguments for ``BertModel`` (it is
                unpacked with ``**``), e.g. the output of a BERT tokenizer.

        Returns:
            Tensor with one row per sample: flattened ResNet output followed by
            the BERT hidden state of the first ([CLS]) token.
        """
        # Process image input
        image_features = self.resnet(image_inputs)
        image_features = torch.flatten(image_features, 1)  # Flatten the features

        # Process text input
        text_features = self.bert(**text_inputs).last_hidden_state[:, 0, :]  # Get the [CLS] token's features

        # Concatenate features
        combined_features = torch.cat((image_features, text_features), dim=1)

        return combined_features

# Data 📊

In [5]:
class MultiModalDataset(Dataset):
    """Row-wise view over a dataframe of articles with an attached image.

    The dataframe must expose ``Text``, ``Title``, ``Image`` and ``Label``
    columns. Each item is ``(title, text, image, label)``; the image is read
    from ``ai_img_dir`` when the label equals 1 and from ``real_img_dir``
    otherwise, converted to RGB and passed through ``transform`` if one is set.
    """

    def __init__(self, dataframe, ai_img_dir, real_img_dir, transform=None):
        columns = dataframe.columns

        self.dataframe = dataframe
        self.transform = transform
        # Resolve column positions once so item access can use fast iloc lookups.
        self.text_idx = columns.get_loc('Text')
        self.title_idx = columns.get_loc('Title')
        self.image_idx = columns.get_loc('Image')
        self.label_idx = columns.get_loc('Label')
        self.ai_img_dir = ai_img_dir
        self.real_img_dir = real_img_dir

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(title, text, image, label)`` for the row at position ``idx``."""
        # Samplers may hand over tensor indices; iloc wants plain Python values.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        frame = self.dataframe

        text = frame.iloc[position, self.text_idx]
        title = frame.iloc[position, self.title_idx]
        label = frame.iloc[position, self.label_idx]

        # Label 1 rows keep their images in the AI folder, everything else in the real one.
        if label == 1:
            folder = self.ai_img_dir
        else:
            folder = self.real_img_dir

        file_name = str(frame.iloc[position, self.image_idx])
        image = Image.open(os.path.join(folder, file_name)).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return title, text, image, label

In [6]:
class ImageDataset(Dataset):
    """
    Image-only dataset used to load the training images when computing the
    per-channel mean and std for the normalization transform of the MMM.

    input: dataframe with Image, Label columns
    """

    def __init__(self, dataframe, ai_img_dir, real_img_dir, transform=None):
        columns = dataframe.columns

        self.dataframe = dataframe
        self.ai_img_dir = ai_img_dir
        self.real_img_dir = real_img_dir
        # Positional column indices, resolved once for iloc access.
        self.image_idx = columns.get_loc('Image')
        self.label_idx = columns.get_loc('Label')
        self.transform = transform

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image for row ``idx``."""
        frame = self.dataframe
        label = frame.iloc[idx, self.label_idx]

        # Label 1 rows live in the AI image folder, all others in the real one.
        if label == 1:
            folder = self.ai_img_dir
        else:
            folder = self.real_img_dir

        file_name = str(frame.iloc[idx, self.image_idx])
        image = Image.open(os.path.join(folder, file_name)).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image

In [7]:
import torch
from torchvision import transforms as torchvision_transforms

def get_mean_std(loader):
    """Compute per-channel mean and standard deviation over every image in ``loader``.

    Args:
        loader: iterable yielding image batches shaped
            (batch_size, num_channels, height, width). All batches are
            expected to share the same height and width.

    Returns:
        Tuple ``(mean, std)`` of 1-D tensors with one entry per channel.

    Raises:
        ValueError: if ``loader`` yields no images.
    """
    # Running sums of the first and second moments, weighted by image count.
    channel_sum, channel_sum_squared, num_images = 0, 0, 0

    for images in loader:
        batch_size = images.shape[0]
        # Weight each batch by its size: the last batch is usually smaller,
        # and averaging per-batch means with equal weight would over-count it.
        channel_sum = channel_sum + torch.mean(images, dim=[0, 2, 3]) * batch_size
        channel_sum_squared = channel_sum_squared + torch.mean(images ** 2, dim=[0, 2, 3]) * batch_size
        num_images += batch_size

    if num_images == 0:
        raise ValueError("get_mean_std received an empty loader; cannot compute statistics")

    mean = channel_sum / num_images
    # std = sqrt(E[X^2] - (E[X])^2)
    variance = channel_sum_squared / num_images - mean ** 2
    # Rounding can push the variance of (near-)constant channels slightly
    # below zero, which would turn the square root into NaN.
    std = torch.clamp(variance, min=0.0) ** 0.5

    return mean, std

def get_normalization_values(dataframe, ai_img_dir, real_img_dir):
    """Compute the per-channel mean/std used to normalize the images.

    Args:
        dataframe: dataframe with ``Image`` and ``Label`` columns.
        ai_img_dir: folder holding the images of rows labelled 1.
        real_img_dir: folder holding the images of all other rows.

    Returns:
        Tuple ``(mean, std)`` of per-channel tensors from ``get_mean_std``.
    """
    # Use the same geometric preprocessing as the transform built in
    # get_data (Resize(256) -> CenterCrop(224) on the PIL image, then
    # ToTensor) so the statistics describe exactly the pixels that are later
    # normalized. Resizing to (224, 224) on tensors instead would distort the
    # aspect ratio and may interpolate differently from the PIL path.
    transforms = torchvision_transforms.Compose([
        torchvision_transforms.Resize(256),
        torchvision_transforms.CenterCrop(224),
        torchvision_transforms.ToTensor(),
    ])

    dataset = ImageDataset(dataframe, ai_img_dir, real_img_dir, transforms)

    batch_size = 32
    # Order is irrelevant for dataset-wide statistics; a fixed order keeps
    # the result reproducible from run to run.
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    mean, std = get_mean_std(loader)
    return mean, std

In [8]:
def _load_csv_group(paths):
    """Read every CSV in ``paths`` and stack the rows into a single DataFrame."""
    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    # The earlier loop-based version prepended each newly read file, so rows
    # of later paths came first. Keep that order so anything seeded
    # downstream (e.g. a train/test split) sees the rows in the same sequence.
    frames = [pd.read_csv(path) for path in reversed(list(paths))]
    if not frames:
        return pd.DataFrame()

    # One concat instead of growing a frame inside a loop; ignore_index
    # avoids duplicated index labels coming from the individual files.
    return pd.concat(frames, ignore_index=True)


def get_data(ai_dataset_path, real_dataset_path, ai_img_dir, real_img_dir):
    """Build the multi-modal dataset from the AI and real CSV files.

    Args:
        ai_dataset_path: path or list of paths to the AI-side CSV files.
        real_dataset_path: path or list of paths to the real-side CSV files.
        ai_img_dir: folder holding the images of rows labelled 1.
        real_img_dir: folder holding the images of all other rows.

    Returns:
        A ``MultiModalDataset`` over the combined rows whose images are
        resized, center-cropped, converted to tensors and normalized with the
        mean/std measured on this same data.
    """
    # get the datasets
    ai_data = _load_csv_group(ai_dataset_path)
    real_data = _load_csv_group(real_dataset_path)

    combined_data = pd.concat([ai_data, real_data], ignore_index=True)

    # Normalization statistics only need the image file name and its label.
    mean, std = get_normalization_values(combined_data[['Image', 'Label']], ai_img_dir, real_img_dir)

    print(mean, std)

    transform = torchvision_transforms.Compose([
        torchvision_transforms.Resize(256),
        torchvision_transforms.CenterCrop(224),
        torchvision_transforms.ToTensor(),
        torchvision_transforms.Normalize(mean=mean, std=std),
    ])

    # create dataset class
    dataset = MultiModalDataset(
        dataframe=combined_data,
        ai_img_dir=ai_img_dir,
        real_img_dir=real_img_dir,
        transform=transform
    )

    return dataset

# Testing 🧪

In [None]:
def test(model, X_test, y_test):
    """Print accuracy, log loss and ROC AUC of ``model`` on the held-out split.

    Args:
        model: fitted classifier exposing ``score`` and ``predict_proba``.
        X_test: feature matrix of the evaluation samples.
        y_test: true labels of the evaluation samples.
    """
    score = model.score(X_test, y_test)
    print("Accuracy: %.2f%%" % (score * 100.0))

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Log loss of the predicted probabilities
    loss_value = log_loss(y_test, positive_proba)
    print(f"Log Loss: {loss_value}")

    # Area under the ROC curve
    auc_value = roc_auc_score(y_test, positive_proba)
    print(f"ROC AUC Score: {auc_value}")

# Pipeline 😎

In [None]:
def make(run_config=None):
    """Build the DataLoader that feeds the feature extractor.

    Args:
        run_config: optional settings mapping providing ``ai_dataset_path``,
            ``real_dataset_path``, ``ai_img_dir``, ``real_img_dir`` and
            ``batch_size``. Defaults to the notebook-level ``config`` so that
            both ``make()`` and ``make(config)`` work.

    Returns:
        A ``DataLoader`` over the dataset returned by ``get_data``.
    """
    # Accepting the settings explicitly keeps ``make(config)`` callers from
    # failing with a TypeError while still supporting the zero-argument form.
    settings = config if run_config is None else run_config

    dataset = get_data(
        settings["ai_dataset_path"],
        settings["real_dataset_path"],
        settings["ai_img_dir"],
        settings["real_img_dir"],
    )

    data_loader = DataLoader(dataset, batch_size=settings["batch_size"])
    return data_loader

In [None]:
def model_pipeline(config):
    """Extract multi-modal features, fit an XGBoost classifier, evaluate and save it.

    Args:
        config: settings dict. ``device`` selects where feature extraction
            runs (defaults to ``"cpu"``); the optional ``model_path`` key
            overrides where the fitted model is written.

    Returns:
        The fitted ``XGBClassifier``.
    """
    device = config.get("device", "cpu")
    data_loader = make(config)

    # BERT consumes token-id tensors, not raw strings, so every text batch
    # has to go through the tokenizer matching the pretrained checkpoint.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    feature_extractor = MultiModalModelFeatureExtractor().to(device)
    # Inference only: fix BatchNorm/dropout behaviour so features are deterministic.
    feature_extractor.eval()

    features_list = []
    labels_list = []

    # gather x, y values
    with torch.no_grad():  # no gradients are needed for frozen feature extraction
        for _, texts, images, labels in tqdm(data_loader, desc="Extracting features"):
            text_inputs = tokenizer(
                [str(text) for text in texts],  # str() guards against non-string cells
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)

            combined_features = feature_extractor(images.to(device), text_inputs)

            # Move to host memory right away so device memory does not grow per batch.
            features_list.append(combined_features.cpu().numpy())
            labels_list.append(labels.cpu().numpy())

    # Concatenate along the sample axis. np.vstack would stack the 1-D label
    # batches as rows of a matrix (and fail when the last batch is smaller).
    X = np.concatenate(features_list, axis=0)
    y = np.concatenate(labels_list, axis=0)

    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # training
    model = XGBClassifier(objective='binary:logistic')
    model.fit(X_train, y_train)

    # testing
    test(model, X_test, y_test)

    # save the model
    model_path = config.get("model_path", '/content/drive/MyDrive/xgb_model.joblib')
    dump(model, model_path)

    return model