In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision.transforms as torchvision_transforms
from torchvision.models import resnet18
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
import os
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
%%capture
!pip install wandb --upgrade

In [3]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msamuelvasserman[0m ([33mllm-news-detector[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
config = dict(
    epochs=2,
    learning_rate=0.0001,
    batch_size=16,
    device="cuda" if torch.cuda.is_available() else "cpu",
    ai_dataset_path=["./newsgpt_dataset.csv", "./rewritten_AI_data.csv"],
    real_dataset_path=["./cnn_dataset.csv"],
    ai_img_dir="./newsgpt_images",
    real_img_dir="./cnn_images",
)

# Model 🤖



In [5]:
class MultiModalModel(nn.Module):
    def __init__(self):
        super(MultiModalModel, self).__init__()
        # Load pre-trained models
        self.resnet = resnet18(pretrained=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Freeze the ResNet parameters
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Freeze the BERT parameters
        for param in self.bert.parameters():
            param.requires_grad = False

        # Define concatonated layers
        self.multi_modal_layers = nn.Sequential(
            nn.Linear(in_features=self.resnet.fc.out_features + self.bert.config.hidden_size, out_features=512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, image_inputs, text_inputs):
        # Process image input
        image_features = self.resnet(image_inputs)
        image_features = torch.flatten(image_features, 1)  # Flatten the features

        # Process text input
        text_features = self.bert(**text_inputs).last_hidden_state[:, 0, :]  # Get the [CLS] token's features

        # Concatenate features
        combined_features = torch.cat((image_features, text_features), dim=1)

        # Pass through additional layers
        output = self.multi_modal_layers(combined_features)

        output_binary = self.sigmoid(output)

        return output_binary

# Data 📊


In [6]:
class MultiModalDataset(Dataset):
    def __init__(self, dataframe, ai_img_dir, real_img_dir, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.text_idx = dataframe.columns.get_loc('Text')
        self.title_idx = dataframe.columns.get_loc('Title')
        self.image_idx = dataframe.columns.get_loc('Image')
        self.label_idx = dataframe.columns.get_loc('Label')
        self.ai_img_dir = ai_img_dir
        self.real_img_dir = real_img_dir

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.dataframe.iloc[idx, self.text_idx]
        title = self.dataframe.iloc[idx, self.title_idx]
        label = self.dataframe.iloc[idx, self.label_idx]

        img_folder = self.ai_img_dir if label == 1 else self.real_img_dir
        img_name = os.path.join(img_folder, str(self.dataframe.iloc[idx, self.image_idx]))
        image = Image.open(img_name).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return title, text, image, label

In [7]:
class ImageDataset(Dataset):
    """
    create image dataset for loading training images and calculating mean and std of normalization
    for image transforms in the MMM

    input: dataframe with Image, Label columns
    """
    def __init__(self, dataframe, ai_img_dir, real_img_dir, transform=None):
        self.dataframe = dataframe
        self.ai_img_dir = ai_img_dir
        self.real_img_dir = real_img_dir
        self.image_idx = dataframe.columns.get_loc('Image')
        self.label_idx = dataframe.columns.get_loc('Label')
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):

        label = self.dataframe.iloc[idx, self.label_idx]

        img_folder = self.ai_img_dir if label == 1 else self.real_img_dir
        img_name = os.path.join(img_folder, str(self.dataframe.iloc[idx, self.image_idx]))
        image = Image.open(img_name).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image

In [8]:
import torch
from torchvision import transforms as torchvision_transforms

def get_mean_std(loader):
    # Variables to accumulate the sum and sum of squares
    channel_sum, channel_sum_squared, num_batches = 0, 0, 0

    for images in loader:
        # Assumes images are of shape (batch_size, num_channels, height, width)
        channel_sum += torch.mean(images, dim=[0, 2, 3])
        channel_sum_squared += torch.mean(images**2, dim=[0, 2, 3])
        num_batches += 1

    # Calculate the mean and std dev
    mean = channel_sum / num_batches
    # std = sqrt(E[X^2] - (E[X])^2)
    std = (channel_sum_squared / num_batches - mean ** 2) ** 0.5

    return mean, std

def get_normalization_values(dataframe, ai_img_dir, real_img_dir):
    transforms = torchvision_transforms.Compose([
        torchvision_transforms.ToTensor(),
        torchvision_transforms.Resize((224, 224)),
        torchvision_transforms.CenterCrop(224),
    ])

    # Assuming ImageDataset is defined elsewhere and correctly handles the dataframe and directories
    dataset = ImageDataset(dataframe, ai_img_dir, real_img_dir, transforms)

    batch_size = 32
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    mean, std = get_mean_std(loader)
    return mean, std


In [9]:
def get_data(ai_dataset_path, real_dataset_path, ai_img_dir, real_img_dir):

    # get the datasets

    ai_data = pd.DataFrame()
    for dataset in ai_dataset_path:
      ai_data = pd.concat([pd.read_csv(dataset), ai_data])

    real_data = pd.DataFrame()
    for dataset in real_dataset_path:
      real_data = pd.concat([pd.read_csv(dataset), real_data])

    combined_data = pd.concat([ai_data, real_data])

    # shuffle dataset before hand
    combined_data_shuffled = combined_data.sample(frac=1).reset_index(drop=True)

    mean, std = get_normalization_values(combined_data[['Image', 'Label']], ai_img_dir, real_img_dir)

    print(mean, std)

    transform = torchvision_transforms.Compose([
        torchvision_transforms.Resize(256),
        torchvision_transforms.CenterCrop(224),
        torchvision_transforms.ToTensor(),
        torchvision_transforms.Normalize(mean=mean, std=std),
    ])

    # create dataset class
    dataset = MultiModalDataset(
        dataframe=combined_data,
        ai_img_dir=ai_img_dir,
        real_img_dir=real_img_dir,
        transform=transform
    )

    # Split the dataset
    train_size = int(0.75 * len(dataset)) #60%
    val_size = int(0.10 * len(dataset)) #30%
    test_size = len(dataset) - train_size - val_size #10%

    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    return train_dataset, val_dataset, test_dataset

# Training 👟

In [10]:
def train(model, train_loader, val_loader, criterion, optimizer, config):

    wandb.watch(model, criterion, log="all", log_freq=10)

    # Run training and track with wandb
    total_batches = len(train_loader) * config["epochs"]
    example_ct = 0  # number of examples seen
    batch_ct = 0
    for epoch in tqdm(range(config["epochs"])):
        for title, text, image, label in train_loader:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            text = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

            loss = train_batch(text, image, label, model, optimizer, criterion)
            example_ct += len(image)
            batch_ct += 1

            # Report metrics every 3rd batch
            if ((batch_ct + 1) % 3) == 0:
                train_log(loss, example_ct, epoch)

        # run validation every epoch
        test(model, val_loader, epoch=epoch)


def train_batch(text, image, label, model, optimizer, criterion):
    text, image, label = text.to(config["device"]), image.to(config["device"]), label.to(config["device"])

    # Forward pass ➡
    output = model(image, text)
    loss = criterion(torch.squeeze(output, 1), label.float())

    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss

In [11]:
def train_log(loss, example_ct, epoch):
    wandb.log({"epoch": epoch, "loss": loss}, step=example_ct)
    print(f"Loss after {str(example_ct).zfill(5)} examples: {loss:.3f}")

# Testing 🧪

In [12]:
def test(model, loader, epoch=None):
    model.eval()

    all_labels = []
    all_predictions = []

    # Run the model on some test examples
    with torch.no_grad():
        correct, total = 0, 0
        for title, text, image, label in loader:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            text = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

            text, image, label = text.to(config["device"]), image.to(config["device"]), label.to(config["device"])
            output = model(image, text)
            predicted = (output.data > 0.5).long()
            total += label.size(0)
            correct += (predicted == label).sum().item()

            all_labels.extend(label.cpu().numpy())
            flattened_predictions = [pred[0] for pred in predicted.cpu().numpy().tolist()]
            all_predictions.extend(flattened_predictions)

    print(all_labels, all_predictions)
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, zero_division=0)
    recall = recall_score(all_labels, all_predictions, zero_division=0)
    f1 = f1_score(all_labels, all_predictions, zero_division=0)

    print(f"Accuracy of the model on the {total} test examples: {accuracy:.2%}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Log metrics
    if epoch:
      wandb.log({"val_accuracy": accuracy, "val_precision": precision, "val_recall": recall, "val_f1_score": f1, "epoch": epoch})
    else:
      wandb.log({"test_accuracy": accuracy, "test_precision": precision, "test_recall": recall, "test_f1_score": f1})

    # Save the model
    #wandb.save("mmm.onnx")


# Pipeline 😎

In [13]:
def model_pipeline(config):

  with wandb.init(project="multi-modal model", config=config):
      # make the model, data, and optimization problem
      config = wandb.config

      print(config)

      model, train_loader, val_loader, test_loader, criterion, optimizer = make(config)
      #print(model)

      # and use them to train the model
      train(model, train_loader, val_loader, criterion, optimizer, config)

      # and test its final performance
      test(model, test_loader)

      torch.save(model.state_dict(), '/content/drive/MyDrive/model_saved.pth')

      return model

In [14]:
def make(config):
    # Make the data
    train, val, test = get_data(
        config["ai_dataset_path"],
        config["real_dataset_path"],
        config["ai_img_dir"],
        config["real_img_dir"],
    )

    train_loader = DataLoader(train, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test, batch_size=config["batch_size"], shuffle=True)

    # Make the model
    model = MultiModalModel().to(config["device"])

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

    return model, train_loader, val_loader, test_loader, criterion, optimizer

In [15]:
device = torch.device(config["device"])

print("running on " + config["device"])

model_pipeline(config)

running on cuda


{'epochs': 2, 'learning_rate': 0.0001, 'batch_size': 16, 'device': 'cuda', 'ai_dataset_path': ['./newsgpt_dataset.csv', './rewritten_AI_data.csv'], 'real_dataset_path': ['./cnn_dataset.csv'], 'ai_img_dir': './newsgpt_images', 'real_img_dir': './cnn_images'}
                                                 Title  \
0    Miriam Margolyes Expresses Concern Over Adult ...   
1    Poseidona: Transforming Invasive Seaweed into ...   
2    Trump Considers National Abortion Ban at 16 Weeks   
3    Jared Kushner’s $500M Belgrade Hotel Deal: Ech...   
4    Global St. Patrick’s Day Celebrations: Dublin,...   
..                                                 ...   
175  Devastating Tornadoes Strike Eastern Indiana a...   
176  Marc Fogel, American Teacher Imprisoned in Rus...   
177  Chelsea WSL Team Eyes Historic Quadruple Amids...   
178  West Ham’s Victory Boosts England’s Chances fo...   
179  Rishi Sunak Faces Voter Backlash, Described as...   

                                             

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  0%|          | 0/2 [00:00<?, ?it/s]

Loss after 00032 examples: 0.711
Loss after 00080 examples: 0.682
Loss after 00128 examples: 0.550
Loss after 00176 examples: 0.509
Loss after 00224 examples: 0.525
Loss after 00272 examples: 0.453
Loss after 00320 examples: 0.556
Loss after 00368 examples: 0.548
Loss after 00416 examples: 0.441
Loss after 00456 examples: 0.380


 50%|█████     | 1/2 [00:46<00:46, 46.42s/it]

[1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0]
Accuracy of the model on the 60 test examples: 88.33%
Precision: 0.93
Recall: 0.84
F1 Score: 0.89
Loss after 00504 examples: 0.306
Loss after 00552 examples: 0.293
Loss after 00600 examples: 0.272
Loss after 00648 examples: 0.192
Loss after 00696 examples: 0.159
Loss after 00744 examples: 0.213
Loss after 00792 examples: 0.234
Loss after 00840 examples: 0.192
Loss after 00888 examples: 0.114


100%|██████████| 2/2 [01:30<00:00, 45.06s/it]

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1] [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1]
Accuracy of the model on the 60 test examples: 93.33%
Precision: 0.94
Recall: 0.94
F1 Score: 0.94





[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1] [1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1]
Accuracy of the model on the 93 test examples: 93.55%
Precision: 0.97
Recall: 0.94
F1 Score: 0.95


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁█████████
loss,██▆▆▆▅▆▆▅▄▃▃▃▂▂▂▂▂▁
test_accuracy,▁█
test_f1_score,▁█
test_precision,▁█
test_recall,▁█
val_accuracy,▁
val_f1_score,▁
val_precision,▁
val_recall,▁

0,1
epoch,1.0
loss,0.1141
test_accuracy,0.93548
test_f1_score,0.95082
test_precision,0.96667
test_recall,0.93548
val_accuracy,0.93333
val_f1_score,0.9375
val_precision,0.9375
val_recall,0.9375


MultiModalModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr