In [8]:
import pandas as pd
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
import os
from PIL import Image
from tqdm import tqdm

In [9]:
class MultiModalModel(nn.Module):
    def __init__(self):
        super(MultiModalModel, self).__init__()
        # Load pre-trained models
        self.resnet = resnet18(pretrained=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Define concatonated layers
        self.multi_modal_layers = nn.Sequential(
            nn.Linear(in_features=self.resnet.fc.out_features + self.bert.config.hidden_size, out_features=512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, image_inputs, text_inputs):
        # Process image input
        image_features = self.resnet(image_inputs)
        image_features = torch.flatten(image_features, 1)  # Flatten the features

        # Process text input
        text_features = self.bert(**text_inputs).last_hidden_state[:, 0, :]  # Get the [CLS] token's features

        # Concatenate features
        combined_features = torch.cat((image_features, text_features), dim=1)

        # Pass through additional layers
        output = self.multi_modal_layers(combined_features)

        output_binary = self.sigmoid(output)

        return output

In [10]:
class MultiModalDataset(Dataset):
    def __init__(self, dataframe, ai_img_dir, real_img_dir, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.text_idx = dataframe.columns.get_loc('Text')
        self.title_idx = dataframe.columns.get_loc('Title')
        self.image_idx = dataframe.columns.get_loc('Image')
        self.label_idx = dataframe.columns.get_loc('Label')
        # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.ai_img_dir = ai_img_dir
        self.real_img_dir = real_img_dir

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.dataframe.iloc[idx, self.text_idx]
        title = self.dataframe.iloc[idx, self.title_idx]
        label = self.dataframe.iloc[idx, self.label_idx]

        img_folder = self.ai_img_dir if label == 1 else self.real_img_dir
        img_name = os.path.join(img_folder, str(self.dataframe.iloc[idx, self.image_idx]))
        image = Image.open(img_name).convert('RGB')

        # only tockenizing text for now
        # text = self.tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)

        if self.transform:
            image = self.transform(image)

        return title, text, image, label

In [11]:
# load data
def load_data(ai_dataset_path, real_dataset_path, ai_img_dir, real_img_dir):

    # get the datasets
    ai_data = pd.read_csv(ai_dataset_path)
    real_data = pd.read_csv(real_dataset_path)
    
    combined_data = pd.concat([ai_data, real_data])

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # create dataset class
    dataset = MultiModalDataset(
        dataframe=combined_data,
        ai_img_dir=ai_img_dir,
        real_img_dir=real_img_dir,
        transform=transform
    )
    
    # Split the dataset
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    return train_dataset, val_dataset, test_dataset

In [12]:
train, val, test = load_data(
    ai_dataset_path='../../data/ai_scraping/newsgpt_dataset.csv',
    real_dataset_path='../../data/real_scraping/cnn_dataset.csv',
    ai_img_dir="../../data/ai_scraping/newsgpt_images",
    real_img_dir="../../data/real_scraping/cnn_images"
    )

# setup data loaders
train_loader = DataLoader(train, batch_size=32, shuffle=True)
val_loader = DataLoader(val, batch_size=32, shuffle=False)
test_loader = DataLoader(test, batch_size=32, shuffle=False)

In [13]:
# training
model = MultiModalModel() # load model

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)



MultiModalModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

In [None]:
num_epochs = 2
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for title, text, image, label in tqdm(train_loader, desc=f'Train: epoch {epoch+1}/{num_epochs}'):
        optimizer.zero_grad()
        text = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        outputs = model(image, text)
        loss = criterion(torch.squeeze(outputs), label.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    with torch.no_grad():  # No gradients needed for validation, saves memory and computations
        model.eval()  # Set the model to evaluation mode
    
        val_loss = 0.0
        correct = 0
        total = 0
    
        for title, text, image, label in tqdm(val_loader, desc=f'Valid: epoch {epoch+1}/{num_epochs}'):
            text = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
            outputs = model(image, text)
            
            loss = criterion(torch.squeeze(outputs), label.float())
            val_loss += loss.item()
            
            # Convert outputs to predictions for calculating accuracy
            # The specific conversion depends on your task and model output
            # Here's a generic example for binary classification
            preds = torch.round(outputs)  # Assuming binary classification
            
            # Calculate accuracy
            correct += (preds.squeeze() == label).sum().item()
            total += label.size(0)
    
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct / total
    
        print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

Train: epoch 1/2:   0%|                                  | 0/11 [00:00<?, ?it/s]