In [2]:
from collections import Counter
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models, transforms
from tqdm import tqdm
from PIL import Image
import re

# Set device to CPU
device = torch.device('cpu')

# Load data from JSON files
train_data = json.load(open('train279_2a.json', encoding="utf8"))
valid_data = json.load(open('validation.json', encoding="utf8"))
test_data = json.load(open('dev_unlabeled.json', encoding="utf8"))

# Define image data paths
IMAGE_DATA_TRAIN = 'train_images/'
IMAGE_DATA_VALID = 'validation_images/'

# Define label names
LABELS = [
    'Causal Oversimplification',
    'Transfer',
    'Flag-waving',
    'Black-and-white Fallacy/Dictatorship',
    'Smears',
    'Loaded Language',
    'Glittering generalities (Virtue)',
    'Thought-terminating cliché',
    'Whataboutism',
    'Slogans',
    'Doubt',
    'Name calling/Labeling',
    'Repetition',
    'Appeal to authority',
    'Appeal to (Strong) Emotions',
    'Reductio ad hitlerum',
    'Appeal to fear/prejudice',
    'Exaggeration/Minimisation',
    'Misrepresentation of Someone\'s Position (Straw Man)',
    'Obfuscation, Intentional vagueness, Confusion',
    'Bandwagon',
    'Presenting Irrelevant Data (Red Herring)'
]

# Define dataset class
class ModelDataSet(Dataset):
    def __init__(self, tokenizer, max_length, data, image_path):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_path = image_path
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        image_path = self.image_path + self.data[index]['image']
        img = Image.open(image_path).convert('RGB')  # Ensure image is RGB
        preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        img_tensor = preprocess(img)
        
        text = self.data[index]['text']
        text = clean_text(text)
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'image': img_tensor,
        }

# Define BERT tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

# Create dataset instances
train_dataset = ModelDataSet(tokenizer, max_length=512, data=train_data, image_path=IMAGE_DATA_TRAIN)
valid_dataset = ModelDataSet(tokenizer, max_length=512, data=valid_data, image_path=IMAGE_DATA_VALID)

# Define dataloaders
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=4, shuffle=False)

# Define the model architecture
class MultiModalClassifier(nn.Module):
    def __init__(self, num_classes):
        super(MultiModalClassifier, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.resnet_model = models.resnet50(pretrained=True)
        self.resnet_model.fc = nn.Identity()  # Remove the final classification layer of ResNet

        self.txt_dense1 = nn.Linear(768, 256)
        self.img_dense1 = nn.Linear(2048, 256)
        self.concat_dense2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(p=0.4)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(num_classes)

    def forward(self, ids, mask, token_type_ids, img_input):
        _, bert_output = self.bert_model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        txt_repr = self.dropout(F.relu(self.bn1(self.txt_dense1(bert_output))))

        img_repr = self.resnet_model(img_input)
        img_repr = self.dropout(F.relu(self.bn1(self.img_dense1(img_repr))))
        
        combined_repr = torch.cat((txt_repr, img_repr), dim=1)
        combined_repr = self.dropout(F.relu(self.concat_dense2(combined_repr)))
        combined_repr = self.bn1(combined_repr) 
        return F.softmax(self.bn2(combined_repr), dim=1)

# Create model instance
model = MultiModalClassifier(num_classes=len(LABELS))

# Define loss function and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [10]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models, transforms
from tqdm import tqdm
from PIL import Image
import re
from sklearn.preprocessing import MultiLabelBinarizer

device = torch.device('cpu')

# Specify data paths and other configurations
IMAGE_DATA_TRAIN = 'train_images/train_images/'
IMAGE_DATA_VALID = 'validation_images/validation_images/'
JSON_DATA_TRAIN = 'train279_2a.json'
JSON_DATA_VALID = 'validation.json'
JSON_DATA_TEST = 'dev_unlabeled.json'

# Define labels
LABELS = {
    'Appeal to (Strong) Emotions': 0,
    'Appeal to authority': 1,
    'Appeal to fear/prejudice': 2,
    'Bandwagon': 3,
    'Black-and-white Fallacy/Dictatorship': 4,
    'Causal Oversimplification': 5,
    'Doubt': 6,
    'Exaggeration/Minimisation': 7,
    'Flag-waving': 8,
    'Glittering generalities (Virtue)': 9,
    'Loaded Language': 10,
    "Misrepresentation of Someone's Position (Straw Man)": 11,
    'Name calling/Labeling': 12,
    'Obfuscation, Intentional vagueness, Confusion': 13,
    'Presenting Irrelevant Data (Red Herring)': 14,
    'Reductio ad hitlerum': 15,
    'Repetition': 16,
    'Slogans': 17,
    'Whataboutism': 18,
    'Thought-terminating cliché': 19,
    'Transfer': 20,
    'Smears': 21
}

# Load data from JSON files
train_data = json.load(open(JSON_DATA_TRAIN, encoding="utf8"))
valid_data = json.load(open(JSON_DATA_VALID, encoding="utf8"))
test_data = json.load(open(JSON_DATA_TEST, encoding="utf8"))

# Data Preprocessing
def clean_text(text):
    text = text.replace('\\n', ' ').lower()
    text = re.sub(r'[^\\w\\s]', "", text)
    text = re.sub(r'\\s+', ' ', text).strip()
    return text

# Define Dataset class
class ModelDataSet(Dataset):
    def __init__(self, tokenizer, max_length, data, image_path):
        super(ModelDataSet, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_path = image_path
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        image_name = self.data[index]['image']
        img = Image.open(self.image_path + image_name)
        preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])
        img_tensor = preprocess(img)
        if img_tensor.shape[0] > 3 :
            img_tensor = img_tensor[:3, :, :]
        elif img_tensor.shape[0] < 3:
            img_tensor= img_tensor.expand(3, -1, -1)
        img_tensor = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(img_tensor)
        
        text = clean_text(self.data[index]['text'])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        target = torch.zeros(len(LABELS), dtype=torch.float)
        for label in self.data[index]['labels']:
            target[LABELS[label]] = 1.0

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'image': img_tensor,
            'target': target
        }

# Define model architecture
class MultiModalClassifier(nn.Module):
    def __init__(self, num_classes, roberta_model_name="roberta-base"):
        super(MultiModalClassifier, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.resnet_model = models.resnet50(pretrained=True)
        self.resnet_model.fc = nn.Identity()  # Remove the final classification layer of ResNet

        self.txt_dense1 = nn.Linear(768, 256)
        self.img_dense1 = nn.Linear(2048, 256)
        self.concat_dense1 = nn.Linear(512, 512)  # Adjust the output dimension of the concatenation
        self.concat_dense2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(p=0.4)
        self.bn1 = nn.BatchNorm1d(512)  # Adjust batch normalization dimension

    def forward(self, ids, mask, token_type_ids, img_input):
        _, bert_output = self.bert_model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        txt_repr = self.dropout(F.relu(self.txt_dense1(bert_output)))

        img_repr = self.resnet_model(img_input)
        img_repr = self.dropout(F.relu(self.img_dense1(img_repr)))

        combined_repr = torch.cat((txt_repr, img_repr), dim=1)
        combined_repr = self.dropout(F.relu(self.concat_dense1(combined_repr)))  # Apply dense layer before batch normalization
        combined_repr = self.bn1(combined_repr)  # Apply batch normalization after adjusting dimension
        return F.softmax(self.concat_dense2(combined_repr), dim=1)

# Create DataLoader instances
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = ModelDataSet(tokenizer, max_length=512, data=train_data, image_path=IMAGE_DATA_TRAIN)
valid_dataset = ModelDataSet(tokenizer, max_length=512, data=valid_data, image_path=IMAGE_DATA_VALID)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=4, shuffle=False)

# Initialize model, loss function, and optimizer
model = MultiModalClassifier(num_classes=len(LABELS))
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) 

# Training function
def train_model(epochs, dataloader, model, loss_fn, optimizer):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        correct = 0
        total_samples = 0
        for batch in tqdm(dataloader, leave=False):
            ids = batch['ids']
            token_type_ids = batch['token_type_ids']
            mask = batch['mask']
            labels = batch['target']
            images = batch['image']
            
            optimizer.zero_grad()
            
            outputs = model(ids, mask, token_type_ids, images)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total_samples += labels.size(0)
            # Convert predicted tensor to have the same shape as labels tensor
            predicted_one_hot = F.one_hot(predicted, num_classes=labels.size(1))
            correct += (predicted_one_hot == labels).all(dim=1).sum().item()  # Compare one-hot tensors

        accuracy = correct / total_samples
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss}, Accuracy: {accuracy}')

# Train the model
train_model(10, train_dataloader, model, loss_fn, optimizer)


# Evaluate function
def evaluate_model(dataloader, model):
    model.eval()
    total_samples = 0
    correct = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, leave=False):
            ids = batch['ids']
            token_type_ids = batch['token_type_ids']
            mask = batch['mask']
            labels = batch['target']
            images = batch['image']
            
            outputs = model(ids, mask, token_type_ids, images)
            _, predicted = torch.max(outputs, 1)
            total_samples += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = correct / total_samples
    print(f'Accuracy: {accuracy}')

# Train the model
train_model(10, train_dataloader, model, loss_fn, optimizer)

# Evaluate the model on validation set
evaluate_model(valid_dataloader, model)


  0%|          | 0/730 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                        

Epoch [1/10], Loss: 513.932098031044, Accuracy: 0.0205620287868403


                                                     

Epoch [2/10], Loss: 512.5630480051041, Accuracy: 0.021247429746401644


                                                     

Epoch [3/10], Loss: 512.5664746761322, Accuracy: 0.02227553118574366


                                                     

Epoch [4/10], Loss: 512.5655576586723, Accuracy: 0.02227553118574366


                                                     

Epoch [5/10], Loss: 512.5859272480011, Accuracy: 0.02193283070596299


                                                     

Epoch [6/10], Loss: 512.5642596483231, Accuracy: 0.02193283070596299


                                                     

Epoch [7/10], Loss: 512.5697718858719, Accuracy: 0.02193283070596299


                                                     

Epoch [8/10], Loss: 512.5677834153175, Accuracy: 0.02227553118574366


                                                     

Epoch [9/10], Loss: 512.5979398488998, Accuracy: 0.02227553118574366


                                                     

Epoch [10/10], Loss: 512.5607372522354, Accuracy: 0.02227553118574366


                                                     

Epoch [1/10], Loss: 512.5647913217545, Accuracy: 0.022618231665524333


                                                   

KeyboardInterrupt: 

In [11]:
# Interrupt the kernel in Jupyter Notebook to stop the training

# Save the model's state dictionary
torch.save(model.state_dict(), 'model_checkpoint.pth')


In [14]:
pip install torchsummary


Collecting torchsummary
  Obtaining dependency information for torchsummary from https://files.pythonhosted.org/packages/7d/18/1474d06f721b86e6a9b9d7392ad68bed711a02f3b61ac43f13c719db50a6/torchsummary-1.5.1-py3-none-any.whl.metadata
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
Note: you may need to restart the kernel to use updated packages.


In [15]:
def evaluate_model(dataloader, model):
    model.eval()
    total_samples = 0
    correct = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, leave=False):
            ids = batch['ids']
            token_type_ids = batch['token_type_ids']
            mask = batch['mask']
            labels = batch['target']
            images = batch['image']
            
            outputs = model(ids, mask, token_type_ids, images)
            _, predicted = torch.max(outputs, 1)
            
            # Convert one-hot encoded labels to class indices
            _, labels_idx = torch.max(labels, 1)
            
            total_samples += labels.size(0)
            correct += (predicted == labels_idx).sum().item()
    
    accuracy = correct / total_samples
    print(f'Accuracy: {accuracy}')


In [16]:
evaluate_model(valid_dataloader, model)

                                                 

Accuracy: 0.1


