In [346]:
import sys

import torch.optim as optim
import time

import unicodedata
from sklearn import metrics
import clip
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
import argparse
import numpy as np
from torch import nn
from torch.nn import functional as F
import torch

In [347]:
import torch
import numpy as np
import re
import itertools

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
# from ekphrasis.dicts.emoticons import emoticons

from torch import nn
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, sampler
from torchvision import models, transforms

import pandas as pd
import os, random, copy, re
from PIL import Image
from PIL import ImageFile

In [348]:
class CustomDatasetFixed(Dataset):
    def __init__(self, data_df, phase, img_transform, preprocess, tokenize, max_length):
        self.data_df = data_df
        self.phase = phase
        self.img_transform = img_transform
        self.preprocess = preprocess
        self.tokenize = tokenize
        self.max_len = max_length
        self.dloc = 'dataset/'

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        row = self.data_df.iloc[idx]
        
        img_id = row['image_id']
        caption = row['caption']
        id = row['id']
        label = -1 if self.phase == 'test' else row['label']
        
        # Validate image path
        img_path = os.path.join(self.dloc, self.phase+'_images', str(img_id)+'.jpg')
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found at {img_path}")
        img = Image.open(img_path).convert('RGB')
        img = self.img_transform(img)

        # Validate caption preprocessing and tokenization
        proc_caption = self.preprocess(caption)
        if proc_caption is None:
            raise ValueError(f"Preprocessing failed for caption: {caption}")
        caption_tokens, masks = self.tokenize(proc_caption)
        if caption_tokens is None or masks is None:
            raise ValueError(f"Tokenization failed for processed caption: {proc_caption}")
        
        return {
            'images': img,
            'caption_tokens': caption_tokens,
            'ids': id,
            'labels': label,
        }

In [349]:
def replace_contractions(text):
    """Replace contractions in string of text"""
    return text

def remove_URL(sample):
    """Remove URLs from a sample string"""
    sample = re.sub(r"\S+\.[(net)|(com)|(org)]\S+", "", sample)
    sample = re.sub(r"http\S+", "", sample)
    sample = re.sub(r"\d+", " ", sample)
    sample = re.sub(r"\s+", " ", sample)
    sample = re.sub(r"_", " ", sample)
    return sample

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', ' ', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    return words

def preprocess(sample):
    sample = remove_URL(sample)
    # Tokenize
    words = sample.split(' ')
    words = normalize(words)

    normalized_text = ''
    for w in words:
        normalized_text += w+' '

    return normalized_text.strip()

In [350]:
class MMNetwork(nn.Module):
    def __init__(self, vdim, tdim):
        super(MMNetwork, self).__init__()

        ## Linear layer for ResNet features
        self.vfc = nn.Linear(vdim, 256)

        ## Single Layer Bi-directional RNN with GRU cells. Projects 768 to 512
        self.bigru = nn.LSTM(tdim, hidden_size=256, num_layers=1, bidirectional=False, batch_first=True, bias=False)

        ## Concatenated Image and Text goes through this multi-layer network
        self.mfc1 = nn.Linear(512, 256)
        # self.mfc2 = nn.Linear(512, 256)
        # self.mfc3 = nn.Linear(256, 128)
        # self.mfc4 = nn.Linear(256, 128)

        self.cf = nn.Linear(256, 1)

        self.act = nn.ReLU()  ## ReLU
        self.vdp = nn.Dropout(0.2)
        self.tdp = nn.Dropout(0.2)

    def forward(self, vx, tx):
        # vx = self.vdp(self.vfc(vx))
        vx = self.vdp(self.act(self.vfc(vx)))

        tx = tx.unsqueeze(1)  # (batch_size=64, seq_len=1, feature_dim=768)
        _, (hidden_tx, _) = self.bigru(tx)  # (num_layers, batch_size, hidden_size=256)

        hidden_tx = hidden_tx.squeeze(0)  # (batch_size, hidden_size=256)

        # hidden = [n layers * n directions, batch size, emb dim]
        # hidden_tx = self.tdp(torch.cat((hidden_tx[0][-2,:,:], hidden_tx[0][-1,:,:]), dim = 1))
        ## Concatenate Visual and Textual output
        # mx = torch.cat((vx, hidden_tx), dim=1)
        mx = torch.cat((vx, self.tdp(hidden_tx)), dim=1)

        mx = self.act(self.mfc1(mx))
        # mx = self.act(self.mfc2(mx))
        # mx = self.relu(self.mfc3(mx))
        # mx = self.relu(self.mfc4(mx))

        return torch.sigmoid(self.cf(mx))

In [351]:
def tokenize(text, context_length: int = 77):
    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    # all_tokens = _tokenizer.encode(text)
    tokens = [sot_token] + _tokenizer.encode(text)[:context_length - 2] + [eot_token]
    result = torch.zeros(context_length, dtype=torch.long)
    mask = torch.zeros(context_length, dtype=torch.long)
    result[:len(tokens)] = torch.tensor(tokens)
    mask[:len(tokens)] = 1

    return result, mask


def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()

In [352]:
_tokenizer = _Tokenizer()


In [353]:
def train(model, optimizer, lr_scheduler, num_epochs):
    since = time.time()

    best_model = model
    best_acc = 0.0
    best_val_loss = 100
    best_epoch = 0
    best_f1 = 0

    for epoch in range(1, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        since2 = time.time()

        model.train()  # Set model to training mode

        running_loss = 0.0
        running_corrects = 0

        tot = 0.0
        cnt = 0
        # Iterate over data.
        for batch in tr_loader:

            images = batch['images'].to(device)      # Images from batch
            caption_tokens = batch['caption_tokens'].to(device)  # Captions from batch
            labels = batch['labels'].to(device).float()  # Binary labels (0/1), converted to float

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward
            with torch.no_grad():
                img_feats = clip_model.module.encode_image(images)
                txt_feats = clip_model.module.encode_text(caption_tokens)

            outputs = model(img_feats, txt_feats).squeeze(1)
            preds = (outputs > 0.5).int()

            loss = criterion(outputs, labels)

            # backward + optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # statistics
            running_loss += loss.item()
            running_corrects += torch.sum(preds == labels).item()
            tot += len(labels)

            if cnt % 40 == 0:
                print('[%d, %5d] loss: %.4f, Acc: %.2f' %
                      (epoch, cnt + 1, loss.item(), (100.0 * running_corrects) / tot))

            cnt = cnt + 1

        if scheduler:
            lr_scheduler.step()

        train_loss = running_loss / len(tr_loader)
        train_acc = running_corrects * 1.0 / (len(tr_loader.dataset))

        print('Training Loss: {:.6f} Acc: {:.2f}'.format(train_loss, 100.0 * train_acc))

        test_loss, test_acc, test_f1 = evaluate(model, vl_loader)

        print(
            'Epoch: {:d}, Val Loss: {:.4f}, Acc: {:.2f}, F1: {:.2f}'.format(
                epoch, test_loss, test_acc * 100, test_f1 * 100))

        # deep copy the model
        if test_f1 >= best_f1:
            best_acc = test_acc
            best_val_loss = test_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch
            best_f1 = test_f1

    time_elapsed2 = time.time() - since2
    print('Epoch complete in {:.0f}m {:.0f}s'.format(
        time_elapsed2 // 60, time_elapsed2 % 60))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return best_model, best_epoch

In [354]:
def evaluate(model, loader):
    model.eval()
    test_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tr_loader:

            images = batch['images'].to(device)      # Images from batch
            caption_tokens = batch['caption_tokens'].to(device)  # Captions from batch
            labels = batch['labels'].to(device).float()  # Binary labels (0/1), converted to float


            img_feats = clip_model.module.encode_image(images)
            txt_feats = clip_model.module.encode_text(caption_tokens)

            outputs = model(img_feats, txt_feats).squeeze(1)

            preds1 = (outputs > 0.5).int()

            test_loss += criterion(outputs, labels).item()

            all_preds.extend(preds1.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

        acc = metrics.accuracy_score(all_labels, all_preds)
        f1 = metrics.f1_score(all_labels, all_preds, average='macro')

    return test_loss / len(loader), acc, f1

In [355]:
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [356]:
parser = argparse.ArgumentParser(description='Train Multimodal Multi-task model for Misogyny Detection')
parser.add_argument('--bs', type=int, default=32, help='64,128')
parser.add_argument('--epochs', type=int, default=20)
parser.add_argument('--maxlen', type=int, default=60)
parser.add_argument('--lr', type=str, default='1e-4', help='3e-5, 4e-5, 5e-5, 5e-4')
parser.add_argument('--vmodel', type=str, default='rn50', help='resnet | vit32 | vit16 | vit14 | rn50 | rn101 | rn504 | rn5016 | rn5064')


argv = sys.argv
if '-f' in argv:
    argv = argv[:argv.index('-f')]  # Exclude Jupyter's `-f` argument and everything after it
args = parser.parse_args(argv[1:])
## Arguments
batch_size = args.bs
init_lr = float(args.lr)
epochs = args.epochs
vmodel = args.vmodel

## Pre-trained Stream Models
clip_nms = {'vit32': 'ViT-B/32', 'vit16': 'ViT-B/16', 'rn50': 'RN50', 'rn504': 'RN50x4', 'rn101': 'RN101',
            'rn5016': 'RN50x16', 'rn5064': 'RN50x64', 'vit14': 'ViT-L/14'}
clip_dim = {'vit32': 512, 'vit16': 512, 'vit14': 768, 'rn50': 1024, 'rn504': 640, 'rn101': 512, 'rn5016': 768,
            'rn5064': 1024}
clip_model, _ = clip.load(clip_nms[vmodel], jit=False)
input_resolution = clip_model.visual.input_resolution
clip_model.float().eval()
clip_model = nn.DataParallel(clip_model)

dim = clip_dim[vmodel]

## Transforms
transform_config = {'train': transforms.Compose([
    transforms.RandomResizedCrop(input_resolution, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.RandomHorizontalFlip(),
    # transforms.ColorJitter(brightness=0.2, contrast=0.1, saturation=0.1, hue=0.2),
    transforms.RandomGrayscale(p=0.1),
    # transforms.RandomPerspective(),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                         std=[0.26862954, 0.26130258, 0.27577711])
]),
    'test': transforms.Compose([
        transforms.Resize((input_resolution, input_resolution), interpolation=transforms.InterpolationMode.BICUBIC),
        # transforms.CenterCrop(clip_model.visual.input_resolution),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711])
    ])
}

## Dataset
train_df = pd.read_csv("./dataset/train.csv")
val_df = pd.read_csv("./dataset/val.csv")
test_df = pd.read_csv("./dataset/test.csv")
if args.maxlen != 0:
    max_length = args.maxlen

In [357]:
tr_data = CustomDatasetFixed(train_df, 'train', transform_config['train'], preprocess, tokenize, max_length)
vl_data = CustomDatasetFixed(val_df, 'val', transform_config['test'], preprocess, tokenize, max_length)
ts_data = CustomDatasetFixed(test_df, 'test', transform_config['test'], preprocess, tokenize, max_length)
tr_loader = DataLoader(tr_data, shuffle=True, num_workers=0, batch_size=batch_size)
vl_loader = DataLoader(vl_data, num_workers=0, batch_size=batch_size)
ts_loader = DataLoader(ts_data, num_workers=0, batch_size=batch_size)

In [358]:
# from torch.nn import BCELoss
# 
# # Test model code
# #  , num_heads, transformer_hidden_dim, num_transformer_layers
# model = MMNetwork(dim, dim).to(device)
# criterion = BCELoss()
# # Dummy input data
# images = torch.randn(16, 3, 224, 224).long().to(device)  # Batch of 16 RGB images of size 224x224
# captions = torch.randint(0, 1000, (16, max_length)).long().to(device)  # Batch of 16 captions with max_len tokens each
# labels = torch.rand(16).float().to(device)
# with torch.no_grad():
#     img_feats = clip_model.module.encode_image(images)
#     txt_feats = clip_model.module.encode_text(captions)
# 
# output = model(img_feats, txt_feats).squeeze(1)
# output = (output > 0.5).float()
# loss = criterion(output, labels)
# print(output, loss)  # Should be (16) 

In [359]:
## Model
model = MMNetwork(dim, dim)

model.to(device)
# print(model)

optimizer = optim.Adam(model.parameters(), init_lr, betas=(0.99, 0.98), weight_decay=1e-4)
criterion = nn.BCELoss()

num_train_steps = int(len(tr_data) / batch_size) * epochs
num_warmup_steps = int(0.1 * num_train_steps)
# print(num_train_steps, num_warmup_steps)  ## Print Number of total and warmup steps
# scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [5, 10, 15], gamma=0.5)
# scheduler = None

model_ft, best_epoch = train(model, optimizer, scheduler, num_epochs=epochs)

torch.save(model_ft.state_dict(), 'best_models/trained_model.pt')

vl_loss, vl_acc, vl_f1 = evaluate(model_ft, vl_loader)
print('Validation best epoch: %d, Val Loss: %.4f, ACC: %.2f, F1: %.2f' % (
best_epoch, np.round(vl_loss, 4), vl_acc * 100, vl_f1 * 100))

# ts_loss, ts_acc, ts_f1= evaluate(model_ft, ts_loader)
# print('Test results:, Test Loss: %.4f, ACC: %.2f, F1: %.2f' % (np.round(ts_loss, 4), ts_acc * 100, ts_f1 * 100))

Epoch 1/20
----------
[1,     1] loss: 0.6947, Acc: 46.88
[1,    41] loss: 0.6848, Acc: 57.55
[1,    81] loss: 0.6975, Acc: 57.52
[1,   121] loss: 0.6651, Acc: 58.73
[1,   161] loss: 0.6935, Acc: 59.69
[1,   201] loss: 0.6427, Acc: 59.65
[1,   241] loss: 0.6671, Acc: 60.10
[1,   281] loss: 0.5989, Acc: 60.39
Training Loss: 0.669115 Acc: 60.70
Epoch: 1, Val Loss: 2.1171, Acc: 64.66, F1_1: 64.63
Epoch 2/20
----------
[2,     1] loss: 0.6811, Acc: 56.25
[2,    41] loss: 0.6323, Acc: 65.62
[2,    81] loss: 0.7334, Acc: 65.97
[2,   121] loss: 0.6672, Acc: 65.99
[2,   161] loss: 0.6859, Acc: 66.36
[2,   201] loss: 0.6337, Acc: 66.60
[2,   241] loss: 0.6432, Acc: 66.38
[2,   281] loss: 0.6047, Acc: 66.27
Training Loss: 0.618972 Acc: 66.28
Epoch: 2, Val Loss: 1.9890, Acc: 68.56, F1_1: 68.56
Epoch 3/20
----------
[3,     1] loss: 0.5910, Acc: 65.62
[3,    41] loss: 0.5544, Acc: 68.90
[3,    81] loss: 0.5681, Acc: 69.56
[3,   121] loss: 0.5151, Acc: 69.37
[3,   161] loss: 0.6236, Acc: 69.00
[3, 

KeyboardInterrupt: 

In [None]:
from tqdm import tqdm

ids = []
predictions = []

def make_submission(model, test_loader):
    model.eval()

    with torch.no_grad():
        for batch in tqdm(test_loader):
            images = batch['images'].to(device)      # Images from batch
            caption_tokens = batch['caption_tokens'].to(device)  # Captions from batch
            labels = batch['labels'].to(device).float()  # Binary labels (0/1), converted to float
            id = batch['ids']

            img_feats = clip_model.module.encode_image(images)
            txt_feats = clip_model.module.encode_text(caption_tokens)

            outputs = model(img_feats, txt_feats).squeeze(1)

            preds = (outputs > 0.5).int()
            
            ids.extend(id)
            predictions.extend(preds.cpu().tolist())
            
    df = pd.DataFrame({'id': ids, 'label': predictions})
    df.to_csv('submission_final2.csv', index=False)

In [None]:
make_submission(model, ts_loader)