<a href="https://www.kaggle.com/code/aviv360/memesense?scriptVersionId=135511283" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install --upgrade pip
!pip install --upgrade torch==1.7
!pip install pandas_path
!pip install optuna
!pip install pymysql
!pip install --upgrade wandb
!pip install --upgrade transformers

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import time
import sys
import os
import math
from typing import Tuple
import seaborn as sn
import pandas as pd
import pandas_path  # Path style access for pandas
import json
from pathlib import Path
import logging
import random
import tarfile
import tempfile
import warnings

from transformers import  CLIPVisionModel, CLIPVisionConfig, AutoTokenizer, CLIPTokenizer
import transformers
from sklearn.metrics import roc_auc_score, accuracy_score

# pytorch
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset, WeightedRandomSampler, DataLoader
from torch.optim.lr_scheduler import StepLR, ExponentialLR, ReduceLROnPlateau
from torch.cuda.amp import autocast, GradScaler

# from efficientnet_pytorch import EfficientNet
from ignite.contrib.handlers import create_lr_scheduler_with_warmup

import torchvision
from torchvision import transforms, utils

from PIL import Image, ImageFilter, ImageEnhance
from tqdm import tqdm

# optuna for hyperparameter optimization
import optuna
from optuna.trial import TrialState
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_intermediate_values, plot_optimization_history, plot_parallel_coordinate, plot_slice

import kornia
from kornia import augmentation as K
from kornia.augmentation import AugmentationSequential


import torch
import clip
from PIL import Image

import pymysql
pymysql.install_as_MySQLdb()


from IPython.display import clear_output

# Primary device: first CUDA device when available, otherwise the CPU.
device0 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Secondary device: only point at "cuda:1" when a second GPU really exists;
# on single-GPU (or CPU-only) hosts fall back to the primary device so that
# moving tensors to device1 cannot reference a non-existent device.
device1 = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device0
# Silence FutureWarning noise emitted by the libraries used in this notebook.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
def getTokenizer():
    """Return the pretrained CLIP tokenizer for ``openai/clip-vit-base-patch32``."""
    checkpoint_name = "openai/clip-vit-base-patch32"
    tokenizer = CLIPTokenizer.from_pretrained(checkpoint_name)
    return tokenizer

def gettextmodel(configuration):
    """Load CLIP ``ViT-B/32`` on ``device0`` and return what ``clip.load`` returns.

    ``configuration`` is accepted for interface compatibility but is not used.
    Elsewhere in this notebook the result of the same ``clip.load`` call is
    unpacked as ``(model, preprocess)``, so callers receive that pair rather
    than a bare text encoder.
    """
    return clip.load("ViT-B/32", device=device0)

In [None]:
"""def getTokenizer():
    return clip.tokenize

def gettextmodel(configuration):
    text_model = AutoModel.from_pretrained("distilbert-base-uncased")
    return text_model"""

In [None]:
# Dataset root, resolved relative to the parent of the current working directory.
data_dir = Path.cwd().parent.joinpath("input", "hate-memes", "hateful_memes")

# Image folder and the three JSON-lines split files inside the dataset root.
img_path = data_dir.joinpath("img")
train_path = data_dir.joinpath("train.jsonl")
dev_path = data_dir.joinpath("dev_seen.jsonl")
test_path = data_dir.joinpath("test_seen.jsonl")

In [None]:
# Load the train / dev / test splits (one JSON record per line) into DataFrames.
_meme_root = "/kaggle/input/hate-memes/hateful_memes"
train, val, test = (
    pd.read_json(f"{_meme_root}/{split_name}.jsonl", lines=True)
    for split_name in ("train", "dev_seen", "test_seen")
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Count how many training samples carry each label value.
train.label.value_counts()

In [None]:
# Same label counts as a bar chart, to make the class imbalance visible.
train["label"].value_counts().plot(kind="bar")

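Balancing the training set helps prevent a biased model and poor performance.
The dataset class below balances it by randomly undersampling the label-0 rows to match the number of label-1 rows (an oversampling variant is left commented out).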

In [None]:
# Show one sample meme on a larger canvas.
sample_meme_file = "/kaggle/input/hate-memes/hateful_memes/img/42953.png"
plt.figure(figsize=(10, 6))
img = plt.imread(sample_meme_file)
plt.imshow(img)

In [None]:
# Open the first five training images as RGB and report their sizes.
images = []
for _sample_idx in range(5):
    _sample_file = f"/kaggle/input/hate-memes/hateful_memes/{train.img[_sample_idx]}"
    images.append(Image.open(_sample_file).convert("RGB"))

for image in images:
    print(image.size)

In [None]:

# Hyperparameters shared by the image pipeline (imgtrans), the dataset and
# HatefulMemesModel. Later cells mutate this dict in place.
hparams = {
    
    # Required hparams
    # (HatefulMemesModel.__init__ raises KeyError if any of these is missing)
    "train_path": train_path,
    "dev_path": dev_path,
    "img_dir": data_dir,
    
    # Optional hparams
    "image_dim": 224,  # side length used by the Resize / RandomCrop transforms
    "text_feature_dim": 512,  # with vision_feature_dim: input width of the fusion MLP
    "vision_feature_dim": 512,
    "fusion_mid_size": 512,  # width of the hidden fusion layers
    "fusion_output_size": 1024,  # width fed to the final classification layer
    "output_path": "model-outputs",  # directory for the best-model checkpoint
    "dev_limit": None,  # optional cap on the number of training samples
    "lr": 5e-5,
    "max_epochs": 10,
    "batch_size": 64,  # training batch size
    # Larger than max_epochs, so early stopping cannot trigger with these values.
    "early_stop_patience": 41,
    # NOTE(review): the two bert_* keys are not read by any code visible in this
    # notebook section - confirm whether they are still needed.
    "bert_dropout_p": 0.2,
    "bert_attn_dropout_p": 0.2,
    "dropout_fusion_mid_p": 0.1,
    "dropout_fusion_out_p": 0.05,
    "dropout_vision_feature_p": 0.05,
    "dropout_text_feature_p": 0.05,

    # Image augmentations params
    # (read by imgtrans; ignored when enable_augmentation is False)
    "horizontal_flip_p": 0.5,        
    "rotation": 3,
    "brightness": 0.02,
    "contrast": 0.02,
    "saturation": 0.02,
    "hue": 0.02,
    "random_erasing_p": 0.5,
    "random_erasing_scale_min": 0.02,
    "random_erasing_scale_max": 0.033,
    "random_erasing_ratio_min": 0.3,
    "random_erasing_ratio_max": 3.3,
    "random_erasing_value": 0,
    "gaussian_noise_std": 0.1,
    "enable_augmentation": True,
    
    
}

In [None]:

# define a callable image_transform with Compose
def imgtrans(hparams):
    """Build the kornia image pipeline described by ``hparams``.

    Args:
        hparams: mapping that must contain ``"image_dim"`` and
            ``"enable_augmentation"``. When augmentation is enabled it must
            also contain the flip / rotation / colour-jitter / random-erasing /
            gaussian-noise keys read below.

    Returns:
        An ``AugmentationSequential``. With augmentation disabled it only
        resizes to ``image_dim`` x ``image_dim`` and normalizes; otherwise it
        additionally applies the random augmentations in the order listed.

    Raises:
        KeyError: if a key needed by the selected branch is missing.
    """
    target_size = (hparams["image_dim"], hparams["image_dim"])

    def _resize():
        # Fresh module per use so no transform instance is shared.
        return K.Resize(size=target_size)

    def _normalize():
        # Per-channel mean / std used for normalization in both branches.
        return K.Normalize(
            mean=torch.tensor([0.485, 0.456, 0.406]),
            std=torch.tensor([0.229, 0.224, 0.225]),
        )

    if not hparams["enable_augmentation"]:
        return AugmentationSequential(_resize(), _normalize())

    return AugmentationSequential(
        _resize(),
        K.RandomHorizontalFlip(p=hparams["horizontal_flip_p"]),
        K.RandomRotation(degrees=hparams["rotation"]),
        K.ColorJitter(
            brightness=hparams["brightness"],
            contrast=hparams["contrast"],
            saturation=hparams["saturation"],
            hue=hparams["hue"],
        ),
        K.RandomErasing(
            p=hparams["random_erasing_p"],
            scale=(
                hparams["random_erasing_scale_min"],
                hparams["random_erasing_scale_max"],
            ),
            ratio=(
                hparams["random_erasing_ratio_min"],
                hparams["random_erasing_ratio_max"],
            ),
            value=hparams["random_erasing_value"],
        ),
        _normalize(),
        # Noise is added after normalization, i.e. in normalized units.
        K.RandomGaussianNoise(mean=0., std=hparams["gaussian_noise_std"]),
        # The image already has the target size here, so this second resize and
        # the same-size crop below leave the dimensions unchanged; they are
        # kept to preserve the original pipeline.
        _resize(),
        K.RandomCrop(
            size=target_size,
            padding=None,
            pad_if_needed=False,
            fill=0,
            padding_mode='constant',
        ),
        same_on_batch=False,
    )
# PIL image -> float tensor converter, and the pipeline configured by hparams.
PILtoTesor = transforms.ToTensor()
image_transform = imgtrans(hparams)

# convert the images and prepare for visualization.
_transformed_samples = []
for image in images:
    _as_tensor = PILtoTesor(image)
    _transformed_samples.append(image_transform(_as_tensor).squeeze())
tensor_img = torch.stack(_transformed_samples)
grid = utils.make_grid(tensor_img)

# plot
plt.rcParams["figure.figsize"] = (20, 5)
plt.axis('off')
_ = plt.imshow(grid.permute(1, 2, 0))

In [None]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Serve Hateful Memes samples as dictionaries of tensors.

    The samples are read from a JSON-lines file with ``id``, ``img`` and
    ``text`` columns and, optionally, a ``label`` column.

    Args:
        data_path: path of the ``.jsonl`` file to read.
        img_dir: directory that the ``img`` column is relative to.
        image_transform: callable applied to the image tensor.
        text_transform: tokenizer exposing ``encode_plus``.
        balance: when true, label-0 rows are randomly undersampled to the
            number of label-1 rows.
        dev_limit: optional maximum number of rows to keep (random subset).
        random_state: seed used for both sampling steps.

    Raises:
        FileNotFoundError: if a referenced image path does not exist.
        TypeError: if a referenced image path is not a regular file.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):
        self.samples_frame = pd.read_json(data_path, lines=True)
        self.dev_limit = dev_limit

        if balance:
            neg = self.samples_frame[self.samples_frame.label.eq(0)]
            pos = self.samples_frame[self.samples_frame.label.eq(1)]
            # Undersample label 0 down to the number of label-1 rows.
            self.samples_frame = pd.concat(
                [
                    neg.sample(pos.shape[0], random_state=random_state),
                    pos,
                ]
            )

        if self.dev_limit and self.samples_frame.shape[0] > self.dev_limit:
            self.samples_frame = self.samples_frame.sample(
                dev_limit, random_state=random_state
            )

        # Positional 0..n-1 index so that __getitem__ can use .loc[idx].
        self.samples_frame = self.samples_frame.reset_index(drop=True)

        # Resolve every image path exactly once. The checks below run on the
        # resolved paths; joining img_dir a second time would break whenever
        # img_dir is a relative path.
        img_dir = Path(img_dir)
        self.samples_frame["img"] = self.samples_frame["img"].map(
            lambda rel_path: img_dir / rel_path
        )

        for resolved in self.samples_frame["img"]:
            if not resolved.exists():
                raise FileNotFoundError(f"image not found: {resolved}")
        for resolved in self.samples_frame["img"]:
            if not resolved.is_file():
                raise TypeError(f"image path is not a file: {resolved}")

        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """Return the number of samples kept after balancing / limiting."""
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        The result is a dict with ``"id"``, ``"image"`` and ``"text"`` entries,
        plus ``"label"`` when the source file has a ``label`` column.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        image = Image.open(self.samples_frame.loc[idx, "img"]).convert("RGB")
        image = self.image_transform(PILtoTesor(image)).squeeze()

        # Pad / truncate every caption to 77 token ids.
        # ``padding="max_length"`` replaces the deprecated
        # ``pad_to_max_length=True`` argument.
        token_ids = self.text_transform.encode_plus(
            self.samples_frame.loc[idx, "text"],
            add_special_tokens=True,
            max_length=77,
            padding="max_length",
            truncation=True,
        )["input_ids"]
        # Build the integer tensor directly instead of going through float32.
        text = torch.tensor(token_ids, dtype=torch.long).squeeze()

        sample = {"id": img_id, "image": image, "text": text}
        if "label" in self.samples_frame.columns:
            sample["label"] = torch.tensor(
                int(self.samples_frame.loc[idx, "label"]), dtype=torch.long
            )

        return sample

In [None]:
class textAndVisionConcat(torch.nn.Module):
    """Late-fusion classifier over concatenated text and image features.

    Both encoders are called on their inputs, the ReLU-activated features are
    concatenated and passed through an MLP (ten Linear+ReLU layers, dropout,
    one more Linear+ReLU, dropout) followed by a linear classification layer.

    Args:
        num_classes: number of output classes.
        loss_fn: callable ``loss_fn(logits, label)``; it receives the raw
            (unnormalized) logits, as ``torch.nn.CrossEntropyLoss`` expects.
        text_module: callable mapping the text batch to text features.
        vision_module: callable mapping the image batch to image features.
        text_feature_dim: width of the text features.
        vision_feature_dim: width of the image features.
        fusion_mid_size: width of the hidden fusion layers.
        fusion_output_size: width of the last fusion layer.
        dropout_vision_feature_p: dropout probability on the image features.
        dropout_fusion_mid_p: dropout probability before the last fusion layer.
        dropout_fusion_out_p: dropout probability after the last fusion layer.
        dropout_text_feature_p: dropout probability on the text features.
    """

    # Number of Linear+ReLU pairs placed before the first fusion dropout.
    _NUM_HIDDEN_FUSION_LAYERS = 10

    def __init__(
        self,
        num_classes,
        loss_fn,
        text_module,
        vision_module,
        text_feature_dim,
        vision_feature_dim,
        fusion_mid_size,
        fusion_output_size,
        dropout_vision_feature_p,
        dropout_fusion_mid_p,
        dropout_fusion_out_p,
        dropout_text_feature_p
    ):
        super().__init__()

        self.text_module = text_module
        self.text_feature_dropout = torch.nn.Dropout(dropout_text_feature_p)
        self.vision_module = vision_module
        self.vision_feature_dropout = torch.nn.Dropout(dropout_vision_feature_p)

        # Build the fusion MLP in a loop. Layer order (and therefore the
        # Sequential indices / state_dict keys) is the same as when every
        # layer is written out by hand.
        fusion_layers = []
        in_features = text_feature_dim + vision_feature_dim
        for _ in range(self._NUM_HIDDEN_FUSION_LAYERS):
            fusion_layers.append(
                torch.nn.Linear(
                    in_features=in_features,
                    out_features=fusion_mid_size
                )
            )
            fusion_layers.append(torch.nn.ReLU())
            in_features = fusion_mid_size
        fusion_layers.append(torch.nn.Dropout(dropout_fusion_mid_p))
        fusion_layers.append(
            torch.nn.Linear(
                in_features=fusion_mid_size,
                out_features=fusion_output_size
            )
        )
        fusion_layers.append(torch.nn.ReLU())
        fusion_layers.append(torch.nn.Dropout(dropout_fusion_out_p))
        self.fusion = torch.nn.Sequential(*fusion_layers)

        self.fc = torch.nn.Linear(
            in_features=fusion_output_size,
            out_features=num_classes
        )

        self.loss_fn = loss_fn

        # Xavier-uniform weights and zero biases for every fusion Linear layer
        # and for the classification layer.
        for layer in self.fusion:
            if isinstance(layer, torch.nn.Linear):
                torch.nn.init.xavier_uniform_(layer.weight)
                torch.nn.init.zeros_(layer.bias)
        torch.nn.init.xavier_uniform_(self.fc.weight)
        torch.nn.init.zeros_(self.fc.bias)

    def forward(self, text, image, label=None):
        """Classify a batch.

        Args:
            text: batch passed to ``text_module``.
            image: batch passed to ``vision_module``.
            label: optional class-index targets.

        Returns:
            ``(pred, loss)`` where ``pred`` holds the softmax class
            probabilities and ``loss`` is ``loss_fn(logits, label)``, or
            ``None`` when no label is given.
        """
        # The feature dropouts were constructed (and tuned) but never applied;
        # apply them here. They are the identity in eval mode.
        text_features = self.text_feature_dropout(
            torch.nn.functional.relu(self.text_module(text))
        )
        image_features = self.vision_feature_dropout(
            torch.nn.functional.relu(self.vision_module(image))
        )

        combined = torch.cat(
            [text_features, image_features], dim=1
        )
        # The encoders may emit a different float dtype; the fusion layers
        # hold float32 weights.
        combined = combined.to(torch.float32)

        fused = self.fusion(combined)
        logits = self.fc(fused)
        pred = torch.nn.functional.softmax(logits, dim=1)

        # The loss is computed on the logits, not on ``pred``:
        # CrossEntropyLoss applies log-softmax itself, so feeding it
        # probabilities would apply softmax twice and flatten the gradients.
        loss = self.loss_fn(logits, label) if label is not None else None
        return (pred, loss)

In [None]:
class HatefulMemesModel(torch.nn.Module):
    """Training harness around the CLIP-based ``textAndVisionConcat`` model.

    Builds the tokenizer, the image pipeline, the train / dev datasets, the
    model, the optimizer and the LR scheduler from ``hparams`` and exposes
    ``fit`` (train with validation, checkpointing and early stopping) and
    ``make_submission_frame`` (predict on another split).

    Args:
        hparams: dict of hyperparameters. ``"train_path"``, ``"dev_path"`` and
            ``"img_dir"`` are required; the other keys are read with defaults
            where the code below uses ``.get(key, default)``.

    Raises:
        KeyError: if one of the required keys is missing.
    """

    def __init__(self, hparams):
        for data_key in ("train_path", "dev_path", "img_dir"):
            if data_key not in hparams:
                raise KeyError(
                    f"{data_key} is a required hparam in this model"
                )

        super().__init__()
        self.hparams = hparams

        # assign some hparams that get used in multiple places
        self.text_feature_dim = self.hparams.get(
            "text_feature_dim", 300
        )
        self.vision_feature_dim = self.hparams.get(
            # balance text and vision features by default
            "vision_feature_dim", self.text_feature_dim
        )
        self.output_path = Path(
            self.hparams.get("output_path", "model-outputs")
        )
        self.output_path.mkdir(parents=True, exist_ok=True)

        # instantiate transforms, datasets
        self.text_transform = self._build_text_transform()
        self.image_transform = self._build_image_transform()
        self.train_dataset = self._build_dataset("train_path")
        print(f"Train size: {len(self.train_dataset)}")
        self.dev_dataset = self._build_dataset("dev_path")
        print(f"val size: {len(self.dev_dataset)}")

        # set up model and training
        self.model = self._build_model().to(device0)
        self.optimizer, self.scheduler = self.configure_optimizers()

    def forward(self, text, image, label=None):
        """Delegate to the wrapped model; returns ``(pred, loss)``."""
        return self.model(text, image, label)

    def training_step(self, batch, batch_nb):
        """Run one labelled batch through the model on ``device0``."""
        preds, loss = self.forward(
            text=batch["text"].to(device0),
            image=batch["image"].to(device0),
            label=batch["label"].to(device0)
        )

        return preds, loss

    def validation_step(self, batch, batch_nb):
        """Switch to eval mode and run one labelled batch on ``device0``."""
        preds, loss = self.eval().forward(
            text=batch["text"].to(device0),
            image=batch["image"].to(device0),
            label=batch["label"].to(device0)
        )

        return preds, loss

    # TODO - calculate the validation error here...
    def validation_epoch_end(self, outputs):
        """Average the ``"batch_val_loss"`` entries of ``outputs``.

        ``outputs`` must be an iterable of dicts holding a tensor under
        ``"batch_val_loss"``. ``fit`` does not call this method.
        """
        avg_loss = torch.stack(
            tuple(
                output["batch_val_loss"]
                for output in outputs
            )
        ).mean()

        return {
            "val_loss": avg_loss,
            "progress_bar":{"avg_val_loss": avg_loss}
        }

    def configure_optimizers(self):
        """Return the ``(optimizer, scheduler)`` pair used by ``fit``."""
        # AdamW (decoupled weight decay, library default strength).
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=self.hparams.get("lr", 0.001))

        # Multiply the LR by 0.25 once the monitored metric has failed to
        # improve for more than 3 consecutive epochs.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.25, patience=3, verbose=True)

        return optimizer, scheduler

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return torch.utils.data.DataLoader(
            self.train_dataset,
            shuffle=True,
            batch_size=self.hparams.get("batch_size", 64),
            num_workers=self.hparams.get("num_workers", 2),
            pin_memory=True
        )

    def val_dataloader(self):
        """Return the validation loader (the dev set split into four batches)."""
        return torch.utils.data.DataLoader(
            self.dev_dataset,
            shuffle=False,
            batch_size=int(math.ceil(len(self.dev_dataset)/4)),
            num_workers=self.hparams.get("num_workers", 2)
        )

    def fit(self):
        """Train for up to ``max_epochs`` epochs, validating after each one.

        Stops early when ``val_epoch`` reports that the patience is exhausted,
        then plots the learning rate that was used in every epoch.
        """
        self._set_seed(self.hparams.get("random_state", 42))
        self.epochs = self.hparams.get("max_epochs", 10)
        self.train_loader = self.train_dataloader()
        self.val_loader = self.val_dataloader()
        self.best_error_rate = 1.0
        self.epochs_without_improvement = 0

        lrs = []

        for epoch in range(1, self.epochs + 1):
            current_lr = self.optimizer.param_groups[0]['lr']
            lrs.append(current_lr)

            sys.stderr.flush()

            print('Epoch {}, lr {}'.format(epoch, current_lr), flush=True)

            self.train_epoch(epoch)
            _, is_early_stopping_happened = self.val_epoch()

            if is_early_stopping_happened:
                break

        plt.plot(range(1, len(lrs) + 1), lrs)
        plt.title("Learning Rate Schedule")
        plt.xlabel("Epoch")
        plt.ylabel("Learning Rate")
        # Set the tick locations and labels for integer values
        plt.xticks(range(1, len(lrs) + 1, 1))
        plt.show()

    def train_epoch(self, epoch):
        """Run one optimisation pass over ``self.train_loader``."""
        self.model.train()

        train_loss = 0
        num_batches = len(self.train_loader)

        # Initialize tqdm progress bar
        pbar = tqdm(total=num_batches,
                    desc=f"Epoch {epoch}",
                    unit="batch",
                    position=0,
                    leave=True)

        for batch_idx, batch in enumerate(self.train_loader):
            # Clear stale gradients, then forward / backward / update.
            self.optimizer.zero_grad(set_to_none=True)
            preds, loss = self.training_step(batch, batch_idx)
            loss.backward()
            self.optimizer.step()

            train_loss += loss.item()

            # Update tqdm progress bar with the running mean loss
            pbar.set_postfix({"Train Loss":  f"{(train_loss / (batch_idx + 1)):.4f}"})
            pbar.update()

        pbar.close()
        sys.stderr.flush()

        # Guard against an empty loader.
        train_loss /= max(num_batches, 1)

        print(f"Epoch: {epoch}, Train Loss: {train_loss:.4f}", flush=True)

    @torch.no_grad()
    def val_epoch(self):
        """Evaluate on ``self.val_loader`` and update training state.

        Computes the zero-one error rate over all validation samples, steps
        the LR scheduler with it, saves the model when the rate improves and
        otherwise counts towards early stopping.

        Returns:
            ``(error_rate, stop)`` where ``stop`` is ``True`` once
            ``early_stop_patience`` epochs have passed without improvement.
        """
        self.model.eval()

        num_errors = 0
        num_samples = 0
        num_batches = len(self.val_loader)

        pbar = tqdm(total=num_batches,
                    desc=f"Validation Error Rate",
                    unit="batch",
                    position=0,
                    leave=True)

        for batch_idx, batch in enumerate(self.val_loader):
            preds, loss = self.validation_step(batch, batch_idx)

            predicted_labels = preds.argmax(dim=1).to('cpu')
            actual_labels = batch['label'].to('cpu')

            # Count misclassified samples (zero - one loss). Counts are
            # accumulated instead of averaging per-batch rates, because the
            # last batch can be smaller than the others.
            batch_errors = int((predicted_labels != actual_labels).sum().item())
            batch_count = actual_labels.numel()
            num_errors += batch_errors
            num_samples += batch_count

            error_rate = batch_errors / max(batch_count, 1)

            # Update tqdm progress bar with this batch's error rate
            pbar.set_postfix({"Validation Error Rate": f"{error_rate:.4f}"})
            pbar.update()

        pbar.close()
        sys.stderr.flush()

        mean_error_rate = (
            num_errors / num_samples if num_samples else float("nan")
        )

        print(f"Validation Error Rate: {mean_error_rate:.4f}", flush=True)

        # Updates lr (without warmup)
        self.scheduler.step(mean_error_rate)

        if mean_error_rate < self.best_error_rate:
            self.best_error_rate = mean_error_rate
            self.save_model()
            self.epochs_without_improvement = 0
            return (mean_error_rate, False)

        self.epochs_without_improvement += 1
        if self.epochs_without_improvement >= self.hparams.get("early_stop_patience", 5):
            print("Training stopped due to early stopping.")
            sys.stderr.flush()
            return (mean_error_rate, True)

        return (mean_error_rate, False)

    def save_model(self):
        """Write the wrapped model's ``state_dict`` to ``best_model.ckpt``.

        Uses ``self.output_path`` (which already carries the default
        directory) rather than ``hparams["output_path"]``, which may be absent.
        """
        self.output_path.mkdir(parents=True, exist_ok=True)
        checkpoint_file = os.path.join(str(self.output_path), "best_model.ckpt")
        torch.save(self.model.state_dict(), checkpoint_file)

    def _set_seed(self, seed):
        """Seed the Python, NumPy and torch (CPU and CUDA) generators."""
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def _build_text_transform(self):
        """Return the tokenizer used to encode the meme text."""
        return getTokenizer()

    def _build_image_transform(self):
        """Return the image pipeline configured by ``self.hparams``."""
        return imgtrans(self.hparams)

    def _build_dataset(self, dataset_key):
        """Build a ``HatefulMemesDataset``.

        ``dataset_key`` is either an hparams key holding the data path or the
        data path itself. Keys / paths containing ``"train"`` get the
        ``dev_limit`` cap and class balancing; all others get neither.
        """
        is_train = "train" in str(dataset_key)
        return HatefulMemesDataset(
            data_path=self.hparams.get(dataset_key, dataset_key),
            img_dir=self.hparams.get("img_dir"),
            image_transform=self.image_transform,
            text_transform=self.text_transform,
            # limit training samples only
            dev_limit=self.hparams.get("dev_limit", None) if is_train else None,
            # balance=True undersamples the training data.
            balance=is_train,
        )

    def _build_model(self):
        """Build the fusion classifier on top of a pretrained CLIP ViT-B/32."""
        model, preprocess = clip.load("ViT-B/32", device=device0)

        # NOTE(review): these are bound methods, not nn.Module instances, so
        # the CLIP weights are not registered as sub-modules of the returned
        # classifier and are therefore not part of the parameters handed to
        # the optimizer. Confirm that keeping CLIP frozen is intended.
        text_module = model.encode_text
        vision_module = model.encode_image

        return textAndVisionConcat(
            num_classes=self.hparams.get("num_classes", 2),
            loss_fn=torch.nn.CrossEntropyLoss(),
            text_module=text_module,
            vision_module=vision_module,
            text_feature_dim=self.text_feature_dim,
            vision_feature_dim=self.vision_feature_dim,
            fusion_mid_size=self.hparams.get("fusion_mid_size", 1024),
            fusion_output_size=self.hparams.get(
                "fusion_output_size", 512
            ),
            dropout_fusion_mid_p=self.hparams.get("dropout_fusion_mid_p"),
            dropout_fusion_out_p=self.hparams.get("dropout_fusion_out_p"),
            dropout_vision_feature_p=self.hparams.get("dropout_vision_feature_p"),
            dropout_text_feature_p=self.hparams.get("dropout_text_feature_p"),
        )

    @torch.no_grad()
    def make_submission_frame(self, test_path):
        """Predict on the split stored at ``test_path``.

        Returns:
            A DataFrame indexed by sample id with a float ``proba`` column
            (probability of class 1) and an int ``label`` column (argmax).
        """
        test_dataset = self._build_dataset(test_path)
        submission_frame = pd.DataFrame(
            index=test_dataset.samples_frame.id,
            columns=["proba", "label"]
        )
        test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            shuffle=False,
            batch_size=int(math.ceil(len(test_dataset)/4)),
            num_workers=self.hparams.get("num_workers", 2)
        )

        self.model.to(device0)
        self.model.eval()
        for batch in tqdm(test_dataloader, total=len(test_dataloader), position=0, leave=True):
            preds, _ = self.model(
                batch["text"].to(device0), batch["image"].to(device0)
            )
            preds = preds.to("cpu")

            # Hand pandas plain Python / NumPy values instead of tensors.
            batch_ids = batch["id"]
            batch_ids = (
                batch_ids.tolist() if torch.is_tensor(batch_ids)
                else list(batch_ids)
            )
            submission_frame.loc[batch_ids, "proba"] = preds[:, 1].numpy()
            submission_frame.loc[batch_ids, "label"] = preds.argmax(dim=1).numpy()
        submission_frame.proba = submission_frame.proba.astype(float)
        submission_frame.label = submission_frame.label.astype(int)
        return submission_frame

In [None]:
# Build datasets, model, optimizer and scheduler from the hparams defined above.
hateful_memes_model = HatefulMemesModel(hparams=hparams)

In [None]:
# Train with per-epoch validation, checkpointing and early stopping.
hateful_memes_model.fit()


The architecture of the model:
    1. text_transform: FastText - number of parameters: 0, because we use the pretrained model without fine-tuning
    2. image_transform: ResNet152 - number of parameters: 58,279,234
    3. text_module: Linear(in_features=300, out_features=512, bias=True) - number of parameters: 153,600
    4. vision_module: Linear(in_features=2048, out_features=512, bias=True) - number of parameters: 1,049,088
    5. fusion: Linear(in_features=1024, out_features=512, bias=True) - number of parameters: 524,800
    6. output: Linear(in_features=512, out_features=2, bias=True) - number of parameters: 1,026
    Total number of parameters: 59,007,748

Note: this summary appears to describe an earlier FastText/ResNet152 variant; the code above builds its text and vision modules from CLIP ViT-B/32.

In [None]:

# First run only: create the study instead of loading it.
#study = optuna.create_study(direction="minimize",study_name="test2",storage = uri2)

# Load the existing Optuna study named "test2" from the storage backend.
# NOTE(review): `uri2` is not defined in the visible part of this notebook -
# it must be set to your database URI before running this cell.
study = optuna.load_study(study_name="test2", storage=uri2) # uri2 = your DB uri

In [None]:
# Overrides for the hyperparameter search: fewer epochs per trial, no cap on
# the number of training samples, and a fixed batch size.
hparams["max_epochs"] = 6
hparams["dev_limit"] = None
hparams["batch_size"] = 64

In [None]:
# use optuna to tune hyperparameters
# define objective function
def objective(trial):
    """Train one model with trial-sampled hyperparameters and score it.

    The learning rate, dropout probabilities and image-augmentation settings
    are sampled from ``trial`` and written into the notebook-level ``hparams``
    dict (which is therefore mutated). A fresh ``HatefulMemesModel`` is then
    trained for ``max_epochs`` epochs; after every epoch the zero-one error
    rate over the whole validation loader is computed, printed and reported
    to Optuna so that the trial can be pruned.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters and to
            report intermediate values.

    Returns:
        float: The validation error rate measured after the final epoch
        (not the best epoch), which keeps new trials comparable with the
        ones already stored in the study.

    Raises:
        ValueError: If the configured number of epochs is smaller than 1,
            in which case there is nothing to evaluate.
        optuna.TrialPruned: If Optuna decides to prune the trial.
    """
    # sample hpsearch params
    hparams["lr"] = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    # Not searched at the moment: dropout_in_p, dropout_mid_p, dropout_out_p,
    # bert_dropout_p, bert_attn_dropout_p.
    hparams["dropout_fusion_mid_p"] = trial.suggest_float("dropout_fusion_mid_p", 0, 0.5, step=0.01)
    hparams["dropout_fusion_out_p"] = trial.suggest_float("dropout_fusion_out_p", 0, 0.5, step=0.01)
    hparams["dropout_vision_feature_p"] = trial.suggest_float("dropout_vision_feature_p", 0, 0.5, step=0.01)
    hparams["dropout_text_feature_p"] = trial.suggest_float("dropout_text_feature_p", 0, 0.5, step=0.01)

    hparams["horizontal_flip_p"] = trial.suggest_float("horizontal_flip_p", 0.0, 1.0)
    hparams["rotation"] = trial.suggest_int("rotation", 0, 15)
    hparams["brightness"] = trial.suggest_float("brightness", 0.0, 0.3)
    hparams["contrast"] = trial.suggest_float("contrast", 0.0, 0.3)
    hparams["saturation"] = trial.suggest_float("saturation", 0.0, 0.3)
    hparams["hue"] = trial.suggest_float("hue", 0.0, 0.3)
    hparams["random_erasing_p"] = trial.suggest_float("random_erasing_p", 0.0, 1.0)
    hparams["random_erasing_scale_min"] = trial.suggest_float("random_erasing_scale_min", 0.0, 0.1)
    # The upper scale bound is sampled at or above the lower bound just drawn.
    hparams["random_erasing_scale_max"] = trial.suggest_float("random_erasing_scale_max", hparams["random_erasing_scale_min"], 0.5)
    hparams["random_erasing_ratio_min"] = trial.suggest_float("random_erasing_ratio_min", 0.1, 1.0)
    hparams["random_erasing_ratio_max"] = trial.suggest_float("random_erasing_ratio_max", 1.0, 10.0)
    hparams["random_erasing_value"] = trial.suggest_float("random_erasing_value", 0.0, 0.6)
    hparams["gaussian_noise_std"] = trial.suggest_float("gaussian_noise_std", 0.0, 1.1)

    # train model
    model = HatefulMemesModel(hparams=hparams)
    model._set_seed(model.hparams.get("random_state", 42))
    model.epochs = model.hparams.get("max_epochs", 10)
    if model.epochs < 1:
        raise ValueError(
            f"max_epochs must be at least 1 to evaluate a trial, got {model.epochs}"
        )
    model.train_loader = model.train_dataloader()
    model.val_loader = model.val_dataloader()
    # State attributes set on the model before training starts.
    model.best_error_rate = 1.0
    model.epochs_without_improvement = 0

    mean_error_rate = float("nan")
    for epoch in range(1, model.epochs + 1):
        model.train_epoch(epoch)

        # validation
        with torch.no_grad():
            model.model.eval()

            # One float tensor of 0/1 mismatch flags per validation batch;
            # concatenated once after the loop instead of on every batch.
            mismatches = []
            for batch_idx, batch in enumerate(model.val_loader):
                preds, _ = model.validation_step(batch, batch_idx)

                predicted_labels = preds.argmax(dim=1).to('cpu', torch.float32)
                actual_labels = batch['label'].to('cpu', torch.float32)

                # Zero-one loss per sample.
                mismatches.append((predicted_labels != actual_labels).float())

            sys.stderr.flush()

            # Plain Python float so Optuna and the f-string below never have
            # to deal with a tensor; NaN when the validation loader is empty.
            if mismatches:
                mean_error_rate = torch.cat(mismatches).mean().item()
            else:
                mean_error_rate = float("nan")
            print(f"Validation Error Rate: {mean_error_rate:.4f}", flush=True)

            # Report the validation error rate back to Optuna
            trial.report(mean_error_rate, epoch)

            # Handle pruning based on the reported value.
            if trial.should_prune():
                raise optuna.TrialPruned()

    return mean_error_rate

# Run the search: stop after 20 new trials or after the 36000 s timeout
# (10 hours), whichever comes first.
study.optimize(objective, n_trials=20, timeout=36000, show_progress_bar=True)

In [None]:
# Clear gpu cache: release the unused memory cached by PyTorch's CUDA
# allocator before training the final model.
torch.cuda.empty_cache()

In [None]:
#hparams.update(study.trials[579].params)

In [None]:
# Overwrite the searched hyperparameters with those of the study's best trial.
hparams.update(study.best_trial.params)
#study.best_trial.params

In [None]:
# Overrides for the final training run, applied on top of the tuned values.
hparams.update({
    "dev_limit": None,
    "early_stop_patience": 4,
    "max_epochs": 40,
    "lr": 1e-4,
})

In [None]:
# Re-create the model with the tuned hyperparameters and final overrides.
hateful_memes_model = HatefulMemesModel(hparams=hparams)

In [None]:
# Train the final model (see HatefulMemesModel.fit for the training loop).
hateful_memes_model.fit()

In [None]:
# We should only have saved the best checkpoint: collect every *.ckpt file
# in model-outputs/ and fail loudly if there is not exactly one.
checkpoints = list(Path("model-outputs").glob("*.ckpt"))
assert len(checkpoints) == 1

# Display the checkpoint path as the cell output.
checkpoints

In [None]:
# Predict on the test set; the returned frame has "proba" and "label" columns.
submission = hateful_memes_model.make_submission_frame(test_path)
# Preview the first rows.
submission.head()

In [None]:
# Sanity check: mean "proba" within each predicted label.
submission.groupby("label").proba.mean()

In [None]:
# Number of test samples assigned to each predicted label.
submission.label.value_counts()

In [None]:
# Persist the predictions, keeping the frame's index as the first CSV column.
submission.to_csv("model-outputs/submission.csv", index=True)

In [None]:
# Score the submission's "proba" column against the "label" column of `test`.
# NOTE(review): the two columns are paired positionally via `.values`, which
# assumes `submission` and `test` list the samples in the same order — confirm.
proba = torch.tensor(submission['proba'].values)
label = torch.tensor(test['label'].values)

# Calculate AUC-ROC score
auc_roc = roc_auc_score(label, proba)
print(f"AUC-ROC Score: {auc_roc}")

# Calculate accuracy, using the rounded probability as the predicted class
predictions = proba.round().long()
accuracy = accuracy_score(label, predictions)
print(f"Accuracy: {accuracy}")

In [None]:
#plot_contour(study)

In [None]:
# Intermediate values reported by each trial during the search.
plot_intermediate_values(study)

In [None]:
# Objective value of every trial in the study, in trial order.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of hyperparameter values against the objective.
plot_parallel_coordinate(study)

In [None]:
# Per-hyperparameter slice plots of the objective value.
plot_slice(study)

OCR (Optical Character Recognition)

In [None]:
!pip install pytesseract

In [None]:
import pytesseract

# Single-image demo of pytesseract; swap the active path to try another meme.
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/42953.png"
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/01236.png"
image_path = "/kaggle/input/hate-memes/hateful_memes/img/01243.png"

# Convert the image to grayscale
image = Image.open(image_path).convert("L")

# Apply a median filter before running OCR.
image = image.filter(ImageFilter.MedianFilter())

# Run Tesseract with the English language model and flatten the result
# to a single line.
text = pytesseract.image_to_string(image, lang='eng')
text_without_newlines = text.replace('\n', ' ')
print(text_without_newlines)

In [None]:
!pip install easyocr
!pip install --upgrade opencv-python

In [None]:
import easyocr
import cv2

# Single-image demo of EasyOCR: English reader, running on the GPU.
reader = easyocr.Reader(['en'],gpu=True)

#image_path = "/kaggle/input/hate-memes/hateful_memes/img/42953.png"
image_path = "/kaggle/input/hate-memes/hateful_memes/img/01236.png"
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/01243.png"

result = reader.readtext(image_path)

# Keep only element 1 of every detection (the recognized string).
extracted_text = []
for detection in result:
    text = detection[1]
    # Perform post-processing to handle "o" and "0" confusion
    #text = text.replace('0', 'o')
    extracted_text.append(text)

# Join the detected fragments into one space-separated string.
text = " ".join(extracted_text)
# Print the extracted words
print(text)

In [None]:
!pip install python-doctr

In [None]:
!pip uninstall rapidfuzz -y
!pip install rapidfuzz==2.15.1

In [None]:
!pip uninstall pillow -y
!pip install pillow==7.1

In [None]:
!pip install tf2onnx

In [None]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Single-image demo of docTR using its pretrained OCR predictor.
model = ocr_predictor(pretrained=True)

# Load and process an image
# NOTE: every assignment below is commented out, so `image_path` keeps the
# value set by an earlier cell.
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/42953.png"
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/01236.png"
#image_path = "/kaggle/input/hate-memes/hateful_memes/img/01243.png"

single_img_doc = DocumentFile.from_images(image_path)

# Perform OCR on the image
result = model(single_img_doc)

#result.show(single_img_doc)

In [None]:
# Extract the words from the Document object by walking its
# pages -> blocks -> lines -> words hierarchy.
words = []
for page in result.pages:
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                words.append(word.value)

# Join the words into one space-separated string.
text = " ".join(words)
# Print the extracted words
print(text)

Checking classification accuracy on train

In [None]:
def text_from_pytesseract(image_path):
    """Return the text Tesseract reads from the image at ``image_path``.

    The image is converted to grayscale and median-filtered before OCR;
    newlines in the recognized text are replaced by spaces.
    """
    grayscale = Image.open(image_path).convert("L")
    denoised = grayscale.filter(ImageFilter.MedianFilter())
    raw_text = pytesseract.image_to_string(denoised, lang='eng')
    return raw_text.replace('\n', ' ')

In [None]:
# Lazily created EasyOCR readers, keyed by language. Building a Reader loads
# the detection and recognition models, so it must not happen once per image.
_EASYOCR_READERS = {}


def text_from_easyocr_cv2(image_path, reader=None):
    """Return the text EasyOCR reads from the image at ``image_path``.

    Args:
        image_path: Path of the image to run OCR on.
        reader: Optional object exposing ``readtext(image_path)``, e.g. an
            existing ``easyocr.Reader``. When omitted, a shared English GPU
            reader is created on first use and reused on later calls.

    Returns:
        str: The recognized text fragments (element 1 of each detection)
        joined by single spaces; an empty string when nothing is detected.
    """
    if reader is None:
        reader = _EASYOCR_READERS.get("en")
        if reader is None:
            reader = easyocr.Reader(['en'], gpu=True)
            _EASYOCR_READERS["en"] = reader

    detections = reader.readtext(image_path)
    return " ".join(detection[1] for detection in detections)

In [None]:
def text_from_doctr(image_path):
    """Return the text docTR reads from the image at ``image_path``.

    Uses the notebook-level ``model`` predictor and joins every recognized
    word, in page/block/line order, with single spaces.
    """
    document = DocumentFile.from_images(image_path)

    # Run the OCR predictor on the single-image document.
    prediction = model(document)

    # Flatten the pages -> blocks -> lines -> words hierarchy.
    tokens = []
    for page in prediction.pages:
        for block in page.blocks:
            for line in block.lines:
                tokens.extend(word.value for word in line.words)

    return " ".join(tokens)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calc_cosine_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two texts, so the score only
    reflects the vocabulary they contain.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])

    # Entry (0, 1) of the pairwise matrix is text1 versus text2.
    pairwise = cosine_similarity(tfidf_matrix)
    return pairwise[0][1]

In [None]:
texts = []
pytesseract_cos_similarity = []
easyocr_cv2_cos_similarity = []
doctr_cos_similarity = []

# Compare each OCR engine's output with the annotated text of the first 100
# rows of `train` (the frame read from the train JSON).
for index, row in train.head(100).iterrows():
    text = row['text']
    image = row['img']

    print(f"{index}:{text}")
    texts.append(text)

    image_path = f"/kaggle/input/hate-memes/hateful_memes/{image}"

    # pytesseract
    pytesseract_text = text_from_pytesseract(image_path)
    pytesseract_cos_similarity.append(calc_cosine_similarity(text, pytesseract_text))

    # easyocr & cv2
    easyocr_cv2_text = text_from_easyocr_cv2(image_path)
    easyocr_cv2_cos_similarity.append(calc_cosine_similarity(text, easyocr_cv2_text))

    # doctr
    doctr_text = text_from_doctr(image_path)
    doctr_cos_similarity.append(calc_cosine_similarity(text, doctr_text))

    print('######################################################')

# Average similarity per engine over all processed rows.
print(f"pytesseract average cosine similarity: {(sum(pytesseract_cos_similarity) / len(pytesseract_cos_similarity)):.4f}")
print(f"easyocr & cv2 average cosine similarity: {(sum(easyocr_cv2_cos_similarity) / len(easyocr_cv2_cos_similarity)):.4f}")
print(f"doctr average cosine similarity: {(sum(doctr_cos_similarity) / len(doctr_cos_similarity)):.4f}")