# [Task 2B: Multimodal Propagandistic Memes Classification](https://araieval.gitlab.io/task2/) at [ArabicNLP 2024](https://arabicnlp2024.sigarab.org/) @ACL 2024

@Author: Md. Arid Hasan

Given a meme (text overlayed image), the task is to detect whether the content is propagandistic.



### Downloading dataset from github

In [1]:
!wget https://gitlab.com/araieval/araieval_arabicnlp24/-/raw/main/task2/data/arabic_memes_propaganda_araieval_24_train.json
!wget https://gitlab.com/araieval/araieval_arabicnlp24/-/raw/main/task2/data/arabic_memes_propaganda_araieval_24_dev.json
!wget https://gitlab.com/araieval/araieval_arabicnlp24/-/raw/main/task2/data/arabic_memes_araieval_24_train_dev.tar.gz

--2024-03-22 06:07:23--  https://gitlab.com/araieval/araieval_arabicnlp24/-/raw/main/task2/data/arabic_memes_propaganda_araieval_24_train.json
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1027033 (1003K) [text/plain]
Saving to: ‘arabic_memes_propaganda_araieval_24_train.json’


2024-03-22 06:07:23 (16.3 MB/s) - ‘arabic_memes_propaganda_araieval_24_train.json’ saved [1027033/1027033]

--2024-03-22 06:07:24--  https://gitlab.com/araieval/araieval_arabicnlp24/-/raw/main/task2/data/arabic_memes_propaganda_araieval_24_dev.json
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 150732 (147K) [text/plain]
Saving to: ‘arabic_memes_propaganda_araieval_24_dev.json’


2024

#### Extract images

In [2]:

!tar -xvzf arabic_memes_araieval_24_train_dev.tar.gz

data/
data/images/
data/arabic_memes_fb_insta_pinterest/
data/fb_memes/
data/fb_memes/arabic_memes_officiel/
data/fb_memes/ArabMemes/
data/fb_memes/ArabianMemez/
data/fb_memes/ArabianMemez/278858577_1339875849853306_5941896038764612993_n.jpg
data/fb_memes/ArabianMemez/36526999_401564060351161_8715172479686934528_n.jpg
data/fb_memes/ArabianMemez/91360473_824623961378500_3826031840225918976_n.jpg
data/fb_memes/ArabianMemez/118206419_928062101034685_4055080010757517077_n.jpg
data/fb_memes/ArabianMemez/34157766_378837719290462_338446270488117248_n.jpg
data/fb_memes/ArabianMemez/339604724_240280365162274_8475128203519352213_n.jpg
data/fb_memes/ArabianMemez/90937780_821464438361119_600839849963945984_n.jpg
data/fb_memes/ArabianMemez/119461671_946759942498234_4488099900544799148_n.jpg
data/fb_memes/ArabianMemez/325596730_3561165430866879_9207462464562099758_n.jpg
data/fb_memes/ArabianMemez/272646015_1281287772378781_3628850103179921723_n.jpg
data/fb_memes/ArabianMemez/322699002_70750273423676

### installing required libraries.
 - transformers
 - datasets
 - evaluate
 - accelerate

In [3]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

#### importing required libraries and setting up logger

In [4]:
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from torchvision.transforms import Compose, Normalize, ToTensor, Resize, CenterCrop
from datasets import load_dataset, Dataset, DatasetDict
import torch

import transformers
from transformers import (
    ConvNextFeatureExtractor,
    ResNetConfig,
    ResNetForImageClassification,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

### Defining the training, validation, and test data

In [5]:
train_file = 'arabic_memes_propaganda_araieval_24_train.json'
validation_file = 'arabic_memes_propaganda_araieval_24_dev.json'
# test_file = 'arabic_memes_propaganda_araieval_24_test.json'

### Setting up the training parameters

In [6]:
training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir="./resnet_50/",
    overwrite_output_dir=True,
    remove_unused_columns=False,
    local_rank= 1,
    load_best_model_at_end=True,
    save_total_limit=2,
    save_strategy="no"
)

max_train_samples = None
max_eval_samples=None
max_predict_samples=None
batch_size = 16

In [7]:
transformers.utils.logging.set_verbosity_info()

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False

#### Defining the Model

In [8]:
model_name = 'resnet50'

#### setting the random seed

In [9]:
set_seed(training_args.seed)

#### Loading data files

In [10]:
import json
#from PIL import Image
import PIL
from datasets import Image
from tqdm import tqdm

# Image.open(obj['img_path']).convert("RGB")

def read_data(fpath, is_test=False):
  if is_test:
    data = {'id': [], 'image': []}
    js_obj = json.load(open(fpath, encoding='utf-8'))
    for obj in tqdm(js_obj):
      data['id'].append(obj['id'])
      data['image'].append(obj['img_path'])
  else:
    data = {'id': [], 'image': [], 'label': []}
    js_obj = json.load(open(fpath, encoding='utf-8'))
    for obj in tqdm(js_obj):
      data['id'].append(obj['id'])
      data['image'].append(obj['img_path'])
      data['label'].append(obj['class_label'])
  return pd.DataFrame.from_dict(data)


l2id = {'not_propaganda': 0, 'propaganda': 1}
train_df = read_data(train_file)
train_df['label'] = train_df['label'].map(l2id)
train_df = Dataset.from_pandas(train_df).cast_column("image", Image())
validation_df = read_data(validation_file)
validation_df['label'] = validation_df['label'].map(l2id)
validation_df = Dataset.from_pandas(validation_df).cast_column("image", Image())
# test_df = read_data(test_file)
# #test_df['label'] = test_df['label'].map(l2id)
# test_df = Dataset.from_pandas(test_df).cast_column("image", Image())



#data_files = {"train": train_df, "validation": validation_df, "test": validation_df}
data_files = {"train": train_df, "validation": validation_df}
for key in data_files.keys():
    logger.info(f"loading a local file for {key}")
raw_datasets = DatasetDict(
    {"train": train_df, "validation": validation_df} # , "test": test_df
)

100%|██████████| 2143/2143 [00:00<00:00, 485859.11it/s]
100%|██████████| 312/312 [00:00<00:00, 234100.69it/s]
INFO:__main__:loading a local file for train
INFO:__main__:loading a local file for validation


##### Extracting number of unique labels

In [11]:
# Labels
label_list = raw_datasets["train"].unique("label")
label_list.sort()  # sort the labels for determine
num_labels = len(label_list)

### Loading Pretrained Configuration, Tokenizer and Model

In [12]:
config = ResNetConfig(
        num_channels=1,
        layer_type="basic",
        depths=[2, 2],
        hidden_sizes=[32, 64],
        num_labels=num_labels,
)

model = ResNetForImageClassification(config)

#### Preprocessing the raw_datasets

In [13]:
feature_extractor = ConvNextFeatureExtractor(
    do_resize=True, do_normalize=False, image_mean=[0.45], image_std=[0.22]
)
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
_transforms = Compose([Resize(256), CenterCrop(224), ToTensor(), normalize])

def transforms(example_batch):
    """Apply _train_transforms across a batch."""
    # print(example_batch)
    # black and white
    example_batch["pixel_values"] = [_transforms(pil_img.convert("L")) for pil_img in example_batch["image"]]
    return example_batch



#### Finalize the training data for training the model

In [14]:
if "train" not in raw_datasets:
    raise ValueError("requires a train dataset")
train_dataset = raw_datasets["train"]
if max_train_samples is not None:
    max_train_samples_n = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples_n))
train_dataset.set_transform(transforms)

In [15]:
train_dataset

Dataset({
    features: ['id', 'image', 'label'],
    num_rows: 2143
})

#### Finalize the development/evaluation data for evaluating the model

In [16]:
if "validation" not in raw_datasets:
    raise ValueError("requires a validation dataset")
eval_dataset = raw_datasets["validation"]
if max_eval_samples is not None:
    max_eval_samples_n = min(len(eval_dataset), max_eval_samples)
    eval_dataset = eval_dataset.select(range(max_eval_samples_n))
eval_dataset.set_transform(transforms)

#### Finalize the test data for predicting the unseen test data using the model

In [None]:
if "test" not in raw_datasets and "test_matched" not in raw_datasets:
    raise ValueError("requires a test dataset")
predict_dataset = raw_datasets["test"]
if max_predict_samples is not None:
    max_predict_samples_n = min(len(predict_dataset), max_predict_samples)
    predict_dataset = predict_dataset.select(range(max_predict_samples_n))
predict_dataset = predict_dataset.set_transform(transforms)

#### Log a few random samples from the training set

In [18]:
for index in random.sample(range(len(train_dataset)), 3):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

INFO:__main__:Sample 456 of the training set: {'id': 'data/arabic_memes_fb_insta_pinterest/Facebook/images/sanat.meme_/79517291_1444731972385670_3984977326792453161_n.jpg', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=526x832 at 0x7E845758ED70>, 'label': 1, 'pixel_values': tensor([[[ 0.0579,  0.5927,  0.0045,  ..., -1.3146, -1.2255, -1.2077],
         [-0.1203, -0.1916, -0.2451,  ..., -1.4929, -1.2255, -1.1720],
         [-0.4590, -0.4590, -0.1381,  ..., -1.2968, -1.3859, -1.2611],
         ...,
         [ 0.0401, -0.3520, -0.5303,  ..., -0.9581, -0.9581, -0.9759],
         [ 0.0936, -0.3699, -0.6016,  ..., -0.9938, -0.9938, -0.9938],
         [ 0.2184, -0.2094, -0.5660,  ..., -1.0116, -1.0116, -0.9938]]])}.
INFO:__main__:Sample 102 of the training set: {'id': 'data/arabic_memes_fb_insta_pinterest/Pinterest/images/pinterest_image_august13/www.pinterest.com_pin_864409722250727930/4e94ecf08d54e8922e9f8fc0bcdee1cf.jpg', 'image': <PIL.JpegImagePlugin.JpegImageFile image 

#### Get the metric function `accuracy`

In [19]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

#### Predictions and label_ids field and has to return a dictionary string to float.

In [20]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}


#### Data Collator

In [21]:
def collate_fn(examples):
    # print(examples)
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}
data_collator = collate_fn

#### Initialize our Trainer

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset, # if you have development and test set, uncomment this line
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


#### Training our model

In [23]:
train_result = trainer.train()
metrics = train_result.metrics
max_train_samples = (
    max_train_samples if max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))



[INFO|trainer.py:1812] 2024-03-22 06:11:21,047 >> ***** Running training *****
[INFO|trainer.py:1813] 2024-03-22 06:11:21,048 >>   Num examples = 2,143
[INFO|trainer.py:1814] 2024-03-22 06:11:21,053 >>   Num Epochs = 2
[INFO|trainer.py:1815] 2024-03-22 06:11:21,055 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:1818] 2024-03-22 06:11:21,058 >>   Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:1819] 2024-03-22 06:11:21,059 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1820] 2024-03-22 06:11:21,061 >>   Total optimization steps = 268
[INFO|trainer.py:1821] 2024-03-22 06:11:21,063 >>   Number of trainable parameters = 183,554


Step,Training Loss


[INFO|trainer.py:2067] 2024-03-22 06:20:09,039 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)




#### Saving the tokenizer too for easy upload

In [24]:
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

[INFO|trainer.py:3067] 2024-03-22 06:20:09,069 >> Saving model checkpoint to ./resnet_50/
[INFO|configuration_utils.py:473] 2024-03-22 06:20:09,077 >> Configuration saved in ./resnet_50/config.json
[INFO|modeling_utils.py:2454] 2024-03-22 06:20:09,089 >> Model weights saved in ./resnet_50/model.safetensors
[INFO|image_processing_utils.py:257] 2024-03-22 06:20:09,092 >> Image processor saved in ./resnet_50/preprocessor_config.json


***** train metrics *****
  epoch                    =        2.0
  total_flos               =   220578GF
  train_loss               =     0.6074
  train_runtime            = 0:08:47.97
  train_samples            =       2143
  train_samples_per_second =      8.118
  train_steps_per_second   =      0.508


#### Evaluating our model on validation/development data

In [25]:
logger.info("*** Evaluate ***")

metrics = trainer.evaluate(eval_dataset=eval_dataset)

max_eval_samples = (
    max_eval_samples if max_eval_samples is not None else len(eval_dataset)
)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

INFO:__main__:*** Evaluate ***
[INFO|trainer.py:3376] 2024-03-22 06:20:09,119 >> ***** Running Evaluation *****
[INFO|trainer.py:3378] 2024-03-22 06:20:09,123 >>   Num examples = 312
[INFO|trainer.py:3381] 2024-03-22 06:20:09,126 >>   Batch size = 16


***** eval metrics *****
  epoch                   =        2.0
  eval_accuracy           =     0.7083
  eval_loss               =     0.5812
  eval_runtime            = 0:00:14.76
  eval_samples            =        312
  eval_samples_per_second =     21.131
  eval_steps_per_second   =      1.355


### Predecting the test data

In [26]:
# if the test set is available, you don't need to run this cell
predict_dataset = eval_dataset
predict_dataset

Dataset({
    features: ['id', 'image', 'label'],
    num_rows: 312
})

In [27]:
id2l = {0:'not_propaganda', 1:'propaganda'}
logger.info("*** Predict ***")
#predict_dataset = predict_dataset.remove_columns("label")
#ids = predict_dataset['id']
#image = predict_dataset['image']
#predict_dataset = predict_dataset.remove_columns("id")
predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
predictions = np.argmax(predictions, axis=1)
output_predict_file = os.path.join(training_args.output_dir, f"task2B_TeamName.tsv")
if trainer.is_world_process_zero():
    with open(output_predict_file, "w") as writer:
        logger.info(f"***** Predict results *****")
        writer.write("id\tlabel\trun_id\n")
        for index, item in enumerate(predictions):
            item = label_list[item]
            item = id2l[item]
            writer.write(f"{predict_dataset[index]['id']}\t{item}\t{model_name}\n")

INFO:__main__:*** Predict ***
[INFO|trainer.py:3376] 2024-03-22 06:20:23,925 >> ***** Running Prediction *****
[INFO|trainer.py:3378] 2024-03-22 06:20:23,927 >>   Num examples = 312
[INFO|trainer.py:3381] 2024-03-22 06:20:23,931 >>   Batch size = 16


INFO:__main__:***** Predict results *****


#### Saving the model into card

In [None]:
kwargs = {"finetuned_from": model_name, "tasks": "text-classification"}
trainer.create_model_card(**kwargs)

[INFO|modelcard.py:450] 2024-03-20 05:28:44,933 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7916666865348816}]}
