<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/2_Training_feature%20extraction%20models/2_Memorability_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Installations ###
#####################

!pip uninstall -y transformers
!pip install transformers==4.28.0 datasets evaluate

[0mCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_

In [None]:
### Imports ###
###############

# general
import numpy as np
import pandas as pd
import os
import shutil
from PIL import Image

# Vision transformer model and dataset object
import torch
from transformers import ViTForImageClassification, ViTFeatureExtractor
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_metric

In [None]:
### Set data directory
##################

# mount Google Drive into the runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# data_dir: Drive folder the dataset archive is copied from
# save_dir: Drive folder the fine-tuned model is written to at the end of the notebook
data_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models/Data'
save_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models/trained_models'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Load dataset ###
####################

# stage the archive in the working directory first, then extract it in place
lamem_archive = os.path.join(data_dir, 'download_lamem.tar.gz')
shutil.copy(lamem_archive, './')
shutil.unpack_archive('./download_lamem.tar.gz', extract_dir='./')

In [None]:
# load the first training split into a dataframe; the file is space-separated and
# the two column names (image file, memorability score) are supplied explicitly
df_lamem = pd.read_csv('./lamem/splits/train_1.txt', sep=" ", names = ['img_file', 'mem_score'])

In [None]:
### Dataset preparation and pre-processing ###
##############################################

# assign categories -> choose so that enough data in each bin and some clear separation between
def assign_mem_cat(row):
  """Map a row's `mem_score` to a memorability bin label.

  The bins deliberately leave gaps between them so the classes are clearly
  separated; scores that fall into a gap are labelled 'delete'.

    mem_score <= 0.5         -> 'low_mem'
    0.6 < mem_score <= 0.7   -> 'medium_mem'
    mem_score > 0.9          -> 'high_mem'
    anything else            -> 'delete'

  Args:
    row: mapping-like object with a 'mem_score' entry.

  Returns:
    One of 'low_mem', 'medium_mem', 'high_mem' or 'delete'.
  """
  score = row['mem_score']

  if score <= 0.5:
    return 'low_mem'
  if 0.6 < score <= 0.7:
    return 'medium_mem'
  if score > 0.9:
    return 'high_mem'
  return 'delete'

# label every row with its memorability bin (row-wise apply, hence axis=1)
df_lamem['mem_cat'] = df_lamem.apply(assign_mem_cat, axis=1)

In [None]:
# remove entries that are outside of the selected mem_score bins.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column (which would otherwise travel into the dataset features), and it keeps
# this cell safe to re-run: without it each run adds another index column and
# eventually fails because the column name already exists.
df_lamem = df_lamem.loc[df_lamem['mem_cat'] != 'delete'].reset_index(drop=True)

In [None]:
# check category distribution (number of rows per memorability bin after filtering)
df_lamem['mem_cat'].value_counts()

medium_mem    8226
high_mem      4983
low_mem       1647
Name: mem_cat, dtype: int64

In [None]:
# take a class-balanced subset (1500 rows per category, fixed random_state for
# reproducibility) to make training faster; this is a sufficient amount of data
# to fine-tune a pre-trained vision transformer model
df_lamem_subset = df_lamem.groupby('mem_cat').sample(n = 1500, random_state = 42).reset_index(drop = True)

In [None]:
# check category distribution of the subset dataframe (every bin should hold 1500 rows)
df_lamem_subset['mem_cat'].value_counts()

high_mem      1500
low_mem       1500
medium_mem    1500
Name: mem_cat, dtype: int64

In [None]:
# bring into Hugging Face `Dataset` format (one feature per dataframe column)
ds = Dataset.from_pandas(df_lamem_subset)
ds

Dataset({
    features: ['index', 'img_file', 'mem_score', 'mem_cat'],
    num_rows: 4500
})

In [None]:
# convert mem_cat to a class variable (ClassLabel); the displayed `names` list
# shows which string label corresponds to which integer class id
ds = ds.class_encode_column("mem_cat")
ds.features['mem_cat']

Casting to class labels:   0%|          | 0/4500 [00:00<?, ? examples/s]

ClassLabel(names=['high_mem', 'low_mem', 'medium_mem'], id=None)

In [None]:
# load respective images into dataset in PIL format
def load_images(example):
  """Open the image referenced by `example['img_file']` and return it in RGB mode.

  Args:
    example: a single dataset row; only its 'img_file' entry is read, and it is
      resolved relative to './lamem/images'.

  Returns:
    dict with one key, 'image', holding a PIL image whose mode is 'RGB'.
  """
  pil_image = Image.open(os.path.join('./lamem/images', example['img_file']))

  # Decide by the image mode rather than by `getcolors()`: `getcolors()` returns
  # None as soon as an image has more than 256 distinct colours, so a non-RGB
  # image with many colours (e.g. RGBA or CMYK) would slip through unconverted,
  # and the call scans every pixel to answer what the mode already tells us.
  if pil_image.mode != 'RGB':
    pil_image = pil_image.convert('RGB')

  return {'image' : pil_image}

# run load_images on every row, adding an 'image' column to the dataset
ds = ds.map(load_images)
ds

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'img_file', 'mem_score', 'mem_cat', 'image'],
    num_rows: 4500
})

In [None]:
### Train Test Split ###
########################

# 80/20 split, stratified on the class column so both parts keep the class balance;
# the seed makes the split reproducible
ds_split = ds.train_test_split(test_size = 0.2, stratify_by_column= 'mem_cat', seed = 42)

In [None]:
# inspect the resulting train / test parts
ds_split

DatasetDict({
    train: Dataset({
        features: ['index', 'img_file', 'mem_score', 'mem_cat', 'image'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['index', 'img_file', 'mem_score', 'mem_cat', 'image'],
        num_rows: 900
    })
})

In [None]:
### Feature Extraction ###
##########################

# load the feature extractor that belongs to the pre-trained checkpoint, so images
# are pre-processed with that checkpoint's own configuration
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [None]:
# write function to do feature extraction on the fly for batch in training process
def transform(example_batch):
    """Convert a batch of raw examples into model inputs.

    Args:
        example_batch: batch in column format; its 'image' entry is an iterable
            of PIL images and its 'mem_cat' entry holds the matching class ids.

    Returns:
        The feature extractor's output (PyTorch tensors) with the class ids
        attached under the additional key 'labels'.
    """
    images = list(example_batch['image'])
    batch_inputs = feature_extractor(images, return_tensors='pt')

    # the collate function reads the targets from the 'labels' key
    batch_inputs['labels'] = example_batch['mem_cat']
    return batch_inputs

# attach `transform` so it is applied on the fly when examples are accessed
prepared_ds = ds_split.with_transform(transform)

In [None]:
# check if it works; the repr still lists the raw columns because the transform
# is only applied when examples are accessed
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['index', 'img_file', 'mem_score', 'mem_cat', 'image'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['index', 'img_file', 'mem_score', 'mem_cat', 'image'],
        num_rows: 900
    })
})

In [None]:
# define custom collate function
def collate_fn(batch):
    """Assemble a list of per-example dicts into a single batch dict.

    Args:
        batch: sequence of dicts, each providing a 'pixel_values' tensor and a
            'labels' value.

    Returns:
        dict with 'pixel_values' (the per-example tensors stacked along a new
        leading dimension) and 'labels' (a tensor built from the label values).
    """
    pixel_tensors = []
    label_values = []
    for item in batch:
        pixel_tensors.append(item['pixel_values'])
        label_values.append(item['labels'])

    return {
        'pixel_values': torch.stack(pixel_tensors),
        'labels': torch.tensor(label_values),
    }

In [None]:
# define evaluation metric
def compute_metrics(p):
    """Compute classification accuracy for the evaluation loop.

    Accuracy is calculated directly with numpy rather than through
    `load_metric("accuracy")`: `load_metric` is deprecated (it emits a
    FutureWarning when called) and has to download a builder script first.

    Args:
        p: object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference class ids).

    Returns:
        dict with a single key, 'accuracy': the fraction of examples whose
        highest-scoring class equals the reference label, as a Python float.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    reference_ids = np.asarray(p.label_ids)
    return {'accuracy': float(np.mean(predicted_ids == reference_ids))}

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
### Instantiate Vision Transformer Model ###
############################################

# class names, listed in the order of their integer class ids
labels = ds_split['train'].features['mem_cat'].names

# checkpoint to fine-tune (same one the feature extractor was loaded from)
model_name_or_path = 'google/vit-base-patch16-224-in21k'

# A new classification head with len(labels) outputs is created on top of the
# pre-trained backbone. Both label mappings use integer class ids: id2label maps
# id -> name and label2id maps name -> id. Passing the ids as strings would
# leave string values in label2id, which does not match the documented
# Dict[str, int] type.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={i: c for i, c in enumerate(labels)},
    label2id={c: i for i, c in enumerate(labels)}
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
### Training ###
################

# define training arguments
training_args = TrainingArguments(
  output_dir="./laMemModel",        # checkpoints are written here (local runtime disk)
  per_device_train_batch_size=16,
  evaluation_strategy="steps",      # evaluate every `eval_steps` optimisation steps
  num_train_epochs=4,
  # mixed precision needs a CUDA device; fall back to full precision on a
  # CPU-only runtime instead of failing when the arguments are constructed
  fp16=torch.cuda.is_available(),
  save_steps=100,                   # checkpoint at the same cadence as evaluation
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-5,
  save_total_limit=2,               # cap the number of checkpoints kept on disk
  # the transform reads the raw 'image' / 'mem_cat' columns on the fly, so the
  # Trainer must not drop them
  remove_unused_columns=False,
  push_to_hub=False,
  # reload the best checkpoint when training finishes; no metric_for_best_model
  # is given, so the library default is used to decide which one is best
  load_best_model_at_end=True,
  seed = 123
)

In [None]:
# instantiate trainer object: model and hyper-parameters first, then the data,
# then the helpers that batch examples and score predictions
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds["train"],
    # NOTE(review): the 'test' part doubles as the evaluation set during training
    eval_dataset=prepared_ds["test"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

In [None]:
# training: run the fine-tuning loop; evaluation results are reported every
# `eval_steps` steps as configured in training_args
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
100,0.9071,0.890373,0.657778
200,0.7499,0.707638,0.687778
300,0.5747,0.658759,0.693333
400,0.6054,0.637721,0.707778
500,0.4924,0.645122,0.708889
600,0.4068,0.636307,0.708889
700,0.4494,0.634341,0.711111
800,0.3737,0.63724,0.714444
900,0.3546,0.642221,0.711111


TrainOutput(global_step=900, training_loss=0.588798910776774, metrics={'train_runtime': 572.4363, 'train_samples_per_second': 25.156, 'train_steps_per_second': 1.572, 'total_flos': 1.1158946517712896e+18, 'train_loss': 0.588798910776774, 'epoch': 4.0})

In [None]:
# save best model to Drive; training_args sets load_best_model_at_end=True, so
# the trainer is presumably holding the best checkpoint at this point
trainer.save_model(os.path.join(save_dir,'best_memorability_model'))