<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/2_Training_feature%20extraction%20models/4_Scene_detection_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Installations ###
#####################

!pip uninstall -y transformers
!pip install transformers==4.28.0 datasets evaluate

In [None]:
### Imports ###
###############

# general
import numpy as np
import pandas as pd
import os
import shutil
from PIL import Image
import random

# Vision transformer model and data loading
import torch
from transformers import ViTForImageClassification, ViTFeatureExtractor
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_metric

# dataset
from torchvision.datasets import SUN397

In [None]:
### Prepare dataset ###
#######################

# Download (if needed) and extract SUN397 into the current working directory
root = os.path.expanduser('./')
dataset = SUN397(root=root, download=True)

Downloading https://vision.princeton.edu/projects/2010/SUN/SUN397.tar.gz to ./SUN397.tar.gz


100%|██████████| 39077296924/39077296924 [06:04<00:00, 107098509.45it/s]


Extracting ./SUN397.tar.gz to ./


In [None]:
# Maps each scene label used for training to the SUN397 sub-directory
# (relative to ./SUN397) that holds the images of that category.
# The insertion order matters: the sampling loop walks the dict in this order.
matching_dict = {
    # a - b
    'airport': 'a/airport_terminal',
    'alley': 'a/alley',
    'athlectic_field': 'a/athletic_field/outdoor',
    'auditorium': 'a/auditorium',
    'bar': 'b/bar',
    'basketball_court': 'b/basketball_court/outdoor',
    'bathroom': 'b/bathroom',
    'bedroom': 'b/bedroom',
    'beach': 'b/beach',
    'bistro': 'b/bistro/indoor',
    # c - g
    'canyon': 'c/canyon',
    'computer_room': 'c/computer_room',
    'desert': 'd/desert/sand',
    'discotheque': 'd/discotheque',
    'factory': 'f/factory/indoor',
    'field': 'f/field/wild',
    'forest': 'f/forest/broadleaf',
    'gym': 'g/gymnasium/indoor',
    # h - l
    'harbor': 'h/harbor',
    'highway': 'h/highway',
    'hill': 'h/hill',
    'kitchen': 'k/kitchen',
    'lake': 'l/lake/natural',
    'library': 'l/library/indoor',
    'living_room': 'l/living_room',
    'locker_room': 'l/locker_room',
    # m - r
    'market': 'm/market/outdoor',
    'mountain': 'm/mountain',
    'ocean': 'o/ocean',
    'office': 'o/office',
    'park': 'p/park',
    'raceway': 'r/raceway',
    'river': 'r/river',
    # s - t
    'skatepark': 's/skatepark',
    'snowfield': 's/snowfield',
    'stadium': 's/stadium/football',
    'street': 's/street',
    'swimming_pool': 's/swimming_pool/indoor',
    'tennis_court': 't/tennis_court/outdoor',
}

In [None]:
### sample 100 images per category

# create empty lists to store values
original_path = []
image_id = []
label = []

# set seed to be reproducible
random.seed(42)

# loop through categories and sample 100 random pictures
for key, value in matching_dict.items():
  category_dir = os.path.join('./SUN397', value)

  # os.listdir returns entries in arbitrary (filesystem dependent) order;
  # sort them so that the fixed seed yields the same sample on every machine
  dir_list = sorted(os.listdir(category_dir))
  random_sample = random.sample(dir_list, 100)

  # for random sample save original path, label and image id
  for path in random_sample:
    original_path.append(os.path.join(category_dir, path))
    label.append(key)
    image_id.append(path)

In [None]:
# assemble file names, source paths and labels of the sample in one dataframe
scenes_df = pd.DataFrame(
    {
        'image_id': image_id,
        'original_path': original_path,
        'label': label,
    }
)

In [None]:
# copy sampled images to new folder for model training
# (copying instead of moving leaves ./SUN397 untouched, so the sampling cell
# and this cell can be re-run and still produce the same result)

# all images end up in one flat folder, so their file names must not collide
if not scenes_df.image_id.is_unique:
  raise ValueError('sampled image file names are not unique across categories')

os.makedirs('./scenes_ds', exist_ok=True)
for source in scenes_df.original_path:
  target = os.path.join('./scenes_ds', os.path.basename(source))
  # skip files that a previous run of this cell already copied
  if not os.path.exists(target):
    shutil.copy2(source, target)

In [None]:
# check dataset: number of sampled images per label
scenes_df.label.value_counts()

airport             100
office              100
lake                100
library             100
living_room         100
locker_room         100
market              100
mountain            100
ocean               100
park                100
hill                100
raceway             100
river               100
skatepark           100
snowfield           100
stadium             100
street              100
swimming_pool       100
kitchen             100
highway             100
alley               100
bistro              100
athlectic_field     100
auditorium          100
bar                 100
basketball_court    100
bathroom            100
bedroom             100
beach               100
canyon              100
harbor              100
computer_room       100
desert              100
discotheque         100
factory             100
field               100
forest              100
gym                 100
tennis_court        100
Name: label, dtype: int64

In [None]:
### remove grey color images to fit style of videos to be analyzed with model
### (grey scale videos very rare to non-existent in youtube shorts marketing campaigns)

# collect index labels of images with grey scale values
invalid_idx = []
for idx, file in zip(scenes_df.index, scenes_df.image_id):
  # the context manager closes the image file again after inspection
  with Image.open(os.path.join('./scenes_ds', file)) as image:
    # getcolors() returns None when the image holds more distinct colours than
    # its default limit, and a list otherwise; a list is therefore used as the
    # marker for grey scale / low colour images
    color_count = image.getcolors()

  if color_count:
    invalid_idx.append(idx)

# remove according to detected index labels
# (labels instead of positions, so re-running this cell cannot drop wrong rows)
scenes_df = scenes_df.drop(invalid_idx)

In [None]:
# bring into hugging face dataset format
# (the dataframe index shows up as the '__index_level_0__' column in the
# dataset summary displayed below)
ds = Dataset.from_pandas(scenes_df)
ds

Dataset({
    features: ['image_id', 'original_path', 'label', '__index_level_0__'],
    num_rows: 3891
})

In [None]:
# convert the 'label' column to a class variable (ClassLabel feature)
ds = ds.class_encode_column("label")
# display the resulting feature to check the class names
ds.features['label']

Casting to class labels:   0%|          | 0/3891 [00:00<?, ? examples/s]

ClassLabel(names=['airport', 'alley', 'athlectic_field', 'auditorium', 'bar', 'basketball_court', 'bathroom', 'beach', 'bedroom', 'bistro', 'canyon', 'computer_room', 'desert', 'discotheque', 'factory', 'field', 'forest', 'gym', 'harbor', 'highway', 'hill', 'kitchen', 'lake', 'library', 'living_room', 'locker_room', 'market', 'mountain', 'ocean', 'office', 'park', 'raceway', 'river', 'skatepark', 'snowfield', 'stadium', 'street', 'swimming_pool', 'tennis_court'], id=None)

In [None]:
### load respective images into dataset in PIL format
# write function
def load_images(example):
  """Load the image file of one dataset row as an RGB PIL image.

  Args:
    example: one dataset row; its 'image_id' entry is the file name of the
      image inside './scenes_ds'.

  Returns:
    dict with the single key 'image' holding the PIL image in RGB mode.
  """

  # open as pil image
  pil_image = Image.open(os.path.join('./scenes_ds', example['image_id']))

  # convert every non-RGB image (RGBA, palette, CMYK, grey scale, ...) to RGB
  # so that all images have the three channels used as input for the VIT model
  if pil_image.mode != 'RGB':
    pil_image = pil_image.convert('RGB')

  return {'image' : pil_image}

# apply and check dataset
ds = ds.map(load_images)
ds

Map:   0%|          | 0/3891 [00:00<?, ? examples/s]

Dataset({
    features: ['image_id', 'original_path', 'label', '__index_level_0__', 'image'],
    num_rows: 3891
})

In [None]:
### Train Test Split ###
########################

# hold out 20% of the images, stratified by label, with a fixed seed
ds_split = ds.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)

In [None]:
# check dataset after splitting (row counts of the train and test part)
ds_split

DatasetDict({
    train: Dataset({
        features: ['image_id', 'original_path', 'label', '__index_level_0__', 'image'],
        num_rows: 3112
    })
    test: Dataset({
        features: ['image_id', 'original_path', 'label', '__index_level_0__', 'image'],
        num_rows: 779
    })
})

In [None]:
### Feature Extraction ###
##########################

# load feature extractor
# (taken from the same pretrained checkpoint name that is used for the model)
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)



In [None]:
# feature extraction is done on the fly for each batch during training
def transform(example_batch):
    """Turn a batch of dataset rows into model inputs.

    Args:
        example_batch: batch with the PIL images under 'image' and the
            encoded class ids under 'label'.

    Returns:
        The feature extractor output for the images, extended by a 'labels'
        entry that carries the class ids of the batch.
    """
    # collect the PIL images and let the feature extractor build the tensors
    images = list(example_batch['image'])
    encoded = feature_extractor(images, return_tensors='pt')

    # pass the labels along under the key used by the collate function
    encoded['labels'] = example_batch['label']
    return encoded

prepared_ds = ds_split.with_transform(transform)

In [None]:
# check if it works
# (the summary below still lists the raw columns; presumably the transform is
# only applied when rows are accessed - "on the fly" as noted above)
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['image_id', 'original_path', 'label', '__index_level_0__', 'image'],
        num_rows: 3112
    })
    test: Dataset({
        features: ['image_id', 'original_path', 'label', '__index_level_0__', 'image'],
        num_rows: 779
    })
})

In [None]:
# define custom collate function
def collate_fn(batch):
    """Combine a list of transformed examples into one batch.

    Args:
        batch: list of examples, each with a 'pixel_values' tensor and a
            'labels' value.

    Returns:
        dict with the stacked 'pixel_values' tensor and a 'labels' tensor.
    """
    pixel_values = []
    targets = []
    for example in batch:
        pixel_values.append(example['pixel_values'])
        targets.append(example['labels'])

    collated = {}
    collated['pixel_values'] = torch.stack(pixel_values)
    collated['labels'] = torch.tensor(targets)
    return collated

In [None]:
# define evaluation metric
metric = load_metric("accuracy")


def compute_metrics(p):
    """Compute the accuracy metric for one evaluation run.

    Args:
        p: prediction object providing the model outputs as `predictions`
            and the true class ids as `label_ids`.

    Returns:
        The result of the accuracy metric for the predicted classes.
    """
    # the predicted class is the one with the highest score along axis 1
    predicted_classes = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_classes, references=p.label_ids)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
### Instantiate Vision Transformer Model ###
############################################

# class names in the order of their encoded ids
labels = ds_split['train'].features['label'].names

model_name_or_path = 'google/vit-base-patch16-224-in21k'

# load the pretrained checkpoint with a classification head sized for the
# scene labels, and store the id <-> label mappings in the model config
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=dict((str(idx), name) for idx, name in enumerate(labels)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels)),
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
### Training ###
################

# define training arguments
training_args = TrainingArguments(
  output_dir="./scene_model",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=4,
  # mixed precision needs a CUDA device; fall back to full precision on CPU
  # instead of failing when no GPU runtime is attached
  fp16=torch.cuda.is_available(),
  # evaluate and save a checkpoint every 100 steps, log every 10 steps
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  # keep only the two most recent checkpoints on disk
  save_total_limit=2,
  # the 'image' column is needed by the on-the-fly transform, so columns that
  # the model does not consume must not be removed
  remove_unused_columns=False,
  push_to_hub=False,
  load_best_model_at_end=True,
  seed = 123
)

In [None]:
# instantiate trainer object: model and training arguments first,
# then the data sets, then the batching and evaluation helpers
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

In [None]:
# training
# (runs the training loop; the returned TrainOutput is displayed as cell output)
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
100,1.7174,1.623186,0.82285
200,0.9016,0.982491,0.825417
300,0.4773,0.701975,0.844673
400,0.2401,0.562203,0.853659
500,0.1268,0.49502,0.872914
600,0.1029,0.485081,0.8819
700,0.0772,0.446111,0.884467


TrainOutput(global_step=780, training_loss=0.6604501724243164, metrics={'train_runtime': 950.2996, 'train_samples_per_second': 13.099, 'train_steps_per_second': 0.821, 'total_flos': 9.649401734236078e+17, 'train_loss': 0.6604501724243164, 'epoch': 4.0})

In [None]:
### Set data directory
##################

# connect to drive
# (the trained model is saved below '/content/drive' in the next cell)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# save best model
# (the training arguments set load_best_model_at_end=True, so the trainer is
# expected to hold the best checkpoint at this point)
save_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models/trained_models'
# the model is written to the sub-folder 'best_scene_detection_model' of save_dir
trainer.save_model(os.path.join(save_dir,'best_scene_detection_model'))