<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/2_Training_feature%20extraction%20models/1_1_Intent_recognition_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Installations ###
#####################

!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-v0bpm8oj
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-v0bpm8oj
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369370 sha256=460ef674c9f5672444590bf3604247ff29db4fbfd898ea159140d31319f4a85a
  Stored in directory: /tmp/pip-ephem-wheel-cache-4micrqxr/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Inst

In [None]:
### Imports ###
###############

# general
import numpy as np
import pandas as pd
import json
import os
import shutil
import statistics as stat

# image processing
from PIL import Image

# clip model
import clip

# optimization and dataloading
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

In [None]:
### Set data directory
##################

# Mount Google Drive so the project folder is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Project folder on Drive; inputs are read from Data, checkpoints go to trained_models
base_dir = '/content/drive/MyDrive/VideoAdEngagement/2_Training_feature extraction models'
data_dir = os.path.join(base_dir, 'Data')
save_dir = os.path.join(base_dir, 'trained_models')

Mounted at /content/drive


In [None]:
### Import Train Data ###
#########################

# Locations of the two label tables inside the data directory
data_path_train = os.path.join(data_dir, 'Intentonomy_train_subset.csv')
data_path_test = os.path.join(data_dir, 'Intentonomy_test.csv')

# Read each csv into its own dataframe (train first, then test)
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (data_path_train, data_path_test)
)

In [None]:
# check dataframe
# Sanity check of the class balance: number of training rows per intent category
df_train['category'].value_counts()

familiy                               100
financial and occupational success    100
health                                100
openness to experience                100
power                                 100
security and belonging                100
self-fulfill                          100
virtue                                 99
ambition and ability                   98
Name: category, dtype: int64

In [None]:
#df_test['category'].value_counts()

In [None]:
# load images
# Copy each image archive from Drive to the local runtime and unpack it into a
# folder named after the split. exist_ok=True keeps the cell re-runnable: without
# it a second run fails with FileExistsError because the folders already exist.
for split_name in ('Intentonomy_train', 'Intentonomy_test'):
    archive_name = split_name + '.zip'
    shutil.copy(os.path.join(data_dir, archive_name), './')
    os.makedirs(split_name, exist_ok=True)
    shutil.unpack_archive('./' + archive_name, extract_dir='./' + split_name)

In [None]:
### Load and instantiate model ###
##################################

# Prefer the first CUDA GPU; fall back to the CPU when no GPU is available
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the pretrained "ViT-B/32" CLIP model together with its image preprocessing
# callable; jit=False is set because the model is trained further below
model, preprocess = clip.load(
    "ViT-B/32",
    device=device,
    jit=False,
)

In [None]:
#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    """Cast all parameters of ``model``, and their gradients where present, to float32.

    The conversion happens in place on ``p.data`` / ``p.grad.data``; nothing is returned.

    Args:
        model: any object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    for p in model.parameters():
        p.data = p.data.float()
        # A parameter that received no gradient in the last backward pass (or is
        # frozen) has ``grad`` set to None; skip it instead of raising AttributeError.
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

In [None]:
### Create Custom Dataset ###
#############################

class IntentonomyDataset(Dataset):
  """Map-style dataset yielding ``(image, label)`` pairs for one Intentonomy split.

  Images are read from ``img_dir`` as ``<image_id>.jpeg`` and passed through the
  module-level ``preprocess`` callable (returned by ``clip.load``), which must be
  defined before items are requested.

  Args:
    df: dataframe with the columns ``image_id`` and ``category``.
    img_dir: folder containing the ``.jpeg`` files.
    transform: optional callable applied to the already preprocessed image.
  """

  def __init__(self, df, img_dir, transform=None):
    self.img_dir = img_dir
    # Both columns are stored as plain arrays so that __getitem__ indexes them by
    # position. Keeping the labels as a Series would index by dataframe label and
    # break (or return the wrong row) for a dataframe without a 0..n-1 index.
    self.file_name = df['image_id'].values
    self.label = df['category'].values
    self.transform = transform

  def __getitem__(self, index):
    img_path = os.path.join(self.img_dir, self.file_name[index] + '.jpeg')

    # The context manager closes the file handle once preprocessing is done
    with Image.open(img_path) as raw_img:
      img = preprocess(raw_img)

    if self.transform is not None:
      img = self.transform(img)

    return img, self.label[index]

  def __len__(self):
    return len(self.file_name)

In [None]:
### Dataset and Dataloader ###
##############################

BATCH_SIZE = 25

# One dataset per split, each reading from the folder its archive was unpacked into
train_dataset = IntentonomyDataset(df=df_train, img_dir='./Intentonomy_train')
test_dataset = IntentonomyDataset(df=df_test, img_dir='./Intentonomy_test')

# Training batches are reshuffled every epoch
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          num_workers=2)

# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         num_workers=2)

In [None]:
### Loss and optimizer ###
##########################

# Clip loss
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

def clip_loss(img, txt):
  """Return the mean of the image-side and text-side cross-entropy for one batch.

  The module-level ``model`` is called on ``(img, txt)`` and returns the
  per-image and per-text logits. The target for row ``i`` is index ``i``,
  i.e. the i-th image is matched with the i-th text.
  """
  per_image_logits, per_text_logits = model(img, txt)

  # Row i must pick column i
  targets = torch.arange(len(img), dtype=torch.long, device=device)

  image_term = loss_img(per_image_logits, targets)
  text_term = loss_txt(per_text_logits, targets)

  return (image_term + text_term) / 2

In [None]:
# optimizer
# Params used from paper, the lr is smaller, more safe for fine tuning to new dataset
optimizer = optim.Adam(
    model.parameters(),
    lr=5e-6,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=0.3,
)

In [None]:
### Labels ###
##############

# Intent categories; the position in this list is the class id
labels_list = [
    'virtue',
    'self-fulfill',
    'openness to experience',
    'security and belonging',
    'power',
    'health',
    'familiy',
    'ambition and ability',
    'financial and occupational success',
]

# labels dicts: class id -> label, and the inverse mapping
id2label = {idx: name for idx, name in enumerate(labels_list)}
label2id = {name: idx for idx, name in enumerate(labels_list)}

print(id2label)
print(label2id)

{0: 'virtue', 1: 'self-fulfill', 2: 'openness to experience', 3: 'security and belonging', 4: 'power', 5: 'health', 6: 'familiy', 7: 'ambition and ability', 8: 'financial and occupational success'}
{'virtue': 0, 'self-fulfill': 1, 'openness to experience': 2, 'security and belonging': 3, 'power': 4, 'health': 5, 'familiy': 6, 'ambition and ability': 7, 'financial and occupational success': 8}


In [None]:
### Training loop ###
#####################

EPOCH = 4
torch.manual_seed(12)

# NOTE(review): BATCH_SIZE (25) is larger than the number of labels (9), so a
# batch contains repeated captions, while clip_loss treats each image's own
# caption as the only correct match - confirm this is intended.
for epoch in range(EPOCH):

  batch_losses = []

  for images, label in train_loader:

    optimizer.zero_grad()

    images = images.to(device)

    # One caption per image, built from its category label
    texts = clip.tokenize([f"The picture represents {l}" for l in label]).to(device)

    loss = clip_loss(images, texts)
    loss.backward()

    if device == "cpu":
      optimizer.step()
    else:
      # Parameters and gradients are cast to fp32 for the optimizer step and the
      # weights are converted back afterwards (see the CLIP issue linked at
      # convert_models_to_fp32).
      convert_models_to_fp32(model)
      optimizer.step()
      clip.model.convert_weights(model)

    batch_losses.append(loss.item())

  ### Logging###
  ##############

  # Report the mean over all batches of the epoch; the loss of the last batch
  # alone is too noisy to judge progress.
  epoch_loss = stat.mean(batch_losses) if batch_losses else float('nan')
  print(f"Epoch: {epoch + 1}, loss: {epoch_loss}")

Epoch: 1, loss: 2.900390625
Epoch: 2, loss: 2.5078125
Epoch: 3, loss: 2.45703125
Epoch: 4, loss: 1.6982421875


In [None]:
### Validation Train Set ###
############################
labels_list_tok = clip.tokenize([f"The picture represents {l}" for l in labels_list]).to(device)

d = train_dataset
def validation(dataset=None):
  """Return the top-1 accuracy of ``model`` over a dataset of (image, label) pairs.

  Every image is scored against the tokenized prompts in ``labels_list_tok``;
  a sample counts as correct when the highest-scoring label equals its label.

  Args:
    dataset: dataset to evaluate; defaults to the module-level ``d``.

  Returns:
    Fraction of correctly classified samples.
  """
  if dataset is None:
    dataset = d  # looked up at call time, like the original global access

  hits = []

  for i in range(len(dataset)):

    # Fetch the sample once: every dataset[i] access opens and preprocesses the image again
    test_img, label = dataset[i]
    img = test_img.unsqueeze(0).to(device)

    with torch.no_grad():
      p, _ = model(img, labels_list_tok)
      p = p.softmax(dim=-1)
      assigned_label = p.argmax()

    hits.append(id2label[int(assigned_label)] == label)

  acc = np.sum(hits) / len(dataset)

  return acc

val_acc = validation()

print(f"Validation accuracy: {val_acc}")

Validation accuracy: 0.9520624303232998


In [None]:
### Validation ###
##################
labels_list_tok = clip.tokenize([f"The picture represents {l}" for l in labels_list]).to(device)

d = test_dataset
def validation(dataset=None):
  """Return the top-3 overlap accuracy of ``model`` over a multi-label dataset.

  Every image is scored against the tokenized prompts in ``labels_list_tok``;
  a sample counts as correct when at least one of the three highest-scoring
  labels is among the labels annotated for it.

  Args:
    dataset: dataset to evaluate; defaults to the module-level ``d``.

  Returns:
    Fraction of samples with at least one overlapping label.
  """
  if dataset is None:
    dataset = d  # looked up at call time, like the original global access

  hits = []

  for i in range(len(dataset)):

    # Fetch the sample once: every dataset[i] access opens and preprocesses the image again
    test_img, label = dataset[i]
    img = test_img.unsqueeze(0).to(device)

    with torch.no_grad():
      p, _ = model(img, labels_list_tok)
      p = p.softmax(dim=-1)

    # since multiple labels are assigned (overlapping in meaning) by different annotators in Intentonomy data collection process:
    # -> check overlap of the 3 predicted labels with highest probability and the assigned labels to measure accuracy
    top3_labels = [id2label[int(k)] for k in p.argsort()[0][-3:]]
    # assumes the label is a stringified list such as "['power', 'health']" - TODO confirm
    annotated_labels = label.strip("][").replace("'", "").split(", ")

    hits.append(not set(top3_labels).isdisjoint(annotated_labels))

  acc = np.sum(hits) / len(dataset)

  return acc

val_acc = validation()

print(f"Validation accuracy: {val_acc}")

Validation accuracy: 0.8333333333333334


In [None]:
### Save model ###
##################

# torch.save does not create missing folders; make sure the target folder exists
# so a finished training run is not lost to a FileNotFoundError.
os.makedirs(save_dir, exist_ok=True)

# The checkpoint holds the fine-tuned weights and the optimizer state
torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        }, os.path.join(save_dir, 'best_intentonomy_model.pt'))