In [None]:
#!pip install opencv
!pip install datasets
!pip install transformers
!pip install pandas
!pip install torchvision
!pip install albumentations
#!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
!pip install ipdb


In [1]:
import torch
from torchvision import transforms 
from torchvision import models as models

import pandas
import json
import argparse
import logging
import os
import pickle

import numpy as np
import torch.nn as nn

from transformers import CLIPModel, CLIPProcessor, AutoModel, AutoConfig
from transformers import Trainer, TrainingArguments

from transformers.modeling_outputs import SequenceClassifierOutput

from datasets import load_dataset, load_metric, DownloadConfig, load_from_disk, DatasetDict

from sklearn import metrics


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## CONFIGURATION

data_folder = os.path.expanduser('~/code/clevr-poc/data')
constraint_types_tensor_file_path = os.path.join(data_folder, 'constraint_types_tensor.pickle')
properties_file_path = os.path.join(data_folder, 'properties.json')

model_path = "openai/clip-vit-base-patch32"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

train_batch_size = 8
eval_batch_size = 8
num_workers = 8
pin_memory=8
gradient_accumulation=4
epochs = 20

max_length = 42

dropout = 0.1



cuda:0


In [3]:
# get the number of classes
with open (properties_file_path, 'rb') as f:
    properties = json.load(f)

total_number_of_classes = 0
for key, value in properties.items():
    total_number_of_classes =  total_number_of_classes + len(value.keys())
 

In [4]:
# get constraints tensors

with open (constraint_types_tensor_file_path, 'rb') as f:
    constraint_types_tensor = pickle.load(f)
num_constraint_types = len(constraint_types_tensor)

print(num_constraint_types)

30


In [5]:
class ImageConstraintTypeClassification(nn.Module):
    def __init__(self, device, input_dim=2048, output_dim=None):
        super(ImageConstraintTypeClassification,self).__init__()    
        self.device = device
        self.model = models.resnet50(progress=True, pretrained=True)

        for param in self.model.parameters():
            param.requires_grad = False

        self.model.fc = nn.Linear(2048, num_constraint_types)
        #self.model = nn.Linear(input_dim, output_dim)
        self.model.to(self.device)
    
    def forward(self, x):
        output = self.model(x)
        return output

class ClipClassification(nn.Module):
    def __init__(self, 
                 device, 
                 checkpoint, 
                 clip_embedding_size,
                 ctype_embedding_size,
                 output_dim,
                 num_constraint_types):
        
        super(ClipClassification,self).__init__()
        
        self.device = device
        self.output_dim = output_dim
        

        #self.ctype_classifier = ImageConstraintTypeClassification(device, input_dim=clip_embedding_size, output_dim=num_constraint_types)
        self.ctype_classifier = ImageConstraintTypeClassification(device=device, output_dim=num_constraint_types)
        
        self.clip_model = CLIPModel.from_pretrained(checkpoint)
        #for param in self.clip_model.parameters():
        #    param.requires_grad = False
        self.clip_model.to(self.device)
        self.dropout = nn.Dropout(0.1)
        
        input_dim = clip_embedding_size*2 + ctype_embedding_size
        self.classifier = nn.Linear(input_dim, self.output_dim) # load and initialize weights
        self.classifier.to(self.device)


    # define a function that returns the tensor of a specific constraint type
    def get_tensor(constraint_type):
        return torch.flatten(constraint_types_tensor[constraint_type])
   
        
    def forward(self, input_ids=None, attention_mask=None, pixel_values=None, labels=None, constraint_type=None, image=None):
    #def forward(self, input_ids=None, attention_mask=None, pixel_values=None, labels=None):
               
        outputs = self.clip_model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        text_emb = outputs['text_embeds']    #batchx512
        image_emb = outputs['image_embeds']  #batchx512
               
        ctype_classification_output = self.ctype_classifier(image)
        v, predicted_constraint_type = torch.max(ctype_classification_output, 1)
                
        
        #constraint_type_list = constraint_type.tolist()
        constraint_type_list = predicted_constraint_type.tolist()
        constraint_type_embedding = list(map(ClipClassification.get_tensor, constraint_type_list))
        constraint_type_embedding = torch.stack([x for x in constraint_type_embedding], dim=0).to(self.device)
        #Add custom layers
   
        emb = torch.cat([text_emb,image_emb,constraint_type_embedding], dim=1)  
        
        emb = self.dropout(emb)


        logits = self.classifier(emb) # calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.output_dim), labels.view(-1))

        hidden = outputs['text_model_output']['last_hidden_state']
        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden,attentions=None)

In [6]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler()])

logging.info("Loading dataset")

dl_config = DownloadConfig(resume_download=True, num_proc=4)
import ipdb; ipdb.set_trace()
logging.info('Loading training data')
dataset_train = load_dataset('clevr-poc-loader.py',
                       name='clevr-poc',
                       download_config=dl_config,
                       split='train[:]')
logging.info('Loading validation data')
dataset_val = load_dataset('clevr-poc-loader.py',
                       name='clevr-poc',
                       download_config=dl_config,
                       split='validation[:]')
logging.info('Loading test data')
dataset_test = load_dataset('clevr-poc-loader.py',
                       name='clevr-poc',
                       download_config=dl_config,
                       split='test[:]')

logging.info('Dataset loaded')

dataset = DatasetDict({
  'train':dataset_train,
  'validation':dataset_val,
  'test':dataset_test
})

logging.info('Loading CLIP')
model_path = "openai/clip-vit-base-patch32"

#TODO convert CLEVR images offline
extractor = CLIPProcessor.from_pretrained(model_path)

def image_traform(e):
    convert_tensor = transforms.ToTensor()
    return convert_tensor(e)    

def transform_tokenize(e):
    e['image'] = [image.convert('RGB') for image in e['image']]
    
    """
    return extractor(text=e['question'],
                               images=e['image'],
                               truncation=True, 
                               #padding=True)
                               padding="max_length", max_length=42)
    """
    
    token = extractor(text=e['question'],
                               images=e['image'],
                               truncation=True, 
                               #padding=True)
                               padding="max_length", max_length=42)
    
    token['image'] = list(map(image_traform, e['image']))
    return token
    

    
    
logging.info('Transforming dataset')
dataset = dataset.map(transform_tokenize, batched=True, num_proc=1)


2022-08-10 09:03:31,072 [INFO] Loading dataset


--Return--
None
> [0;32m/tmp/ipykernel_1230681/1818158740.py[0m(6)[0;36m<cell line: 6>[0;34m()[0m
[0;32m      5 [0;31m[0mdl_config[0m [0;34m=[0m [0mDownloadConfig[0m[0;34m([0m[0mresume_download[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mnum_proc[0m[0;34m=[0m[0;36m4[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m[0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0mlogging[0m[0;34m.[0m[0minfo[0m[0;34m([0m[0;34m'Loading training data'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


2022-08-10 09:03:32,538 [INFO] Loading training data
2022-08-10 09:03:32,576 [INFO] Loading validation data
2022-08-10 09:03:32,590 [INFO] Loading test data
2022-08-10 09:03:32,607 [INFO] Dataset loaded
2022-08-10 09:03:32,608 [INFO] Loading CLIP
2022-08-10 09:03:37,034 [INFO] Transforming dataset
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [06:07<00:00, 30.65s/ba]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [04:18<00:00, 129.26s/ba]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [04:15<00:00, 127.93s/ba]


In [7]:
metric = load_metric('accuracy')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits[:-1], axis=-1)[0]
    return metric.compute(predictions=predictions, references=labels)

clip_embedding_size = 512
ctype_embedding_size = constraint_types_tensor[0].shape[0]*constraint_types_tensor[0].shape[1]
model = ClipClassification(device=device, 
                           checkpoint=model_path,
                           clip_embedding_size = clip_embedding_size,
                           ctype_embedding_size = ctype_embedding_size,
                           output_dim=total_number_of_classes, 
                           num_constraint_types=num_constraint_types)




In [8]:
logging.info("Creating trainer")
training_args = TrainingArguments("test_trainer",
                                    num_train_epochs=epochs,
                                    per_device_train_batch_size=train_batch_size,
                                    per_device_eval_batch_size=eval_batch_size,
                                    fp16=True if device == 'cuda' else False,
                                    dataloader_num_workers=num_workers ,
                                    dataloader_pin_memory=pin_memory,
                                    gradient_accumulation_steps=gradient_accumulation,
                                    save_strategy='no',
                                    evaluation_strategy='epoch',
                                    eval_steps=1)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)



2022-08-10 09:18:27,545 [INFO] Creating trainer


In [9]:
logging.info("Training model")
training_metrics = trainer.train()
logging.info(training_metrics)

2022-08-10 09:18:27,556 [INFO] Training model
The following columns in the training set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12000
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 7500


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.339358,0.3
2,2.470500,2.08992,0.314167
3,2.086500,1.919694,0.413333
4,1.875900,1.808126,0.399167
5,1.875900,1.722678,0.394167
6,1.742500,1.656103,0.413333
7,1.640900,1.596379,0.425833
8,1.556200,1.550413,0.4425
9,1.556200,1.510958,0.445833
10,1.488500,1.480878,0.455


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The following columns in the evaluation set don't have a corresponding argument in `ClipClassification.forward` and have been ignored: question. If question are not expected by `ClipClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 8


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



Training completed. Do not forget to share your model on huggingface.co/models =)


2022-08-10 15:11:48,931 [INFO] TrainOutput(global_step=7500, training_loss=1.5482572591145833, metrics={'train_runtime': 21201.3558, 'train_samples_per_second': 11.32, 'train_steps_per_second': 0.354, 'total_flos': 0.0, 'train_loss': 1.5482572591145833, 'epoch': 20.0})


In [None]:
predictions, labels, test_metrics = trainer.predict(dataset['test'])
y_true = dataset['test']['label']                                                                                                                 
y_pred = np.argmax(predictions[:-1], axis=-1)[0]                                                                                                    
confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=[i for i in range(total_number_of_classes)])                                                                                                            
print(confusion_matrix)
logging.info(test_metrics)