# Extract VIT Features from video frames using Google Colab GPU

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import ViTFeatureExtractor, ViTModel
from transformers import TrainingArguments, Trainer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from PIL import Image
# import wandb
import numpy as np
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score
import pickle

In [2]:
# Unzip images into google drive

from google.colab import drive
drive.mount('/content/drive')

source_dir = '/content/drive/MyDrive/Hate_Video_Classification_Data/Dataset/Dataset_Images_non_hate_videos.rar'

destination_dir = '/content/drive/MyDrive/Hate_Video_Classification_Data/Dataset/'

Mounted at /content/drive


In [3]:
# !sudo apt-get install unrar
# !unrar x {source_dir} {destination_dir}

# drive.flush_and_unmount()

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [4]:
# !unzip -o {source_dir} -d {destination_dir}

# drive.flush_and_unmount()

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [5]:
# checking if all files were extracted
import os
destination_dir_1 = f'{destination_dir}/Dataset_Images_non_hate_videos/'
len([name for name in os.listdir(f"{destination_dir_1}")])

652

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using {device}")

using cuda


In [7]:
drive_project_root = "/content/drive/MyDrive/Hate_Video_Classification_Data/"
frames_path = f"{drive_project_root}/Dataset/Dataset_Images_non_hate_videos/"
VITF_landing_path = f"{drive_project_root}/Dataset/VITF/"
allVidList = os.listdir(frames_path)


In [8]:
# temp code to generate allVidList serially
# allVidList = [f"non_hate_video_{i}" for i in range(1, 653)]

In [9]:
minFrames = 100

In [10]:
def read_images(path:str, selected_folder:str) -> list[any]:
    """Function that samples and returns a list of PIL Image objects for each frame in a given video folder
    Inputs:
        path: path to dataset directory
        selected_folder: video_name
    Output:
        list of PIL Image objects for the video of length 100 (padded, if required)
    """
    X = []
    currFrameCount = 0
    videoFrameCount = len([name for name in os.listdir(os.path.join(path, selected_folder))])
    if videoFrameCount <= minFrames:
        for i in range(videoFrameCount):
            image = Image.open(os.path.join(path, selected_folder, 'frame_{}.jpg'.format(i)))

            X.append(image)
            currFrameCount += 1
            if(currFrameCount==minFrames):
                break
        paddingImage = Image.fromarray(np.zeros((100,100)), 'RGB')
        while currFrameCount < minFrames:
            X.append(paddingImage)
            currFrameCount+=1
        #X = torch.stack(X, dim=0)
    else:
        step = int(videoFrameCount/minFrames)
        for i in range(0,videoFrameCount,step):
            image = Image.open(os.path.join(path, selected_folder, 'frame_{}.jpg'.format(i)))
            X.append(image)
            currFrameCount += 1
            if(currFrameCount==minFrames):
                break
        paddingImage = Image.fromarray(np.zeros((100,100)), 'RGB')
        while currFrameCount < minFrames:
            X.append(paddingImage)
            currFrameCount+=1
        #X = torch.stack(X, dim=0)
    return X # all image arrays in a list



In [11]:
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

In [12]:
# for folder in tqdm(allVidList):
#     if os.path.exists(f"{VITF_landing_path}"+folder+"_vit.p")==True:
#         continue
#     try:
#         video = read_images(frames_path, folder)
#         inputs = feature_extractor(images=video, return_tensors="pt")
#         inputs.to(device)
#         outputs = model(**inputs)
#         last_hidden_states = outputs.last_hidden_state
#         video_features =[(last_hidden_states[i][0].detach().numpy()) for i in range(0,100)]
#         with open(f"{VITF_landing_path}"+folder+"_vit.p", 'wb') as fp:
#             pickle.dump(video_features,fp)
#         del video, inputs, outputs, last_hidden_states, video_features
#     except:
#         print(f"Could not read frames for {folder}")
#         pass


In [13]:
for folder in tqdm(allVidList):
    if os.path.exists(f"{VITF_landing_path}"+folder+"_vit.p")==True:
        continue
    video = read_images(frames_path, folder)
    inputs = feature_extractor(images=video, return_tensors="pt")
    inputs.to(device)
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    video_features =[(last_hidden_states[i][0].detach().cpu().numpy()) for i in range(0,100)]
    with open(f"{VITF_landing_path}"+folder+"_vit.p", 'wb') as fp:
        pickle.dump(video_features,fp)
    del video, inputs, outputs, last_hidden_states, video_features


  0%|          | 0/652 [00:00<?, ?it/s]

In [14]:
drive.flush_and_unmount()