In [None]:
# Mount Google Drive into the Colab filesystem; the other cells read inputs
# from, and write outputs to, paths below this mount point.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [None]:
# install the necessary requirements
!pip3 install timm==0.4.12 fairscale==0.4.4
!pip3 install transformers
!pip install pycocoevalcap
!pip install ruamel.yaml

**This section is for running an inference instance of BLIP**



In [None]:
# Switch the working directory to the BLIP checkout on Drive
# (presumably so that `models.blip` can be imported by a later cell -- verify).
%cd /content/drive/MyDrive/My_Software_Projects/BLIP
# Install/upgrade ipdb quietly (-U upgrade, -qq minimal output) and import it
# for interactive debugging.
!pip install -Uqq ipdb
import ipdb

In [None]:
# Enable automatic post-mortem debugging: an uncaught exception in any later
# cell drops into the debugger instead of only printing a traceback.
%pdb on

In [None]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def load_demo_image(image_size,device,img_url=None):
    """Load one image and convert it into a normalized input batch.

    Args:
        image_size: Side length in pixels; the image is resized to
            (image_size, image_size).
        device: torch device the resulting tensor is moved to.
        img_url: Optional location of the image. May be a local file path or
            an http(s) URL. Defaults to the sample frame stored on Drive.

    Returns:
        A tensor of shape (1, 3, image_size, image_size) located on `device`.
    """
    if img_url is None:
        img_url = '/content/drive/MyDrive/My_Software_Projects/Input_Frames/time0_frame1.jpg'

    if str(img_url).startswith(('http://', 'https://')):
        raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    else:
        # A filesystem path has no URL scheme, so requests.get() would raise
        # instead of reading it; open local files directly with PIL.
        raw_image = Image.open(img_url).convert('RGB')

    # Show a 1/5-scale preview; clamp to 1px so tiny images do not produce a
    # zero-sized resize.
    w,h = raw_image.size
    display(raw_image.resize((max(1, w//5), max(1, h//5))))

    transform = transforms.Compose([
        transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # per-channel (R, G, B) mean and standard deviation
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    # unsqueeze(0) adds the leading batch dimension
    image = transform(raw_image).unsqueeze(0).to(device)
    return image

**Code to run an inference on the BLIP image captioning network from a video file**

In [None]:
from models.blip import blip_decoder
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import cv2

# Pick the compute device once: CUDA if available, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def load_demo_image(image_size,device, raw_image):
    """Convert one OpenCV frame into a normalized input batch.

    Args:
        image_size: Side length in pixels; the frame is resized to
            (image_size, image_size).
        device: torch device the resulting tensor is moved to.
        raw_image: Frame as returned by OpenCV, in BGR channel order (it is
            converted with cv2.COLOR_BGR2RGB before further processing).

    Returns:
        A tensor with a leading batch dimension of 1, resized to
        image_size x image_size, located on `device`.
    """
    # OpenCV delivers BGR; PIL and the normalization below expect RGB.
    rgb_frame = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)

    # Show a 1/5-scale preview; clamp to 1px so tiny frames do not produce a
    # zero-sized resize.
    w,h = pil_image.size
    display(pil_image.resize((max(1, w//5), max(1, h//5))))

    transform = transforms.Compose([
        transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # per-channel (R, G, B) mean and standard deviation
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    # unsqueeze(0) adds the leading batch dimension
    image = transform(pil_image).unsqueeze(0).to(device)
    return image



# ---- Locations on the mounted Drive ----
# video to caption
pathToInputVideoFile = '/content/drive/My Drive/My_Software_Projects/Input_Video/InputVideo2.mp4'
# folder that receives the saved frames
pathToOutputFramesFolder = '/content/drive/My Drive/My_Software_Projects/Output/Output_Caption/'
# text file that receives one caption line per frame
pathToOutputTxtFile = '/content/drive/My Drive/My_Software_Projects/Output_File/CaptionTextFile.txt'

# ---- Model settings ----
# side length (pixels) the frames are resized to before captioning
image_size = 384
# pretrained checkpoint to load
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'

# Open the caption log for writing (an existing file is truncated).
outputTextFile = open(pathToOutputTxtFile, mode='w')

# Build the captioning model, move it to the selected device and switch it to
# evaluation mode.
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model = model.to(device)
model.eval()
print("Model Successfully Loaded.")

# Caption every frame of the input video: each frame is saved as a JPEG named
# after its caption and one line per frame is appended to the caption log.
print("Loading Input Video File.....")
inputVideo = cv2.VideoCapture(pathToInputVideoFile)

try:
  # VideoCapture does not raise on a missing/unreadable file; without this
  # check the loop below would exit immediately and report success.
  if not inputVideo.isOpened():
    raise IOError("Could not open input video file: " + pathToInputVideoFile)
  print("Successfully Loaded input File")

  # Calculate the Frames per second (FPS)
  print("Calculating Frames Per Second...")
  fps = round(inputVideo.get(cv2.CAP_PROP_FPS))
  print('Fps = ' + str(fps))

  # frameNumber counts frames within the current second; timeStamp counts
  # whole seconds of video processed so far.
  frameNumber = 0
  timeStamp = 0

  print('Processing Frames...')

  while True:
    success, imageFrame = inputVideo.read()
    if not success:
      # no more frames (or a read error): stop processing
      break

    frameNumber += 1

    image = load_demo_image(image_size=image_size, device=device, raw_image=imageFrame)

    with torch.no_grad():
      # nucleus sampling
      # (beam search alternative, noted by the author as not working atm:
      #  model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5))
      caption = model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5)
    print('caption: '+caption[0])

    # NOTE(review): the file name is derived only from the caption, so two
    # frames that receive the same caption overwrite each other -- confirm
    # this is intended.
    outputFrameFilePath = pathToOutputFramesFolder + str(caption[0]) + '.jpg'

    # write the frame; cv2.imwrite reports failure via its return value
    frameSaved = cv2.imwrite(outputFrameFilePath, imageFrame)

    # write the captions
    outputTextFile.write("TimeStamp = " + str(timeStamp) +  " Frame = " + str(frameNumber) + " caption : " + str(caption[0]) + '\n' )

    if frameSaved:
      print('Time = ' + str(timeStamp) + ' secs Frame = ' + str(frameNumber) + ' saved successfully')
    else:
      print('Time = ' + str(timeStamp) + ' secs Frame = ' + str(frameNumber) + ' could not be saved to ' + outputFrameFilePath)

    # after every `fps` frames one second of video has passed: advance the
    # time stamp and restart the per-second frame counter
    if frameNumber == fps:
      timeStamp += 1
      frameNumber = 0

  print("Frame successfully Processed.")
finally:
  # Always release the video handle and close the caption log so buffered
  # captions are flushed to disk, even if processing fails part-way.
  inputVideo.release()
  outputTextFile.close()