---


### ***Initialisation cells (run these cells to install the necessary libraries and mount the drive)***


---

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder and the input/output files on the drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the lens project folder on the mounted drive.
%cd /content/drive/MyDrive/My_Software_Projects/lens
# Install/upgrade the ipdb debugger with minimal output, then import it.
!pip install -Uqq ipdb
import ipdb

In [None]:
# Turn on automatic post-mortem debugging for uncaught exceptions.
# NOTE(review): this is a debugging aid; consider removing it for unattended runs.
%pdb on

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): no versions are pinned, so a fresh runtime may pull different
# releases than the ones this notebook was developed against — consider pinning.

!pip install Pillow datasets transformers llm-lens torch
# Speech-to-text package (imported later as `speech_recognition`).
!pip install SpeechRecognition
# Video/audio clipping package (imported later as `moviepy`).
!pip install moviepy

---


### ***Utility code that needs to be run before executing the code in the main section***


---

#### Code to load images and preprocess them

In [None]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def loadImage(device, rawImage):
    """
    Convert an OpenCV frame to a PIL RGB image and display a small preview of it.

    device: unused by this function; kept so existing callers that pass it keep working
    rawImage: frame in BGR channel order (as produced by OpenCV)

    Returns the full-size PIL image (not the preview).
    Raises ValueError if rawImage is None.
    """
    if rawImage is None:
        raise ValueError("loadImage received no frame (rawImage is None)")

    # Imported here so the function does not depend on some other cell having
    # imported cv2 before it is called.
    import cv2

    rgbFrame = cv2.cvtColor(rawImage, cv2.COLOR_BGR2RGB)
    pilImage = Image.fromarray(rgbFrame)

    # Show a 1/5-scale preview. Each side is clamped to at least 1 pixel
    # because a frame smaller than 5 px per side would otherwise be resized
    # to a zero-sized image, which PIL rejects.
    w, h = pilImage.size
    display(pilImage.resize((max(1, w // 5), max(1, h // 5))))

    return pilImage

#### Make-directory utility function

In [None]:
import os

def makeDirectory(path):
    """
    Create the directory at `path` (including missing parents) if it is not there yet.

    path: directory path to create

    OS-level failures are not raised: the outcome is printed instead, so the
    notebook keeps running (best-effort, as before).
    """
    # os.makedirs(..., exist_ok=True) does not raise for an existing directory,
    # so check up front to report "already exists" vs "created" accurately.
    alreadyThere = os.path.isdir(path)

    try:
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        # Reached for real failures (permissions, a file in the way, ...),
        # not for a directory that already exists.
        print("Could not create directory '%s': %s" % (path, error))
        return

    if alreadyThere:
        print("Directory '%s' already exists" % path)
    else:
        print("Directory '%s' created successfully" % path)

#### Util function to write CSV File

In [None]:
import csv

def csvWriteRow(pathToCSVFile, rowData):

  """
  Util function that appends one row to a CSV File (the file is created if missing)

  pathToCSVFile: path to the CSV File
  rowData: Array data to be written, one element per column

  The file is written as UTF-8. A row that still cannot be encoded is skipped
  and reported on stdout instead of raising.
  """

  # Explicit UTF-8: with the platform default encoding, non-ASCII text in a
  # row could fail to encode, and the row used to be dropped without notice.
  with open(pathToCSVFile, 'a', newline='', encoding='utf-8') as csvFile:

    try:

      csv.writer(csvFile).writerow(rowData)

    except UnicodeEncodeError as error:

      # Still possible for text UTF-8 cannot represent (e.g. lone surrogates);
      # keep the best-effort behaviour but make the skipped row visible.
      print("Skipped CSV row that could not be encoded: %s" % error)

#### Function to convert the audio of a video file to text

In [None]:
import speech_recognition as sr
import moviepy.editor as mp
from moviepy.editor import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip


def videoToText(pathToVideo, intermediateFolder="/content/drive/My Drive/My_Software_Projects/Intermediate_Folder/"):
  """
  Util Function that converts video to text using the Google Speech to text API

  The video is processed one second at a time. The chunk for second i covers
  up to the two preceding seconds as well, i.e. [max(0, i - 2), i + 1].

  :param pathToVideo : Path to the video file
  :param intermediateFolder : Folder that receives the temporary cut{N}.mp4 and
                              converted{N}.wav files (created if missing)
  :return: List containing the audiototext per second; 'No transcript' where
           a chunk has no audio track or the speech could not be understood
  """
  import os

  print ("Converting speech to Text from Video..")

  # Only the duration is needed from the full video, so close it right away
  # instead of leaving the reader open for the whole run.
  fullClip = VideoFileClip(pathToVideo)
  try:
    numSecondsVideo = int(fullClip.duration)
  finally:
    fullClip.close()
  print("The video is {} seconds".format(numSecondsVideo))

  os.makedirs(intermediateFolder, exist_ok=True)

  listText = []
  for i in range(numSecondsVideo):

    print("time = " + str(i))

    cutPath = os.path.join(intermediateFolder, "cut{}.mp4".format(i + 1))
    wavPath = os.path.join(intermediateFolder, "converted{}.wav".format(i + 1))

    # clipping video into chunks of small sections to run through the speech API.
    # The start is clamped at 0: the previous formula produced -1 for i == 1.
    startSecond = max(0, i - 2)
    ffmpeg_extract_subclip(pathToVideo, startSecond, i + 1, targetname=cutPath)

    # separating the audio from video
    chunkClip = mp.VideoFileClip(cutPath)
    try:
      if chunkClip.audio is None:
        # chunk without an audio track: nothing to transcribe
        listText.append('No transcript')
        continue
      chunkClip.audio.write_audiofile(wavPath)
    finally:
      chunkClip.close()

    # A fresh recognizer per chunk, so the ambient-noise calibration of one
    # chunk does not carry over into the next.
    r = sr.Recognizer()
    with sr.AudioFile(wavPath) as source:
      r.adjust_for_ambient_noise(source)
      audioFile = r.record(source)

    try:

      # feeding the audio to the google speech to text API
      result = r.recognize_google(audioFile)

    except sr.UnknownValueError:

      # store a placeholder if the audio has not enough data to convert to text
      result = 'No transcript'

    listText.append(result)

  return listText

---


### ***Main code section to run inference of LENS in various modes and to generate the dataset. (All the code cells above need to be executed before executing the cells below.)***


---

#### Code to run an inference on the Lens image Visual descriptions network from a video file (mode = all)

In [None]:
from PIL import Image
import requests
import torch
from lens import Lens, LensProcessor
import cv2
import time
import csv

# question to be Asked (Text Prompt)
question = 'What is the facial expression of the person?'

# Header row of the output CSV; every processed frame appends one row with these columns
csvWrite = ['Timestamp', 'FrameID', 'FileName', 'Tags', 'Attributes', 'Caption', 'IntensiveCaptions', 'Prompts']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the run id is the time of day only, so two runs at the same
# clock time on different days would share output paths — consider adding the date.
currentTime = time.strftime("%H_%M_%S")

#path to input Video File
pathToInputVideoFile = '/content/drive/My Drive/My_Software_Projects/Input_Video/InputVideo2.mp4'

#path to the output Folder
pathToOutputFramesFolder = '/content/drive/My Drive/My_Software_Projects/Output/Output_Dataset_' + str(currentTime) + '/'
makeDirectory(pathToOutputFramesFolder)

# path to the output text / CSV files; the folder is created up front because
# opening the CSV for append fails if its parent folder does not exist
pathToOutputFileFolder = '/content/drive/My Drive/My_Software_Projects/Output_File/'
makeDirectory(pathToOutputFileFolder)
pathToOutputTxtFile = pathToOutputFileFolder + 'CaptionTextFile_LENS_' + str(currentTime) + '.txt'
pathToOutputCSVFile = pathToOutputFileFolder + 'CaptionCSVFile_LENS_' + str(currentTime) + '.csv'

csvWriteRow(pathToCSVFile=pathToOutputCSVFile, rowData=csvWrite)

# Audio transcription (videoToText) is currently not part of this run.

print("Loading the Lens Model...")
lens = Lens()
processor = LensProcessor()
print("Lens Model Successfully Loaded.")

print("Loading Input Video File.....")
inputVideo = cv2.VideoCapture(pathToInputVideoFile)
# VideoCapture does not raise for a missing/unreadable file, so check explicitly
# instead of reporting success and then silently processing zero frames.
if not inputVideo.isOpened():
  inputVideo.release()
  raise IOError("Could not open input video: " + pathToInputVideoFile)
print("Successfully Loaded input File")

try:
  # Calculate the Frames per second (FPS)
  print("Calculating Frames Per Second...")
  fps = round(inputVideo.get(cv2.CAP_PROP_FPS))
  print('Fps = ' + str(fps))

  # frameNumber counts frames within the current second (1..fps);
  # timeStamp counts whole seconds derived from that frame count.
  frameNumber = 0
  timeStamp = 0

  print('Processing Frames...')

  while True:
    # Processing Frames
    success, imageFrame = inputVideo.read()

    if not success:
      # no more frames (or a read error): stop
      break

    # increase the frame by 1
    frameNumber += 1

    image = loadImage(device=device, rawImage=imageFrame)

    # inference only, no gradients needed
    with torch.no_grad():
      samples = processor([image],[question])
      output = lens(samples)

    prompts = output["prompts"]
    tags = output["tags"]
    attributes = output["attributes"]
    caption = output["caption"]
    intensiveCaptions = output["intensive_captions"]

    outputFrameFileName = 'time_' + str(timeStamp) + 'frame_' + str(frameNumber) + '.jpg'
    outputFrameFilePath = pathToOutputFramesFolder + outputFrameFileName

    # write the frame; imwrite reports failure through its return value
    if not cv2.imwrite(outputFrameFilePath, imageFrame):
      print('Warning: could not write frame to ' + outputFrameFilePath)

    # append one row for this frame to the CSV file
    csvWrite = [str(timeStamp), str(frameNumber), outputFrameFileName, str(tags), str(attributes), str(caption), str(intensiveCaptions), str(prompts)]
    csvWriteRow(pathToCSVFile=pathToOutputCSVFile, rowData=csvWrite)

    print('Time = ' + str(timeStamp) + ' secs Frame = ' + str(frameNumber) + ' saved successfully')

    # every time you hit the last frame increase the time by 1 and reset the frames to 0
    if frameNumber == fps:

      timeStamp += 1
      frameNumber = 0

finally:
  # release the capture even if inference or writing failed part-way
  inputVideo.release()

print("Frame successfully Processed.")

#### Code to run an inference on the Lens image Visual descriptions network and then feed into the Frozen LLM (mode = all)

In [None]:
from PIL import Image
import requests
import torch
from lens import Lens, LensProcessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import cv2
import time
import csv
import re

# question to be Asked (Text Prompt)
question = 'What is the emotion expressed?'

# Header row of the output CSV; every processed frame appends one row with these columns
csvWrite = ['Timestamp', 'FrameID', 'FileName', 'Tags', 'Attributes', 'Caption', 'IntensiveCaptions', 'Prompts', 'LLM Output']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the run id is the time of day only, so two runs at the same
# clock time on different days would share output paths — consider adding the date.
currentTime = time.strftime("%H_%M_%S")

#path to input Video File
pathToInputVideoFile = '/content/drive/My Drive/My_Software_Projects/Input_Video/InputVideo2.mp4'

#path to the output Folder
pathToOutputFramesFolder = '/content/drive/My Drive/My_Software_Projects/Output/Output_Dataset_' + str(currentTime) + '/'
makeDirectory(pathToOutputFramesFolder)

# path to the output text / CSV files; the folder is created up front because
# opening the CSV for append fails if its parent folder does not exist
pathToOutputFileFolder = '/content/drive/My Drive/My_Software_Projects/Output_File/'
makeDirectory(pathToOutputFileFolder)
pathToOutputTxtFile = pathToOutputFileFolder + 'CaptionTextFile_LENS_' + str(currentTime) + '.txt'
pathToOutputCSVFile = pathToOutputFileFolder + 'CaptionCSVFile_LENS_' + str(currentTime) + '.csv'

csvWriteRow(pathToCSVFile=pathToOutputCSVFile, rowData=csvWrite)

# regex to remove the <pad> and </s> from the output of the LLMs
# (raw strings: "\s" in a normal string literal is an invalid escape sequence)
remWord1 = re.compile(r'(\s*)<pad>(\s*)')
remWord2 = re.compile(r'(\s*)</s>(\s*)')

# Audio transcription (videoToText) is currently not part of this run.

print("Loading the Lens Model...")
lens = Lens()
processor = LensProcessor()
print("Lens Model Successfully Loaded.")

# Load the frozen LLM and its tokenizer once; they do not change between
# frames, so reloading them inside the frame loop only costs time.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small",truncation_side = 'left',padding = True)
LLMModel = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

print("Loading Input Video File.....")
inputVideo = cv2.VideoCapture(pathToInputVideoFile)
# VideoCapture does not raise for a missing/unreadable file, so check explicitly
# instead of reporting success and then silently processing zero frames.
if not inputVideo.isOpened():
  inputVideo.release()
  raise IOError("Could not open input video: " + pathToInputVideoFile)
print("Successfully Loaded input File")

try:
  # Calculate the Frames per second (FPS)
  print("Calculating Frames Per Second...")
  fps = round(inputVideo.get(cv2.CAP_PROP_FPS))
  print('Fps = ' + str(fps))

  # frameNumber counts frames within the current second (1..fps);
  # timeStamp counts whole seconds derived from that frame count.
  frameNumber = 0
  timeStamp = 0

  print('Processing Frames...')

  while True:
    # Processing Frames
    success, imageFrame = inputVideo.read()

    if not success:
      # no more frames (or a read error): stop
      break

    # increase the frame by 1
    frameNumber += 1

    image = loadImage(device=device, rawImage=imageFrame)

    # inference only, no gradients needed
    with torch.no_grad():

      # infering the initial vision models like BLIP, CLIP
      samples = processor([image],[question])
      output = lens(samples)

      # feeding the output of the vision models to a frozen LLM
      inputIds = tokenizer(samples["prompts"], return_tensors="pt").input_ids
      outputs = LLMModel.generate(inputIds)

    # decode the first generated sequence and strip the <pad> and </s> markers
    LLMoutput = str(tokenizer.decode(outputs[0]))
    LLMoutput = remWord1.sub('', LLMoutput)
    LLMoutput = remWord2.sub('', LLMoutput)

    print(LLMoutput)

    prompts = output["prompts"]
    tags = output["tags"]
    attributes = output["attributes"]
    caption = output["caption"]
    intensiveCaptions = output["intensive_captions"]

    outputFrameFileName = 'time_' + str(timeStamp) + 'frame_' + str(frameNumber) + '.jpg'
    outputFrameFilePath = pathToOutputFramesFolder + outputFrameFileName

    # write the frame; imwrite reports failure through its return value
    if not cv2.imwrite(outputFrameFilePath, imageFrame):
      print('Warning: could not write frame to ' + outputFrameFilePath)

    # append one row for this frame to the CSV file
    csvWrite = [str(timeStamp), str(frameNumber), outputFrameFileName, str(tags), str(attributes), str(caption), str(intensiveCaptions), str(prompts), str(LLMoutput)]
    csvWriteRow(pathToCSVFile=pathToOutputCSVFile, rowData=csvWrite)

    print('Time = ' + str(timeStamp) + ' secs Frame = ' + str(frameNumber) + ' saved successfully')

    # every time you hit the last frame increase the time by 1 and reset the frames to 0
    if frameNumber == fps:

      timeStamp += 1
      frameNumber = 0

finally:
  # release the capture even if inference or writing failed part-way
  inputVideo.release()

print("Frame successfully Processed.")

Output hidden; open in https://colab.research.google.com to view.