In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


## Video transcription
@forusone (shins777@gmail.com)

### Video transcription and translation

This Colab notebook uses YouTube videos to test Gemini's transcription and translation.

### Install Vertex AI SDK for Python

In [1]:
!pip install --upgrade --quiet \
    "google-cloud-aiplatform[langchain,reasoningengine]" \
    google-cloud-speech \
    google-cloud-storage \
    google-cloud-videointelligence

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.2/324.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.8/269.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.6/100.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.9/434.9 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id="ai-hangsik")

!gcloud config set project ai-hangsik


Updated property [core/project].


### Initial set up

In [3]:
import base64

import vertexai
from vertexai.generative_models import GenerativeModel

# Google Cloud project and region passed to vertexai.init() below.
PROJECT_ID = "ai-hangsik"
LOCATION = "us-central1"

# Gemini model name used to build the shared model handle.
# MODEL = "gemini-pro-experimental"
MODEL = "gemini-1.5-pro-002"

# Bind the SDK to the project/region, then create the module-level `model`
# object that generate() calls.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel(MODEL)

## Transcript and translation

### Helper functions

In [4]:

def generate(prompt:str,
             video_uri:str,
             response_schema)->str:
  """
  Send a prompt together with a video to Gemini and return the response text.

  The request is made through the module-level `model` object.

  Args:
    prompt: Instruction text for the transcription or translation task.
    video_uri: URI of the video to analyze (this notebook passes YouTube
      URLs or gs:// paths).
    response_schema: Schema handed to GenerationConfig to constrain the
      structure of the generated output.

  Returns:
    The `.text` of the model response. JSON output is requested via
    response_mime_type="application/json".

  """

  from vertexai.generative_models import (
      GenerationConfig,
      Part,
      SafetySetting,
  )

  # Decoding parameters plus the structured-output (JSON + schema) request.
  decoding_config = GenerationConfig(
      response_mime_type="application/json",
      response_schema=response_schema,
      max_output_tokens=8192,
      temperature=1,
      top_p=0.95,
  )

  # Safety settings: each listed harm category gets its threshold set to OFF.
  harm_categories = (
      SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
      SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
      SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
      SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
  )
  safety_overrides = [
      SafetySetting(
          category=harm_category,
          threshold=SafetySetting.HarmBlockThreshold.OFF,
      )
      for harm_category in harm_categories
  ]

  # Wrap the video reference as a request part with a generic video MIME type.
  video_part = Part.from_uri(uri=video_uri, mime_type="video/*")

  # Single, non-streaming request: prompt first, then the video part.
  response = model.generate_content(
      [prompt, video_part],
      generation_config=decoding_config,
      safety_settings=safety_overrides,
      stream=False,
  )
  return response.text



### YT Videos to transcribe and translate

In [32]:

# Named switch for the video source: True selects a YouTube URL, False selects
# an object stored in Google Cloud Storage.
USE_YOUTUBE_SOURCE = True

if USE_YOUTUBE_SOURCE: # Videos on the YouTube platform

  # Exactly one candidate is left uncommented so it is obvious which video is
  # analyzed; uncomment a different line (and comment the active one) to switch.
  # video_uri="https://www.youtube.com/watch?v=PNTCM7cbrsc" # Small Talk
  # video_uri="https://www.youtube.com/watch?v=wrh-4GZN3aE" # Dad Won't Let Daughter Marry For Love
  # video_uri = "https://www.youtube.com/watch?v=CYyUuIXzGgI" # Game Theory Scene | 21(2008)
  # video_uri = "https://www.youtube.com/watch?v=OoUVSHDbAeM" # Stephen Hawking Discovers The Black Hole Theory
  # video_uri = "https://www.youtube.com/watch?v=Gth02sjU4wU" # [Short film] A (2017)
  # video_uri = "https://www.youtube.com/watch?v=lSeBy_lqs28" # "What kind of nonsense is this" long version (original video)
  video_uri = "https://www.youtube.com/watch?v=IuiCRoz2gMY" # "Who is it? Who made that coughing sound?" -Gung Ye-
else: # Videos in GCS
  # video_uri="gs://tests_nov25_2024/translation/small_talk.mp4"  # Small Talk
  video_uri="gs://tests_nov25_2024/translation/Stephen Hawking Discovers The Black Hole Theory | The Theory Of Everything (2014) | Screen Bites.mp4" # Stephen Hawking Discovers The Black Hole Theory




### Video transcription

In [33]:
import json

# Response schema that constrains Gemini's output: an array of scene objects,
# each carrying timing fields and a list of {speaker, transcript} entries.
response_schema = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "location": {"type": "STRING"},
            "start_time": {"type": "STRING"},
            "end_time": {"type": "STRING"},
            "elapsed_time": {"type": "STRING"},
            "transcription": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "speaker": {"type": "STRING"},
                        "transcript": {"type": "STRING"},
                    },
                },
            },
        },
        "required": ["start_time", "end_time", "transcription"],
    },
}

# Korean-language instructions for the transcription task. In summary: output
# every line of dialogue without omission, split the output by the location
# that distinguishes each scene, separate speakers by voice, describe sound
# effects and emotions in parentheses, and keep connected sentences together.
prompt_transcript = """
당신을 비디오를 분석해서 transcript를 작성해야 하는 AI Assistant 입니다.
아래 가이드라인에 맞게 transcription을 작성해주세요.

1. 첨부된 비디오를 분석하여 아래와 같은 포맷으로 반드시 모든 대화 내용을 하나도 빠짐없이 출력해주세요.
2. 결과 출력 단위는 비디오 내의 장면이 구분되는 특정 장소를 기준으로 나누어서 출력해주세요.
3. 목소리를 기반으로 화자(speaker)를 정확하게 분리해서 출력해주세요.
4. 목소리외에 다양한 효과음, 감정표현은 괄호를 사용해서 반드시 자세히 표현해주세요.
5. 반드시 연결된 문장은 하나로 표현해주세요.

"""

# Ask Gemini for the transcript, then parse the returned JSON text.
json_str = generate(prompt_transcript, video_uri, response_schema)
json_transcript = json.loads(json_str)


In [34]:
# Print every transcribed scene, each followed by a 20-dash divider line.
for scene in json_transcript:
  print(scene, "-"*20, sep="\n")

{'location': '궁궐 안', 'start_time': '00:00', 'end_time': '00:03', 'elapsed_time': '3초', 'transcription': [{'speaker': '황제', 'transcript': '누구인가? 지금 누가 기침 소리를 내었어?'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:04', 'end_time': '00:08', 'elapsed_time': '4초', 'transcription': [{'speaker': '신하', 'transcript': '(기침) 신이옵니다.폐하'}, {'speaker': '신하', 'transcript': '(기침)'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:09', 'end_time': '00:20', 'elapsed_time': '11초', 'transcription': [{'speaker': '황제', 'transcript': '참으로 딱하구나. 내가 가만히 보니 네놈 머릿속에는 마구니가 가득 찼구나. 여봐라 내군은 들어라'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:21', 'end_time': '00:28', 'elapsed_time': '7초', 'transcription': [{'speaker': '내군들', 'transcript': '예'}, {'speaker': '황제', 'transcript': '저자의 머릿속에는 마구니가 가득하다. 그 마구니를 때려 죽여라!'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:29', 'end_time': '00:33', 'elapsed_time': '4초', 'transcription': [{'speaker': '신하1', 'transcript': '

### Video Translation

In [35]:

# Serialize the transcript as real JSON before embedding it in the prompt.
# Interpolating the list directly would insert its Python repr (single-quoted
# keys and strings), while the instructions below refer to the attachment as
# "json". ensure_ascii=False keeps the Korean text readable instead of
# \uXXXX escapes.
transcript_json = json.dumps(json_transcript, ensure_ascii=False)

# Korean-language instructions for the translation task. In summary: translate
# all dialogue into English based on the source-language document, translate
# only the `transcription` part of the attached JSON, keep the JSON structure,
# describe sound effects and emotions in parentheses, and keep connected
# sentences together.
prompt_translation = f"""
당신을 원본 언어로 된 문서를 기반으로 비디오를 분석해서 영어로 번역을 해야 하는 AI Assistant 입니다.
반드시 비디오의 모든 장면에 나타난 모든 대화 내용을 아래 가이드라인에 맞게 정확하게 번역 해주세요.

1. 번역을 할 때는 첨부된 json 내에서 transcription 부분만 해주세요.
2. 번역결과는 할 때는 json구조를 그대로 유지해서 표현해주세요.
3. 목소리외에 다양한 효과음, 감정표현은 괄호를 사용해서 반드시 자세히 표현해주세요.
4. 반드시 연결된 문장은 하나로 표현해주세요.

# 원본 문서 : {transcript_json}

"""

# The translation reuses the response schema defined for the transcription step.
json_str = generate(prompt_translation,video_uri, response_schema)
json_translation = json.loads(json_str)


In [36]:
# Print each source scene directly above its translated counterpart, then a
# 20-dash divider. zip() stops at the shorter of the two lists.
for source_scene, translated_scene in zip(json_transcript, json_translation):
  print(source_scene, translated_scene, "-"*20, sep="\n")


{'location': '궁궐 안', 'start_time': '00:00', 'end_time': '00:03', 'elapsed_time': '3초', 'transcription': [{'speaker': '황제', 'transcript': '누구인가? 지금 누가 기침 소리를 내었어?'}]}
{'location': 'Inside the Palace', 'start_time': '00:00', 'end_time': '00:03', 'elapsed_time': '3 seconds', 'transcription': [{'speaker': 'Emperor', 'transcript': 'Who is it? Who just coughed?'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:04', 'end_time': '00:08', 'elapsed_time': '4초', 'transcription': [{'speaker': '신하', 'transcript': '(기침) 신이옵니다.폐하'}, {'speaker': '신하', 'transcript': '(기침)'}]}
{'location': 'Inside the Palace', 'start_time': '00:04', 'end_time': '00:08', 'elapsed_time': '4 seconds', 'transcription': [{'speaker': 'Subject', 'transcript': '(Coughing) It is I, your Majesty. (Coughing)'}]}
--------------------
{'location': '궁궐 안', 'start_time': '00:09', 'end_time': '00:20', 'elapsed_time': '11초', 'transcription': [{'speaker': '황제', 'transcript': '참으로 딱하구나. 내가 가만히 보니 네놈 머릿속에는 마구니가 가득 찼구나. 여봐라 내군

### Compare transcript and translation

In [37]:
from IPython.display import display, Markdown
import pandas as pd

# Collect one record per aligned utterance and build the DataFrame once at the
# end. Growing a frame with pd.concat inside the loop copies all accumulated
# rows on every iteration.
comparison_rows = []

for transcript, translation in zip( json_transcript, json_translation):

  # Utterances are paired by position; zip() stops at the shorter list, so an
  # utterance without a positional counterpart is not included.
  for script, trans in zip(transcript['transcription'], translation['transcription']):
    # Keep the source fields as they are and add the translated text beside them.
    comparison_rows.append({**script, 'translation': trans['transcript']})

df = pd.DataFrame(comparison_rows)

df

Unnamed: 0,speaker,transcript,translation
0,황제,누구인가? 지금 누가 기침 소리를 내었어?,Who is it? Who just coughed?
1,신하,(기침) 신이옵니다.폐하,"(Coughing) It is I, your Majesty. (Coughing)"
2,황제,참으로 딱하구나. 내가 가만히 보니 네놈 머릿속에는 마구니가 가득 찼구나. 여봐라 ...,How very unfortunate. I see your head is full ...
3,내군들,예,Yes!
4,황제,저자의 머릿속에는 마구니가 가득하다. 그 마구니를 때려 죽여라!,That man's head is full of demons. Beat those ...
5,신하1,(움찔),(Flinching)
6,신하2,폐하,Your Majesty.
7,황제,연부자은 뭘 하는가? 저자를 쳐라!,"Commander Yeon, what are you doing? Hit that man!"
8,근부장,폐하,Your Majesty!
9,황제,저자를 죽이라고 하였느니라! 저놈은 마구니다. 저놈을 어서 처 죽여라! 저놈을! 근...,I told you to kill him! That man is a demon. K...


## Validating the translation

### Helper function for validation

### Prompt to recheck the previous translation

In [38]:
# Serialize the earlier translation as real JSON before embedding it in the
# prompt. Interpolating the list directly would insert its Python repr, while
# the instructions below refer to the attachment as "json". ensure_ascii=False
# keeps any non-ASCII text readable.
translation_json = json.dumps(json_translation, ensure_ascii=False)

# Korean-language instructions for the re-check task. In summary: translate
# into English once more with the previously translated document as a
# reference, translate only the `transcription` part of the attached JSON,
# describe sound effects and emotions in parentheses, and put the existing
# translation in 'original_translation' and the revised one in
# 'checked_translation'.
prompt_check = f"""
당신을 비디오를 분석해서 기존에 번역된 문서(translated document)를 참조하여 한번 더 정확하게 영어로 번역을 해야 하는 AI Assistant 입니다.
반드시 비디오의 모든 장면에 나타난 모든 대화 내용을 아래 가이드라인에 맞게 정확하게 번역 해주세요.

1. 번역을 할 때는 첨부된 json 내에서 transcription 부분만 해주세요.
2. 목소리외에 다양한 효과음, 감정표현은 괄호를 사용해서 반드시 자세히 표현해주세요.
3. 기존 번역내용은 'original_translation' 에 표현해주고, 변경된 내용은 'checked_translation' 에 표현해주세요.

translated document : {translation_json}

"""

# Response schema for the re-check output. It is stored under its own name so
# the `response_schema` used by the transcription/translation cells is not
# overwritten; re-running those cells after this one keeps using their schema.
check_response_schema = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "location": { "type": "STRING",},
            "start_time": { "type": "STRING",},
            "end_time": { "type": "STRING",},
            "elapsed_time": { "type": "STRING",},
            "transcription": {
                "type": "ARRAY",
                "items" : {
                  "type": "OBJECT",
                  "properties": {
                    "speaker": { "type": "STRING",},
                    "original_translation": { "type": "STRING",},  # original translation
                    "checked_translation": { "type": "STRING",},   # re-checked translation
                  }
                }
            },
        },
        # "transcription" is required because the comparison cell that follows
        # indexes it on every returned element.
        "required": ["start_time","end_time","elapsed_time","transcription"],
    },
}

# Ask Gemini to re-check the translation, then parse the returned JSON text.
json_checked_translation_str = generate(prompt_check, video_uri, check_response_schema)
json_checked_translation = json.loads(json_checked_translation_str)


In [39]:
# Print every re-checked scene, each followed by a 20-dash divider line.
for checked_scene in json_checked_translation:
  print(checked_scene, "-"*20, sep="\n")


{'location': 'Inside the Palace', 'start_time': '00:00', 'end_time': '00:03', 'elapsed_time': '3 seconds', 'transcription': [{'speaker': 'Emperor', 'original_translation': 'Who is it? Who just coughed?', 'checked_translation': 'Who is it? Who just coughed?'}]}
--------------------
{'location': 'Inside the Palace', 'start_time': '00:04', 'end_time': '00:08', 'elapsed_time': '4 seconds', 'transcription': [{'speaker': 'Subject', 'original_translation': '(Coughing) It is I, your Majesty. (Coughing)', 'checked_translation': '(Coughing) It is I, your Majesty. (Coughing)'}]}
--------------------
{'location': 'Inside the Palace', 'start_time': '00:09', 'end_time': '00:20', 'elapsed_time': '11 seconds', 'transcription': [{'speaker': 'Emperor', 'original_translation': 'How very unfortunate. I see your head is full of demons. Guards, listen to me!', 'checked_translation': 'How unfortunate. I see your head is full of demons. Guards, listen!'}]}
--------------------
{'location': 'Inside the Palace'

In [40]:
from IPython.display import display, Markdown
import pandas as pd

# Collect one record per aligned utterance and build the DataFrame once at the
# end. Growing a frame with pd.concat inside the loop copies all accumulated
# rows on every iteration.
checked_rows = []

for transcript, translation in zip( json_transcript, json_checked_translation):

  # Utterances are paired by position; zip() stops at the shorter list, so an
  # utterance without a positional counterpart is not included.
  for script, trans in zip(transcript['transcription'], translation['transcription']):
    # Source fields first, then the original and the re-checked translation.
    checked_rows.append({
        **script,
        'org_translation': trans['original_translation'],
        'chk_translation': trans['checked_translation'],
    })

df2 = pd.DataFrame(checked_rows)


df2

Unnamed: 0,speaker,transcript,org_translation,chk_translation
0,황제,누구인가? 지금 누가 기침 소리를 내었어?,Who is it? Who just coughed?,Who is it? Who just coughed?
1,신하,(기침) 신이옵니다.폐하,"(Coughing) It is I, your Majesty. (Coughing)","(Coughing) It is I, your Majesty. (Coughing)"
2,황제,참으로 딱하구나. 내가 가만히 보니 네놈 머릿속에는 마구니가 가득 찼구나. 여봐라 ...,How very unfortunate. I see your head is full ...,How unfortunate. I see your head is full of de...
3,내군들,예,Yes!,Yes!
4,황제,저자의 머릿속에는 마구니가 가득하다. 그 마구니를 때려 죽여라!,That man's head is full of demons. Beat those ...,That man's head is full of demons. Beat those ...
5,신하1,(움찔),(Flinching),(Flinching)
6,신하2,폐하,Your Majesty.,Your Majesty.
7,황제,연부자은 뭘 하는가? 저자를 쳐라!,"Commander Yeon, what are you doing? Hit that man!","Commander Yeon, what are you doing? Hit that man!"
8,근부장,폐하,Your Majesty!,Your Majesty!
9,황제,저자를 죽이라고 하였느니라! 저놈은 마구니다. 저놈을 어서 처 죽여라! 저놈을! 근...,I told you to kill him! That man is a demon. K...,I told you to kill him! That man is a demon. K...
