In [39]:
import whisper
import moviepy.editor as mp
import pandas as pd
import os
from googletrans import Translator

In [40]:
# Folder that holds the input videos.
video_folder = 'videos'
# Keep regular files only, in sorted name order. os.listdir() also returns
# sub-directories (for example a `.ipynb_checkpoints` folder), which are not
# videos and must not be handed to the processing step.
files = sorted(
    name for name in os.listdir(video_folder)
    if os.path.isfile(os.path.join(video_folder, name))
)

In [41]:
# On the first run, create the results CSV with just its header row.
if not os.path.exists('video_processed.csv'):
    df = pd.DataFrame(columns=['Content ID', 'English Transcription', 'Original Language Transcription'])
    df.to_csv('video_processed.csv')

# Load what has been processed so far. Content ID is forced to text: with the
# default dtype inference an all-numeric ID such as "00123" would be read back
# as the integer 123 and could never match a file name again.
df = pd.read_csv('video_processed.csv', dtype={'Content ID': str})

In [42]:
# Whisper speech-to-text model ('medium' checkpoint), shared by every video.
model = whisper.load_model('medium')
# googletrans client used to translate non-English transcriptions.
translator = Translator()

In [43]:
def format_data(english_result, original_result, video_id):
    '''
    Bundle the results for one video into a single record.

    The returned dict has three keys, in this order: 'Content ID' (video_id),
    'English Transcription' (english_result) and
    'Original Language Transcription' (original_result). Values are stored
    as given, without any conversion.
    '''
    return {
        'Content ID': video_id,
        'English Transcription': english_result,
        'Original Language Transcription': original_result,
    }

In [44]:

def _transcribe_audio(audio_file, min_confidence):
    '''
    Detect the spoken language of `audio_file` and transcribe it.

    Returns a tuple (english_text, original_text):
      * (None, None) when the most probable language scores below
        `min_confidence`; nothing is transcribed in that case.
      * (text, None) when the detected language is English.
      * (translated_text, text) otherwise, where the translation is produced
        segment by segment with the module-level `translator`.
    '''
    # detect language (pad_or_trim fits the audio to the window the model expects)
    audio = whisper.pad_or_trim(whisper.load_audio(audio_file))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)
    print(language)

    # if language confidence is low, skip transcription entirely
    if probs[language] < min_confidence:
        print("language confidence too low, quitting...")
        return None, None

    if language == 'en':
        print('language is english, transcribing...')
        return model.transcribe(audio_file, verbose=True)['text'], None

    # language not english: keep the original text and translate it to english
    print('language not english, transcribing original language...')
    transcription = model.transcribe(audio_file, verbose=True)
    print('translating to english...')
    translated = []
    for segment in transcription['segments']:
        text = segment['text'].strip()
        if not text:
            # nothing to translate; avoid sending an empty request
            continue
        translated.append(translator.translate(text).text.strip())
    # join with spaces so consecutive segments do not run into each other
    return " ".join(translated), transcription['text']


def process_videos(file, min_confidence=0.1):
    '''
    Transcribe one video from `video_folder` and return its CSV record.

    The audio track is extracted to a temporary WAV file in the working
    directory, transcribed, and the WAV file is removed again, even when a
    step fails.

    Parameters:
        file: name of the video file inside `video_folder`.
        min_confidence: minimum language-detection probability required to
            transcribe; below it both transcriptions are left as None.

    Returns:
        A one-element list holding the dict built by `format_data`. The
        transcriptions are None when the video has no audio track or the
        language confidence is too low.
    '''
    video_file = os.path.join(video_folder, file)
    video_id = os.path.splitext(file)[0]
    audio_file = f"{os.path.splitext(os.path.basename(video_file))[0]}.wav"

    english_result = None
    original_result = None
    try:
        # load video and extract its audio; always release the clip's resources
        clip = mp.VideoFileClip(video_file)
        try:
            has_audio = clip.audio is not None
            if has_audio:
                clip.audio.write_audiofile(audio_file, logger=None)
        finally:
            clip.close()

        if has_audio:
            english_result, original_result = _transcribe_audio(audio_file, min_confidence)
        else:
            # no WAV file was written, so there is nothing to transcribe
            print("no audio track found, skipping transcription...")
    finally:
        # remove audio file (it may be missing if extraction never happened)
        if os.path.exists(audio_file):
            os.remove(audio_file)

    print(video_file, 'completed')
    return [format_data(english_result, original_result, video_id)]

In [None]:
# stop this cell to quit processing

# Content IDs already present in the CSV, built once instead of on every
# iteration. They are compared as strings because the column may have been
# read with a numeric dtype, which would never equal a file-name stem.
processed_ids = set(df['Content ID'].astype(str))

for file in files:
    content_id = os.path.splitext(file)[0]
    if content_id in processed_ids:
        continue
    # append one row per video so finished work survives an interrupted run
    pd.DataFrame(process_videos(file)).to_csv('video_processed.csv', mode='a', header=False)
    processed_ids.add(content_id)