# Summary

This is a Zero Shot Classifier for whisper.
The goal is to feed environmental or background sounds as classes to the whisper model and retrieve the probabilities of each class.
An classes.text file holds the list of classes each in a square bracket.
Additionalal classess can be added to the txt file to increase robustness.
Although not integrated, whisper was to trained to support zero-shot classification (according the the whisper research paper.
A very helpful repo https://github.com/jumon/zac first mentioned using zero shot for whisper.

In [1]:
#import libraries
from typing import List, Optional
import torch
import torch.nn.functional as F
from whisper.audio import N_FRAMES, N_MELS, log_mel_spectrogram, pad_or_trim
from whisper.model import Whisper
from whisper.tokenizer import Tokenizer

# Processing the Audio

In [2]:
@torch.no_grad()
def calculate_audio_features(audio_path: Optional[str], model: Whisper) -> torch.Tensor:
    if audio_path is None:
        segment = torch.zeros((N_MELS, N_FRAMES), dtype=torch.float32).to(model.device)
    else:
        mel = log_mel_spectrogram(audio_path)
        segment = pad_or_trim(mel, N_FRAMES).to(model.device)
    return model.embed_audio(segment.unsqueeze(0))


# Calculating the log probabilities

In [3]:

@torch.no_grad()
def calculate_average_logprobs(
    model: Whisper,
    audio_features: torch.Tensor,
    class_names: List[str],
    tokenizer: Tokenizer,
) -> torch.Tensor:
    initial_tokens = (
        torch.tensor(tokenizer.sot_sequence_including_notimestamps).unsqueeze(0).to(model.device)
    )
    eot_token = torch.tensor([tokenizer.eot]).unsqueeze(0).to(model.device)

    average_logprobs = torch.zeros(len(class_names))
    for i, class_name in enumerate(class_names):
        class_name_tokens = (
            torch.tensor(tokenizer.encode(" " + class_name)).unsqueeze(0).to(model.device)
        )
        input_tokens = torch.cat([initial_tokens, class_name_tokens, eot_token], dim=1)

        logits = model.logits(input_tokens, audio_features)  # (1, T, V)
        logprobs = F.log_softmax(logits, dim=-1).squeeze(0)  # (T, V)
        logprobs = logprobs[len(tokenizer.sot_sequence_including_notimestamps) - 1 : -1]  # (T', V)
        logprobs = torch.gather(logprobs, dim=-1, index=class_name_tokens.view(-1, 1))  # (T', 1)
        average_logprob = logprobs.mean().item()
        average_logprobs[i] = average_logprob

    return average_logprobs


def calculate_internal_lm_average_logprobs(
    model: Whisper,
    class_names: List[str],
    tokenizer: Tokenizer,
    verbose: bool = False,
) -> torch.Tensor:
    audio_features_from_empty_input = calculate_audio_features(None, model)
    average_logprobs = calculate_average_logprobs(
        model=model,
        audio_features=audio_features_from_empty_input,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    if verbose:
        print("Internal LM average log probabilities for each class:")
        for i, class_name in enumerate(class_names):
            print(f"  {class_name}: {average_logprobs[i]:.3f}")
    return average_logprobs

In [4]:
from typing import Dict
import gradio as gr
import whisper
from whisper.tokenizer import get_tokenizer

# Zero Shot Classifier

In [5]:
model_cache = {}


def zero_shot_classify(audio_path: str, class_names: str, model_name: str) -> Dict[str, float]:
    class_names = class_names.split(",")
    tokenizer = get_tokenizer(multilingual=".en" not in model_name)

    if model_name not in model_cache:
        model = whisper.load_model(model_name)
        model_cache[model_name] = model
    else:
        model = model_cache[model_name]

    internal_lm_average_logprobs = calculate_internal_lm_average_logprobs(
        model=model,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    audio_features = calculate_audio_features(audio_path, model)
    average_logprobs = calculate_average_logprobs(
        model=model,
        audio_features=audio_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    average_logprobs -= internal_lm_average_logprobs
    scores = average_logprobs.softmax(-1).tolist()
    return {class_name: score for class_name, score in zip(class_names, scores)}




# Processing the Output in English

In [6]:
#Importing necessary modules
import os
import sys
import subprocess
import numpy as np

In [7]:
#Extract mp3 from any video
def video2mp3(video_file, output_ext="mp3"):
    filename, ext = os.path.splitext(video_file)
    subprocess.call(["ffmpeg", "-y", "-i", video_file, f"{filename}.{output_ext}"], 
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
    return f"{filename}.{output_ext}"

In [8]:
#Video file
input_video = "sahsiyet_trim_1.mp4"

In [9]:
#Extracting the mp3 file
audio_file = video2mp3(input_video)

In [10]:
#Reading the classnames. additional classes can be added to the class_names.txt file
def extract_class_name():
    with open('class_names.txt') as topo_file:
        class_names = ""
        for line in topo_file:
            class_names += line + ","
    return class_names

class_names_ = extract_class_name()

In [11]:
#Inputs for the classifier 
audio_path = audio_file #an audio of a laughter
model_name = "base"

In [12]:
classes_prob = zero_shot_classify(audio_path,class_names_,model_name)

In [13]:
#Scaling the class probabilities for better visualization
for key in classes_prob:
    #scaling the class probabilities and rounding to 2 decimal places
    classes_prob[key] = round(classes_prob[key]*10, 2)
    
    #this will remove the empty string generated by the output
    if key == '':
        classes_prob[key] = 0
    #sorting by highest probability
    dict(sorted(classes_prob.items(), key=lambda item: item[1]))
print(dict(sorted(classes_prob.items(), key=lambda item: item[1], reverse=True)))

{'[Gun shot]': 0.97, '[can_opening]\n': 0.4, '[glass_breaking]\n': 0.38, '[thunderstorm]\n': 0.34, '[toilet_flush]\n': 0.34, '[door_wood_creaks]\n': 0.29, '[drinking_sipping]\n': 0.26, '[helicopter whirring]\n': 0.25, '[car_horn]\n': 0.24, '[helicopter]\n': 0.24, '[door_wood_knock]\n': 0.24, '[pouring_water]\n': 0.22, '[crackling_fire]\n': 0.2, '[fireworks]\n': 0.19, '[vacuum_cleaner]\n': 0.19, '[water_drops]\n': 0.18, '[brushing_teeth]\n': 0.18, '[hand_saw]\n': 0.18, '[clapping]\n': 0.17, '[washing_machine]\n': 0.17, '[sneezing]\n': 0.16, '[siren]\n': 0.16, '[snoring]\n': 0.16, '[coughing]\n': 0.16, '[laughing]\n': 0.15, '[chirping_birds]\n': 0.15, '[sea_waves]\n': 0.14, '[clock_alarm]\n': 0.13, '[engine]\n': 0.12, '[sheep]\n': 0.12, '[chainsaw]\n': 0.12, '[clock_tick]\n': 0.12, '[clock ticking]\n': 0.12, '[insects]\n': 0.11, '[airplane]\n': 0.11, '[crickets]\n': 0.11, '[church_bells]\n': 0.11, '[footsteps]\n': 0.11, '[dog barking]\n': 0.11, '[cow]\n': 0.1, '[keyboard_typing]\n': 0.1,

In [14]:
#select the class with the maximum probability
background_noise = max(classes_prob, key= lambda x: classes_prob[x])
background_noise

'[Gun shot]'

In [15]:
#removing the opening and closing square brackets
background_noise = background_noise[1:-1]

In [16]:
#Processing the subtitle file (start and end time will be generated automatically in future update)
index = 1
start_time = "00:00:01,310"  
end_time = "00:00:04,000"
seperator = '-->'
time = start_time + " " + seperator + " "+end_time
background_noise = background_noise
subtitle = ["1 \n", time+"\n", background_noise+"\n"]

In [17]:
#Open subtitle file, it will create if it doesnt exist
srt_file = open("srt_file.srt", "w")
#add the subtitles line by line
srt_file.writelines(subtitle)
srt_file.close()

In [18]:
#Chnaging the file access mode from write to read so that i can print and see the output
srt_file = open("srt_file.srt", "r+")
print(srt_file.read())

1 
00:00:01,310 --> 00:00:04,000
Gun shot



In [19]:
import subprocess
#it is used to run external programs even from different languages like c and c++. in our case to run an ffmpeg command line command

In [20]:
#Add the subttile to the video
command = [
    "ffmpeg",
    "-i", input_video, #input video
    "-vf", "subtitles=srt_file.srt:force_style='Fontsize=24,Outline=1'", #add the subtitle file here
    "-c:v", "libx264", #hard subtitle
    "-c:a", "copy",
    "output enlgish.mp4" #output video
]

subprocess.run(command)

CompletedProcess(args=['ffmpeg', '-i', 'sahsiyet_trim_1.mp4', '-vf', "subtitles=srt_file.srt:force_style='Fontsize=24,Outline=1'", '-c:v', 'libx264', '-c:a', 'copy', 'output enlgish.mp4'], returncode=0)

In [21]:
from google.cloud import translate_v2 as translate

# Turkish background noise
Processing the output in turkish

Method used: Translating the English background noise output  to Turkish

In [22]:
import googletrans

In [23]:
print(googletrans.__version__) #currently this is the working version, pip install googletrans==4.0.0rc1

4.0.0-rc.1


In [24]:
from googletrans import Translator

In [25]:
translator = Translator()

In [41]:
translated = translator.translate(background_noise, dest='turkish')

In [42]:
background_noise_turk = translated.text

In [43]:
print(background_noise_turk)

Silah atışı


In [44]:
#turkish subtitle (time paragms is indiccated during english subtitle)
subtitle2 = ["1 \n", time+"\n", background_noise_turk+"\n"]

In [52]:
#Creating a subtitle file in Turkish
#Open subtitle file, it will create if it doesnt exist
srt_file_turk = open("srt_file_turk.srt", "w", encoding="utf-8")
#add the subtitles line by line
srt_file_turk.writelines(subtitle2)
srt_file_turk.close()

In [53]:
#Chnaging the file access mode from write to read so that i can print and see the output
srt_file = open("srt_file_turk.srt", "r+")
print(srt_file.read())

1 
00:00:01,310 --> 00:00:04,000
Silah atÄ±ÅŸÄ±



In [56]:
video = "test.mp4"
import subprocess
import ffmpeg

In [58]:
#Add the subttile to the video
command = [
    "ffmpeg",
    "-i", video, #input video
    "-vf", "subtitles=sahiyeset (1).srt:force_style='Fontsize=24,Outline=1'", #add the subtitle file here
    "-c:v", "libx264", #hard subtitle, meaning subtitle cannot be seperated from the video afterwards, soft subtitle means the opposite
    "-c:a", "copy",
    "output 5.mp4" #output video
]

subprocess.run(command)

CompletedProcess(args=['ffmpeg', '-i', 'test.mp4', '-vf', "subtitles=sahiyeset (1).srt:force_style='Fontsize=24,Outline=1'", '-c:v', 'libx264', '-c:a', 'copy', 'output 5.mp4'], returncode=1)

# Next Steps
1. Generating the background noise for Audio Segments 
2. Automatic timestamps
3. Combining it with a background noise with subtitle generator
4. Adding more classess to the data
5. Generating the log probability for each class independently to improve accuracy
6. Re-training to improve speed and accuracy.