In [None]:
# Python code to convert video to audio
import moviepy.editor as mp
import torch
import librosa
import numpy as np
import soundfile as sf
from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [None]:
# Insert Local Video File Path 
clip = mp.VideoFileClip(r"./obamaDeepfake.mp4")
  
# Insert Local Audio File Path
clip.audio.write_audiofile(r"./transcript.wav")

In [None]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
import wave
import json

from vosk import Model, KaldiRecognizer, SetLogLevel
import Word as custom_Word

model_path = "models/vosk-model-en-us-0.21"
audio_filename = "audio/speech_recognition_systems.wav"

model = Model(model_path)
wf = wave.open(audio_filename, "rb")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)

# get the list of JSON dictionaries
results = []
# recognize speech using vosk model
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        part_result = json.loads(rec.Result())
        results.append(part_result)
part_result = json.loads(rec.FinalResult())
results.append(part_result)

# convert list of JSON dictionaries to list of 'Word' objects
list_of_Words = []
for sentence in results:
    if len(sentence) == 1:
        # sometimes there are bugs in recognition 
        # and it returns an empty dictionary
        # {'text': ''}
        continue
    for obj in sentence['result']:
        w = custom_Word.Word(obj)  # create custom Word object
        list_of_Words.append(w)  # and add it to list

wf.close()  # close audiofile

# output to the screen
for word in list_of_words:
    print(word.to_string())

In [None]:
import speech_recognition as sr

r = sr.Recognizer()

with sr.AudioFile('transcript.wav') as source:
    audio = r.record(source, duration=30)
    text = ""
    try:
        while True:
            text += r.recognize_google(audio)
            audio = r.record(source, duration=30)
    except sr.WaitTimeoutError:
        pass

with open('output.txt', 'w') as f:
    f.write(text)


In [None]:
file_name = 'transcript.wav'

data = wavfile.read(file_name)
framerate = data[0]
sounddata = data[1]
time = np.arange(0,len(sounddata))/framerate
input_audio, _ = librosa.load(file_name, sr=16000)
input_values = tokenizer(input_audio, return_tensors="pt").input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]
print(transcription)


In [None]:
file_name = 'transcript.wav'
framerate = 16000
input_audio, _ = librosa.load(file_name, sr=framerate)

# Use a loop to process the audio file in segments, if it's too large to fit in memory
segment_length = 30  # in seconds
num_segments = int(np.ceil(len(input_audio) / (segment_length * framerate)))
transcription = ''
start_time = 0
for i in range(num_segments):
    start = i * segment_length * framerate
    end = (i + 1) * segment_length * framerate
    input_segment = input_audio[start:end]
    input_values = tokenizer(input_segment, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    segment_transcription = tokenizer.batch_decode(predicted_ids)[0]
    transcription += segment_transcription

    # Print the timestamp of every occurrence of the letter "M"
    for j, char in enumerate(segment_transcription):
        if char == 'M':
            print(f"Occurrence of M found at {start_time + j / framerate} seconds")
    start_time += segment_length

print(transcription)

In [24]:
import librosa
import numpy as np
import torch
import wave
import contextlib

filename = 'transcript.wav'

with contextlib.closing(wave.open(filename,'r')) as f:
    rate, audio_data = wavfile.read(filename)
    audio_data = audio_data.T
    audio_data = audio_data / np.max(np.abs(audio_data))  # normalize the audio data to the range [-1, 1]

    mfccs = librosa.feature.mfcc(y=audio_data, sr=rate, n_mfcc=13)

    model = torch.load("acoustic_model.pt")  # load a pre-trained acoustic model
    inputs = torch.tensor(mfccs.T).unsqueeze(0)  # add a batch dimension to the inputs
    outputs = model(inputs)  # run the inputs through the acoustic model
    _, predicted_phonemes = torch.max(outputs, dim=2)  # get the predicted phonemes

    for i, predicted_phoneme in enumerate(predicted_phonemes[0]):
        if predicted_phoneme == "M":  # check if the predicted phoneme is "M"
            time = i * (len(audio_data) / mfccs.shape[1]) / rate
            print("Phonetic sound of M found at time: {:.2f} seconds".format(time))


FileNotFoundError: [Errno 2] No such file or directory: 'acoustic_model.pt'