In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration,AutoTokenizer,AutoModelForCausalLM
from datasets import Audio, load_dataset
import torch
import librosa
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from time import time
from llama_cpp import Llama
import json

# Models and processors are loaded in the cells below.




In [None]:
# Read the notebook's settings (model locations, audio file path) from the
# JSON file one directory above the working directory.
# The encoding is given explicitly so the file is parsed the same way
# regardless of the platform's default text encoding.
with open('../config.json', "r", encoding="utf-8") as json_file:
    config = json.load(json_file)

In [None]:
# Speech-to-text stage: the processor and the Whisper model are both loaded
# from the checkpoint named by the 's2t' config entry.
s2t_checkpoint = config['s2t']
processor = WhisperProcessor.from_pretrained(s2t_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(s2t_checkpoint)

# Decoder prompt ids built for language "hindi" and task "translate";
# they are passed to generate() when the audio is decoded.
forced_decoder_ids = processor.get_decoder_prompt_ids(task="translate", language="hindi")


In [None]:
# Question-answering stage: a llama.cpp model loaded from the path stored
# under the 'q/a_model' config key.
qa_model_path = config['q/a_model']
llm = Llama(model_path=qa_model_path)

In [None]:
# Text-to-text translation stage: the mBART model and its tokenizer are both
# loaded from the checkpoint named by the 't2t' config entry.
t2t_checkpoint = config['t2t']
model_translate = MBartForConditionalGeneration.from_pretrained(t2t_checkpoint)
# src_lang="en_XX" declares the text fed to this tokenizer as English.
tokenizer = MBart50TokenizerFast.from_pretrained(t2t_checkpoint, src_lang="en_XX")


In [None]:
# Load the recording named by the 'audio-file' config entry, asking librosa
# for a 16 kHz sample rate.
TARGET_SAMPLE_RATE = 16000
audio_path = config['audio-file']
audio_data, sampling_rate = librosa.load(audio_path, sr=TARGET_SAMPLE_RATE)

In [None]:
# Turn the raw waveform into the PyTorch feature tensor the Whisper model takes.
whisper_inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
input_features = whisper_inputs.input_features

# Generate token ids, steering the decoder with the prompt ids prepared earlier.
predicted_ids = model.generate(
    input_features,
    forced_decoder_ids=forced_decoder_ids,
)

# Decode the token ids to text, leaving out special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [None]:
# Show the first decoded string as this cell's output.
transcription[0]

In [None]:
# Wrap the transcribed text in a "Q: ... A:" prompt for the language model.
prompt = f"Q:{transcription[0]}? A: "

output = llm(
    prompt,
    max_tokens=100,      # can change max_tokens
    stop=["Q:", "\n"],   # generation stops at a new "Q:" or a newline
    echo=True,           # the following cell splits the returned text on 'A:'
)
print(output)

In [None]:
# Text of the first choice returned by the language model.
generated_text = output["choices"][0]['text']

# Keep everything after the first 'A:' marker. Splitting only once means an
# answer that itself contains 'A:' is kept whole instead of being cut off at
# its second occurrence.
a = generated_text.split('A:', 1)[1]

# Break the answer at '. ' boundaries and print one piece per line.
steps = [step.strip() for step in a.split('. ')]

print('\n'.join(steps))


In [None]:
# Language-code token id that makes the decoder start in Hindi.
hindi_bos_id = tokenizer.lang_code_to_id["hi_IN"]

# Tokenize the answer text as PyTorch tensors.
model_inputs = tokenizer(a, return_tensors="pt")

# translate from English to Hindi
generated_tokens = model_translate.generate(forced_bos_token_id=hindi_bos_id, **model_inputs)

# Decode the generated ids to text, leaving out special tokens.
res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [None]:
# Hindi sentences normally end with the danda ('।') rather than a full stop,
# so splitting on '. ' alone would leave the translation as a single piece.
# Map the danda onto the '. ' separator first, then split as before.
translated_text = res[0].replace('।', '. ')

# One step per line; empty pieces (e.g. after a trailing terminator) are dropped.
steps = [piece.strip() for piece in translated_text.split('. ') if piece.strip()]
print('\n'.join(steps))


In [None]:
# time_b=time()
# time_b-time_a