In [1]:
pip install datasets



In [2]:
pip install cohere



In [3]:
pip install -U langchain-community



In [105]:
pip install gradio==3.50.2




In [113]:
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [114]:
from transformers import pipeline
import torchaudio
import numpy as np
def transcribe_audio(audio_data):
    if isinstance(audio_data, dict) and "array" in audio_data and "sampling_rate" in audio_data:
        audio_input = {
            "raw": np.array(audio_data["array"]),
            "sampling_rate": audio_data["sampling_rate"]
        }
        result = pipe(audio_input)
        transcription = result["text"]
        return transcription
    else:
        raise ValueError("Invalid audio data format. Must be a dictionary with 'array' and 'sampling_rate' keys.")


In [115]:
from datasets import load_dataset

dataset = load_dataset("facebook/voxpopuli", "it", split="validation", streaming=True)
sample = next(iter(dataset))

In [116]:
print(sample["audio"])

{'path': 'dev_part_0/20130522-0900-PLENARY-9-it_20130522-13:08:49_3.wav', 'array': array([-0.00064087,  0.02093506, -0.02679443, ..., -0.00280762,
       -0.00296021, -0.00247192]), 'sampling_rate': 16000}


In [117]:
print(sample)

{'audio_id': '20130522-0900-PLENARY-9-it_20130522-13:08:49_3', 'language': 5, 'audio': {'path': 'dev_part_0/20130522-0900-PLENARY-9-it_20130522-13:08:49_3.wav', 'array': array([-0.00064087,  0.02093506, -0.02679443, ..., -0.00280762,
       -0.00296021, -0.00247192]), 'sampling_rate': 16000}, 'raw_text': 'Penso che questo sia un passo in avanti importante nella costruzione di uno spazio giuridico di libertà di circolazione e di protezione dei diritti per le persone in Europa.', 'normalized_text': 'penso che questo sia un passo in avanti importante nella costruzione di uno spazio giuridico di libertà di circolazione e di protezione dei diritti per le persone in europa.', 'gender': 'female', 'speaker_id': '96917', 'is_gold_transcript': True, 'accent': 'None'}


In [118]:
transcribe_audio(sample["audio"])

' psicologico e sociale. Penso che sia un passo avanti importante nella costruzione di uno spazio juridico, di libertà di circolazione e di protezione dei diritti.'

# Text Translation

In [119]:
import cohere
from langchain.embeddings import CohereEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import Cohere as LangchainCohere
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [120]:
cohere_api_key = "ozX8ZzTAhkER4sOxLzQQvidjobghzyOg8Fouw824"

In [121]:
cohere_client = cohere.Client(cohere_api_key)
cohere_llm = LangchainCohere(cohere_api_key=cohere_api_key)
prompt_template = PromptTemplate(
    input_variables=["text"],
    template="Translate the following text to English:\n\n{text} and only give me the text"
)



In [122]:
from langchain_core.runnables import RunnableLambda
def translate_text(params):
    prompt = prompt_template.format(text=params["text"])
    response = cohere_llm(prompt)
    return response


In [123]:
translation_lambda = RunnableLambda(translate_text)

def translate(audio_sample):
    transcribed_text = transcribe_audio(audio_sample)
    print(transcribed_text)
    translated_text = translation_lambda.invoke({
        "text": transcribed_text
    })
    return translated_text

In [124]:
print(sample["audio"])

{'path': 'dev_part_0/20130522-0900-PLENARY-9-it_20130522-13:08:49_3.wav', 'array': array([-0.00064087,  0.02093506, -0.02679443, ..., -0.00280762,
       -0.00296021, -0.00247192]), 'sampling_rate': 16000}


In [125]:
translated_text = translate_audio(sample["audio"])
print(translated_text)

 psicologico e sociale. Penso che sia un passo avanti importante nella costruzione di uno spazio juridico, di libertà di circolazione e di protezione dei diritti.
 psychological and social. I believe it is an important step forward in building a legal space, circulation freedom, and protection of rights. 


# Text to Speech

In [128]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [129]:
model.to(device)
vocoder.to(device)

SpeechT5HifiGan(
  (conv_pre): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
  (upsampler): ModuleList(
    (0): ConvTranspose1d(512, 256, kernel_size=(8,), stride=(4,), padding=(2,))
    (1): ConvTranspose1d(256, 128, kernel_size=(8,), stride=(4,), padding=(2,))
    (2): ConvTranspose1d(128, 64, kernel_size=(8,), stride=(4,), padding=(2,))
    (3): ConvTranspose1d(64, 32, kernel_size=(8,), stride=(4,), padding=(2,))
  )
  (resblocks): ModuleList(
    (0): HifiGanResidualBlock(
      (convs1): ModuleList(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
        (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
      )
      (convs2): ModuleList(
        (0-2): 3 x Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      )
    )
    (1): HifiGanResidualBlock(
      (convs1): ModuleList(
        (0): Conv1d(256, 256,

In [130]:
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [131]:
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()

In [134]:
import numpy as np

target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

def load_audio_file(filepath):
    waveform, sample_rate = torchaudio.load(filepath)
    return {"array": waveform.squeeze().numpy(), "sampling_rate": sample_rate}


def speech_to_speech_translation(input_data):
    if isinstance(input_data, str):
        audio = load_audio_file(input_data)
    elif isinstance(input_data, dict):
        audio = input_data
    else:
        raise ValueError("Invalid input data. Must be a file path or a dictionary with audio data.")

    translated_text = translate_audio(audio)
    synthesised_speech = synthesise(translated_text)
    max_range = np.iinfo(np.int16).max
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
    return 16000, synthesised_speech


In [135]:
from IPython.display import Audio

sampling_rate, synthesised_speech = speech_to_speech_translation(sample["audio"])

Audio(synthesised_speech, rate=sampling_rate)

 psicologico e sociale. Penso che sia un passo avanti importante nella costruzione di uno spazio juridico, di libertà di circolazione e di protezione dei diritti.


In [None]:
!pip show gradio

In [None]:
import gradio as gr

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch(debug=True)



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------
IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------
IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------
IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------
Running on public URL: https://56970c5b45d424aa9d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




 you




 Bonjour, je viens de manger ma nourriture.


  return F.conv1d(input, weight, bias, self.stride,
