In [1]:
# Reference
# https://huggingface.co/facebook/seamless-m4t-v2-large
# https://github.com/protocolbuffers/protobuf/tree/main/python#installation

In [2]:
import warnings
import contextlib

import requests
from urllib3.exceptions import InsecureRequestWarning

# Keep a handle on the unpatched method so the wrapper below can delegate to it.
old_merge_environment_settings = requests.Session.merge_environment_settings

@contextlib.contextmanager
def no_ssl_verification():
    """Temporarily disable TLS certificate verification for ``requests``.

    While the context is active, ``requests.Session.merge_environment_settings``
    is replaced by a wrapper that forces ``verify=False`` on the merged
    settings, and ``InsecureRequestWarning`` is silenced.  On exit the method
    that was installed when the context was entered is put back and every
    adapter that was used inside the context is closed.

    SECURITY: with verification off, the identity of the remote server is not
    checked.  Only use this where that risk is understood and accepted.

    Yields
    ------
    None
    """
    opened_adapters = set()
    # Remember what was installed on entry (it may already be a patched
    # version if contexts are nested) so that exit restores exactly that.
    previous_merge_environment_settings = requests.Session.merge_environment_settings

    def merge_environment_settings(self, url, proxies, stream, verify, cert):
        # Verification happens only once per connection so we need to close
        # all the opened adapters once we're done. Otherwise, the effects of
        # verify=False persist beyond the end of this context manager.
        opened_adapters.add(self.get_adapter(url))

        settings = old_merge_environment_settings(self, url, proxies, stream, verify, cert)
        settings['verify'] = False

        return settings

    requests.Session.merge_environment_settings = merge_environment_settings

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', InsecureRequestWarning)
            yield
    finally:
        requests.Session.merge_environment_settings = previous_merge_environment_settings

        for adapter in opened_adapters:
            # Closing is best-effort cleanup; ignore ordinary errors but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except`` did not).
            with contextlib.suppress(Exception):
                adapter.close()

In [3]:
from google.protobuf.internal import api_implementation

In [23]:
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torchaudio

# One constant for the checkpoint id so the processor and the model are always
# loaded from the same repository.
MODEL_ID = "facebook/seamless-m4t-v2-large"

# Both objects are created inside no_ssl_verification(), i.e. with TLS
# certificate verification switched off for requests made in this block.
with no_ssl_verification():
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = SeamlessM4Tv2Model.from_pretrained(MODEL_ID)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
def translate(in_phrase, in_language, out_language):
    """Turn a text phrase into playable speech in another language.

    Relies on the notebook-level ``processor`` and ``model`` objects created in
    an earlier cell.

    Parameters
    ----------
    in_phrase
        Text handed to the processor as ``text``.
    in_language
        Source language code handed to the processor as ``src_lang``
        (the example call in this notebook uses ``"eng"``).
    out_language
        Target language code handed to ``model.generate`` as ``tgt_lang``
        (the example call in this notebook uses ``"urd"``).

    Returns
    -------
    IPython.display.Audio
        Player widget built from the first element of ``model.generate``'s
        output.
    """
    from IPython.display import Audio

    # NOTE(review): the SSL-disabling context is kept from the original; the
    # calls below operate on already-loaded objects and presumably need no
    # network access -- confirm and drop the wrapper if so.
    with no_ssl_verification():
        text_inputs = processor(text=in_phrase, src_lang=in_language, return_tensors="pt")
        generated = model.generate(**text_inputs, tgt_lang=out_language)

    audio_array_from_text = generated[0].cpu().numpy().squeeze()

    # Take the playback rate from the model configuration instead of a
    # hard-coded 14400, so the clip is played at the rate it was generated at.
    sample_rate = model.config.sampling_rate
    return Audio(audio_array_from_text, rate=sample_rate)

In [41]:
# Example run: an English sentence ("eng") rendered as Urdu ("urd") speech.
out_audio = translate(
    in_phrase="my toy cat is cuddly but not everyone in the family likes her",
    in_language="eng",
    out_language="urd",
)

In [42]:
# Bare last expression: the notebook renders the value returned by translate() here.
out_audio

In [43]:
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torchaudio

with no_ssl_verification():
    # Speech input: read the clip from disk, then resample it, because the
    # input must be a 16 kHz waveform array.
    audio, orig_freq = torchaudio.load("./cats.wav")
    audio = torchaudio.functional.resample(
        audio,
        orig_freq=orig_freq,
        new_freq=16_000,
    )

    # Same processor / generate pipeline as for text, with "rus" as the target.
    audio_inputs = processor(audios=audio, return_tensors="pt")
    audio_array_from_audio = (
        model.generate(**audio_inputs, tgt_lang="rus")[0]
        .cpu()
        .numpy()
        .squeeze()
    )

RuntimeError: Couldn't find appropriate backend to handle uri ./cats.wav and format None.

In [None]:
import scipy

# Write the generated speech to a WAV file at the model's configured rate.
# The rate is read from model.config (not from the model object itself).
# NOTE(review): in the cells visible here, audio_array_from_text only exists as
# a local inside translate(); it must be defined at notebook scope before this
# cell can run -- confirm where it is meant to come from.
sample_rate = model.config.sampling_rate
scipy.io.wavfile.write("out_from_text.wav", rate=sample_rate, data=audio_array_from_text)
