# Task 1 - MMS-LID models - Inference on audio samples, accuracy (predicted vs ground truth languages, performance analysis)

In [1]:
############## setup fairseq
import os

%cd /content
!git clone https://github.com/pytorch/fairseq

# Change current working directory
!pwd
%cd "/content/fairseq"
!pip install --editable ./
!pip install tensorboardX


############## MMS-LID - download
available_models = ["l126", "l256", "l512", "l1024", "l2048", "l4017"]

# We will use L126 model which can recognize 126 languages
model_name = available_models[0] # l126
print(f"Using model - {model_name}")
print(f"Visit https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}_langs.html to check all the languages supported by this model.")

! mkdir -p /content/models_lid
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}.pt'
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt'

/content
fatal: destination path 'fairseq' already exists and is not an empty directory.
/content
/content/fairseq
Obtaining file:///content/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: fairseq
  Building editable for fairseq (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fairseq: filename=fairseq-0.12.2-0.editable-cp310-cp310-linux_x86_64.whl size=9393 sha256=864d6a3b13271c5ae961a790fda5849e7d0467e8b857c8436f3eedfcef5b5b74
  Stored in directory: /tmp/pip-ephem-wheel-cache-2dhufh5l/wheels/c6/d7/db/bc419b1daa8266aa8de2a7c4d29f62dbfa814e8701fe4695a2
Successfully built fairseq
Installing collected packages: fairseq
  Attempting uninstall: fairseq
    Found existing installation: fairseq 0.12.2
    Uninstalling fairseq-0.12.

In [2]:
%cd "/content"

############## get english and native language (hindi) sentences and recordings - github
!rm -rf /content/audio_samples
!rm -rf /content/audio_sentences
!rm -rf /content/speech-understanding

!git clone https://github.com/shrivastava95/speech-understanding.git
!mkdir /content/audio_samples
!cp -r /content/speech-understanding/assignnments/PA1/recordings/converted /content/audio_samples
!mkdir /content/audio_sentences
!cp -r /content/speech-understanding/assignnments/PA1/sentences /content/audio_sentences

%cd "/content/fairseq"

/content
Cloning into 'speech-understanding'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 45 (delta 5), reused 40 (delta 3), pack-reused 0[K
Receiving objects: 100% (45/45), 5.88 MiB | 15.92 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/fairseq


In [3]:
############ Running Inferences
import os

os.environ["PYTHONPATH"] = "/content/fairseq"
os.environ["PREFIX"] = "INFER"
os.environ["HYDRA_FULL_ERROR"] = "1"
os.environ["USER"] = "mms_lid_user"

In [4]:
!rm -rf /content/manifest

In [5]:
############## prepare manifest files
audio_samples_path = '/content/audio_samples/converted'
! mkdir -p /content/manifest/
import os
with open("/content/manifest/dev.tsv", "w") as ftsv, open("/content/manifest/dev.lang", "w") as flang:
  ftsv.write("/\n")

  for fl in os.listdir(audio_samples_path):
    if not fl.endswith(".wav"):
      continue
    if 'experiment' in 'fl':
      continue
    print(fl)
    audio_path = f"{audio_samples_path}/{fl}"
    # duration should be number of samples in audio. For inference, using a random value should be fine.
    duration = 1234
    ftsv.write(f"{audio_path}\t{duration}\n")
    flang.write("eng\n") # This is the "true" language for the audio. For inference, using a random value should be fine.

2hin.wav
2eng.wav
eng2_experiment1_ishaan.wav
eng1_experiment1_google.wav
eng1_experiment1_ishaan.wav
1hin.wav
eng2_experiment1_google.wav
1eng.wav


In [6]:
######### inference!!!
!python3 examples/mms/lid/infer.py /content/models_lid/{model_name} --path /content/models_lid/{model_name}/mms1b_l126.pt \
  --task audio_classification  --infer-manifest /content/manifest/dev.tsv --output-path /content/manifest/

2024-02-02 07:57:48.431544: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 07:57:48.431586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 07:57:48.438816: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-02 07:57:48.457076: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
| loading model from /content/models_lid/l126/mms1b_l

In [7]:
print("----- INPUT FILES -----")
! tail -n +2 /content/manifest/dev.tsv

print("\n----- TOP-K PREDICTONS WITH SCORE -----")
! cat /content/manifest//predictions.txt

----- INPUT FILES -----
/content/audio_samples/converted/2hin.wav	1234
/content/audio_samples/converted/2eng.wav	1234
/content/audio_samples/converted/eng2_experiment1_ishaan.wav	1234
/content/audio_samples/converted/eng1_experiment1_google.wav	1234
/content/audio_samples/converted/eng1_experiment1_ishaan.wav	1234
/content/audio_samples/converted/1hin.wav	1234
/content/audio_samples/converted/eng2_experiment1_google.wav	1234
/content/audio_samples/converted/1eng.wav	1234

----- TOP-K PREDICTONS WITH SCORE -----
[["hin", 0.999890148639679], ["urd", 2.9325847208383493e-05], ["pan", 1.2815166883228812e-05]]
[["eng", 0.9943107962608337], ["glv", 0.001171866082586348], ["hin", 0.0008264650823548436]]
[["eng", 0.9730477333068848], ["lat", 0.005167855881154537], ["glv", 0.004020326305180788]]
[["eng", 0.999755322933197], ["fas", 5.6795772252371535e-05], ["spa", 4.291595905669965e-05]]
[["eng", 0.9859381318092346], ["hin", 0.002462681382894516], ["urd", 0.0017371205613017082]]
[["hin", 0.99896

### Experiment 1: reducing the length of the audio recordings to 6 seconds each to see effects on the confidence scores

In [8]:
######## experiment1: reducing the lenght of the audio recordings to 6 seconds each to see effects on the confidence scores
!ffmpeg -i /content/audio_samples/converted/1eng.wav -t 00:00:06 -c copy /content/audio_samples/converted/1eng_cropped.wav
!ffmpeg -i /content/audio_samples/converted/1hin.wav -t 00:00:06 -c copy /content/audio_samples/converted/1hin_cropped.wav
!ffmpeg -i /content/audio_samples/converted/2eng.wav -t 00:00:06 -c copy /content/audio_samples/converted/2eng_cropped.wav
!ffmpeg -i /content/audio_samples/converted/2hin.wav -t 00:00:06 -c copy /content/audio_samples/converted/2hin_cropped.wav

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [9]:
!rm -rf /content/manifest

In [10]:
############## prepare manifest files
audio_samples_path = '/content/audio_samples/converted'
! mkdir -p /content/manifest/
import os
with open("/content/manifest/dev.tsv", "w") as ftsv, open("/content/manifest/dev.lang", "w") as flang:
  ftsv.write("/\n")

  for fl in os.listdir(audio_samples_path):
    if not fl.endswith(".wav"):
      continue
    if not 'cropped' in fl:
      continue
    if 'experiment' in fl:
      continue
    print(fl)
    audio_path = f"{audio_samples_path}/{fl}"
    # duration should be number of samples in audio. For inference, using a random value should be fine.
    duration = 1234
    ftsv.write(f"{audio_path}\t{duration}\n")
    flang.write("eng\n") # This is the "true" language for the audio. For inference, using a random value should be fine.

1hin_cropped.wav
1eng_cropped.wav
2hin_cropped.wav
2eng_cropped.wav


In [11]:
######### inference!!!
!python3 examples/mms/lid/infer.py /content/models_lid/{model_name} --path /content/models_lid/{model_name}/mms1b_l126.pt \
  --task audio_classification  --infer-manifest /content/manifest/dev.tsv --output-path /content/manifest/

2024-02-02 07:59:18.688440: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 07:59:18.688487: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 07:59:18.696525: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-02 07:59:18.713754: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
| loading model from /content/models_lid/l126/mms1b_l

In [12]:
print("----- INPUT FILES -----")
! tail -n +2 /content/manifest/dev.tsv

print("\n----- TOP-K PREDICTONS WITH SCORE -----")
! cat /content/manifest//predictions.txt

----- INPUT FILES -----
/content/audio_samples/converted/1hin_cropped.wav	1234
/content/audio_samples/converted/1eng_cropped.wav	1234
/content/audio_samples/converted/2hin_cropped.wav	1234
/content/audio_samples/converted/2eng_cropped.wav	1234

----- TOP-K PREDICTONS WITH SCORE -----
[["hin", 0.9932485818862915], ["urd", 0.005435482133179903], ["mar", 0.000582613458391279]]
[["eng", 0.870598316192627], ["kan", 0.039448246359825134], ["hin", 0.02283794991672039]]
[["hin", 0.9997424483299255], ["san", 6.999270408414304e-05], ["mar", 5.366690311348066e-05]]
[["eng", 0.9650427103042603], ["hin", 0.01581401377916336], ["urd", 0.006869655102491379]]


### Experiment 2: effect of accent bias on english confidence scores

In [None]:
### Experiment 2: effect of accent bias on english confidence scores


In [13]:
!rm -rf /content/manifest

In [14]:
############## prepare manifest files
audio_samples_path = '/content/audio_samples/converted'
! mkdir -p /content/manifest/
import os
with open("/content/manifest/dev.tsv", "w") as ftsv, open("/content/manifest/dev.lang", "w") as flang:
  ftsv.write("/\n")

  for fl in os.listdir(audio_samples_path):
    if not fl.endswith(".wav"):
      continue
    if not 'experiment' in fl:
      continue
    print(fl)
    audio_path = f"{audio_samples_path}/{fl}"
    # duration should be number of samples in audio. For inference, using a random value should be fine.
    duration = 1234
    ftsv.write(f"{audio_path}\t{duration}\n")
    flang.write("eng\n") # This is the "true" language for the audio. For inference, using a random value should be fine.

eng2_experiment1_ishaan.wav
eng1_experiment1_google.wav
eng1_experiment1_ishaan.wav
eng2_experiment1_google.wav


In [15]:
######### inference!!!
!python3 examples/mms/lid/infer.py /content/models_lid/{model_name} --path /content/models_lid/{model_name}/mms1b_l126.pt \
  --task audio_classification  --infer-manifest /content/manifest/dev.tsv --output-path /content/manifest/

2024-02-02 08:00:31.483000: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 08:00:31.483058: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 08:00:31.485166: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-02 08:00:31.494392: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
| loading model from /content/models_lid/l126/mms1b_l

In [16]:
print("----- INPUT FILES -----")
! tail -n +2 /content/manifest/dev.tsv

print("\n----- TOP-K PREDICTONS WITH SCORE -----")
! cat /content/manifest//predictions.txt

----- INPUT FILES -----
/content/audio_samples/converted/eng2_experiment1_ishaan.wav	1234
/content/audio_samples/converted/eng1_experiment1_google.wav	1234
/content/audio_samples/converted/eng1_experiment1_ishaan.wav	1234
/content/audio_samples/converted/eng2_experiment1_google.wav	1234

----- TOP-K PREDICTONS WITH SCORE -----
[["eng", 0.9730477333068848], ["lat", 0.005167855881154537], ["glv", 0.004020326305180788]]
[["eng", 0.999755322933197], ["fas", 5.6795772252371535e-05], ["spa", 4.291595905669965e-05]]
[["eng", 0.9859381318092346], ["hin", 0.002462681382894516], ["urd", 0.0017371205613017082]]
[["eng", 0.9993102550506592], ["fas", 0.0002330297138541937], ["spa", 0.0001402778725605458]]


# Task 2 - MMS-TTS models - speech generation [English, Native]

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected 

### Audio and transcription files

In [None]:
# !ls speech-understanding/assignnments/PA1/recordings/converted

%cd /content
!rm -rf /content/audio_samples
!rm -rf /content/audio_sentences
!rm -rf speech-understanding/
!mkdir /content/audio_samples
!mkdir /content/audio_sentences
!git clone https://github.com/shrivastava95/speech-understanding.git
!cp -r speech-understanding/assignnments/PA1/recordings/converted/*.wav /content/audio_samples
!cp -r speech-understanding/assignnments/PA1/sentences/sentences.py /content/audio_sentences/
!ls /content/audio_samples

/content
Cloning into 'speech-understanding'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 45 (delta 5), reused 40 (delta 3), pack-reused 0[K
Receiving objects: 100% (45/45), 5.88 MiB | 15.40 MiB/s, done.
Resolving deltas: 100% (5/5), done.
1eng.wav  2eng.wav  eng1_experiment1_google.wav  eng2_experiment1_google.wav
1hin.wav  2hin.wav  eng1_experiment1_ishaan.wav  eng2_experiment1_ishaan.wav


### Loading TTS model:

In [12]:
%cd /content

/content


In [13]:
!pip install librosa



In [14]:
from audio_sentences import sentences

In [15]:
import torch
from transformers import VitsTokenizer, VitsModel, set_seed
import scipy
import librosa
import torch
from scipy.io import wavfile


def resample_waveform(waveform, current_rate, target_rate):
    return librosa.resample(waveform, orig_sr=current_rate, target_sr=target_rate)

def synthesize_speech(model, tokenizer, text, file_path, sampling_rate=16000):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].numpy()
    wavfile.write(file_path, rate=model.config.sampling_rate, data=waveform)


tokenizer1 = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model1 = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer2 = VitsTokenizer.from_pretrained("facebook/mms-tts-hin")
model2 = VitsModel.from_pretrained("facebook/mms-tts-hin")

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/mms-tts-eng were not used when initializing VitsModel: ['flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.7.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.5.weight_v', 'posterior_encoder.wavenet.res_skip_layers.8.weight_g', 'flow.flows.1.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.7.weight_v', 'posterior_encoder.wavenet.in_layers.1.weight_g', 'flow.flows.2.wavenet.res_skip_layers.1.weight_v', 'posterior_encoder.wavenet.res_skip_layers.12.weight_v', 'flow.flows.1.wavenet.in_layers.3.weight_v', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder.wavenet.in_layers.5.weight_v', 'posterior_encoder.wavenet.in_layers.8.weight_v', 'flow.flows.2.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/mms-tts-hin were not used when initializing VitsModel: ['flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.7.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.5.weight_v', 'posterior_encoder.wavenet.res_skip_layers.8.weight_g', 'flow.flows.1.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.7.weight_v', 'posterior_encoder.wavenet.in_layers.1.weight_g', 'flow.flows.2.wavenet.res_skip_layers.1.weight_v', 'posterior_encoder.wavenet.res_skip_layers.12.weight_v', 'flow.flows.1.wavenet.in_layers.3.weight_v', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder.wavenet.in_layers.5.weight_v', 'posterior_encoder.wavenet.in_layers.8.weight_v', 'flow.flows.2.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder

In [16]:
samples = sentences.sentences

In [17]:
samples['1eng']

"The moment I opened the window, I got to know that it was not a boy but a girl. She can mimic several male actors precisely. It doesn't seem/look like somebody else's voice. When I heard her mimicry for the first time, I was speechless. Ever since she has been in the industry, I'm her fan."

In [27]:
import json
import IPython

print(json.dumps(indent=4, obj=samples))

{
    "1eng": "The moment I opened the window, I got to know that it was not a boy but a girl. She can mimic several male actors precisely. It doesn't seem/look like somebody else's voice. When I heard her mimicry for the first time, I was speechless. Ever since she has been in the industry, I'm her fan.",
    "1hin": "\u0916\u093f\u0921\u093c\u0915\u0940 \u0916\u094b\u0932\u0924\u0947 \u0939\u0940 \u092e\u0941\u091d\u0947 \u092a\u0924\u093e \u091a\u0932 \u0917\u092f\u093e \u0915\u093f \u0935\u094b \u0915\u094b\u0908 \u0932\u0921\u093c\u0915\u093e \u0928\u0939\u0940\u0902 \u092c\u0932\u094d\u0915\u093f \u0932\u0921\u093c\u0915\u0940 \u0939\u0948\u0964 \u0935\u094b \u0915\u0908 \u092a\u0941\u0930\u0942\u0937 \u0905\u092d\u093f\u0928\u0947\u0924\u093e\u0913\u0902 \u0915\u0940 \u0939\u0942\u092c\u0939\u0942 \u0928\u0915\u0932 \u0915\u0930 \u0938\u0915\u0924\u0940 \u0939\u0948\u0964 \u0932\u0917\u0924\u093e \u0939\u0940 \u0928\u0939\u0940\u0902 \u0915\u093f \u0906\u0935\u093e\u091c \u0915\

In [19]:
tok_models = {
    '1eng': [tokenizer1, model1],
    '2eng': [tokenizer1, model1],
    '1hin': [tokenizer2, model2],
    '2hin': [tokenizer2, model2],
}

In [20]:
savepaths = {
    '1eng': 'audio_samples/1eng_tts.wav',
    '2eng': 'audio_samples/2eng_tts.wav',
    '1hin': 'audio_samples/1hin_tts.wav',
    '2hin': 'audio_samples/2hin_tts.wav',
}

In [21]:
for key in samples:
  tok, mod = tok_models[key]
  sample = samples[key]
  savepath = savepaths[key]
  print(sample)
  synthesize_speech(mod, tok, sample, savepath)
  print('sample done!')


The moment I opened the window, I got to know that it was not a boy but a girl. She can mimic several male actors precisely. It doesn't seem/look like somebody else's voice. When I heard her mimicry for the first time, I was speechless. Ever since she has been in the industry, I'm her fan.
sample done!
खिड़की खोलते ही मुझे पता चल गया कि वो कोई लड़का नहीं बल्कि लड़की है। वो कई पुरूष अभिनेताओं की हूबहू नकल कर सकती है। लगता ही नहीं कि आवाज किसी और की है। जब मैंने पहली बार उसकी मिमिक्री सुनी, तो मैं पागल हो गया। जब से वो इंडस्ट्री में है, मैं उसका फैन हूं।
sample done!
Autonomy in academic matters does not mean that universities should be oblivious of special need. In fact, universities are set up for the satisfaction of certain felt needs of society and they have to be fully sensitive and responsive to them.
sample done!
शिक्षा में स्वायत्ता का अर्थ यह नहीं है कि विश्विद्यालय विशिष्ट आवश्यकताओं के प्रति ध्यान ही न दें।  वस्तुतः विश्विद्यालयों की स्थापना समाज की कुछ आवश्यकताओं को पूरा करने

In [28]:
IPython.display.Audio(savepaths[list(savepaths.keys())[0]])

In [29]:
IPython.display.Audio(savepaths[list(savepaths.keys())[1]])

In [31]:
IPython.display.Audio(savepaths[list(savepaths.keys())[2]])

In [30]:
IPython.display.Audio(savepaths[list(savepaths.keys())[3]])

# Task 3 - MMS-ASR models - task1 transcription performance (CER, WER for recording vs generated audio transcriptions, comparison between English and Native)

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Installing collected 

### Audio and Transcription files
I have uploaded the audio files after conversio to the required `.wav` format on my github, along with their transcriptions.

In [3]:
# !ls speech-understanding/assignnments/PA1/recordings/converted

%cd /content
!rm -rf /content/audio_samples
!rm -rf /content/audio_sentences
!rm -rf speech-understanding/
!mkdir /content/audio_samples
!mkdir /content/audio_sentences
!git clone https://github.com/shrivastava95/speech-understanding.git
!cp -r speech-understanding/assignnments/PA1/recordings/converted/*.wav /content/audio_samples
!cp -r speech-understanding/assignnments/PA1/sentences/sentences.py /content/audio_sentences/
!ls /content/audio_samples

/content
Cloning into 'speech-understanding'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 45 (delta 5), reused 40 (delta 3), pack-reused 0[K
Receiving objects: 100% (45/45), 5.88 MiB | 19.11 MiB/s, done.
Resolving deltas: 100% (5/5), done.
1eng.wav  2eng.wav  eng1_experiment1_google.wav  eng2_experiment1_google.wav
1hin.wav  2hin.wav  eng1_experiment1_ishaan.wav  eng2_experiment1_ishaan.wav


In [4]:
######## experiment1: reducing the lenght of the audio recordings to 6 seconds each to see effects on the confidence scores
!ffmpeg -i /content/audio_samples/1eng.wav -t 00:00:06 -c copy /content/audio_samples/1eng_cropped.wav
!ffmpeg -i /content/audio_samples/1hin.wav -t 00:00:06 -c copy /content/audio_samples/1hin_cropped.wav
!ffmpeg -i /content/audio_samples/2eng.wav -t 00:00:06 -c copy /content/audio_samples/2eng_cropped.wav
!ffmpeg -i /content/audio_samples/2hin.wav -t 00:00:06 -c copy /content/audio_samples/2hin_cropped.wav

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

### Loading ASR model: MMS-FL102
Facebook MMS ASR model capable of speech recognition across 102 languages. Uses an adapter for changing the language settings. Trained on FLEURS dataset.

In [5]:
from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch

model_id = 'facebook/mms-1b-fl102'
# model_id = 'facebook/mms-1b-l1107'
# model_id = 'facebook/mms-1b-all'

processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/351k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/mms-1b-fl102 were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-fl102 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-st

In [6]:
!pip install jiwer

import jiwer
from tqdm import tqdm

def load_audio_dataset(wav_paths):
  data = Dataset.from_dict({"audio": wav_paths}).cast_column("audio", Audio(sampling_rate=16000))
  return data

def change_lang(model, processor, lang='eng'):
  processor.tokenizer.set_target_lang(lang)
  model.load_adapter(lang)

def wer(reference, hypothesis):
  return jiwer.wer(reference, hypothesis)

def cer(reference, hypothesis):
  reference = ' '.join([ri for ri in reference.replace(' ', '')])
  hypothesis = ' '.join([hi for hi in hypothesis.replace(' ', '')])
  return jiwer.wer(reference, hypothesis)

def inference_asr(model, processor, wav_paths):
  audio_dataset = load_audio_dataset(wav_paths)
  inferences = {}
  for i, audio_sample in tqdm(list(enumerate(audio_dataset))):
    inputs = processor(audio_sample['audio']['array'], sampling_rate=16_000, return_sensors="pt")
    with torch.no_grad():
      outputs = model(**inputs).logits
    ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(ids)
    inferences[wav_paths[i]] = transcription
  return inferences

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.6.1


In [10]:
from datasets import load_dataset, Audio, Dataset
import os
import json
from audio_sentences import sentences

savepaths = {
    '1eng': 'audio_samples/1eng_tts.wav',
    '2eng': 'audio_samples/2eng_tts.wav',
    '1hin': 'audio_samples/1hin_tts.wav',
    '2hin': 'audio_samples/2hin_tts.wav',
}

parent_audio_dir = '/content/audio_samples'
audio_paths_exp0 = [
    os.path.join(parent_audio_dir, apath)
    for apath in os.listdir(parent_audio_dir)
    if len(apath) == 8 and apath.endswith('.wav')
]
audio_paths_tts = [
    savepaths[key] for key in savepaths
]
audio_paths_exp1 = [
    os.path.join(parent_audio_dir, apath)
    for apath in os.listdir(parent_audio_dir)
    if 'cropped' in apath and apath.endswith('.wav')
]
audio_paths_exp2 = [
    os.path.join(parent_audio_dir, apath)
    for apath in os.listdir(parent_audio_dir)
    if 'experiment' in apath and apath.endswith('.wav')
]
print('standard audio samples:', json.dumps(indent=4, obj=audio_paths_exp0))
print('tts audio samples:', json.dumps(indent=4, obj=audio_paths_tts))
print('experiment 1:', json.dumps(indent=4, obj=audio_paths_exp1))
print('experiment 2:', json.dumps(indent=4, obj=audio_paths_exp2))

standard audio samples: [
    "/content/audio_samples/2hin.wav",
    "/content/audio_samples/2eng.wav",
    "/content/audio_samples/1hin.wav",
    "/content/audio_samples/1eng.wav"
]
tts audio samples: [
    "audio_samples/1eng_tts.wav",
    "audio_samples/2eng_tts.wav",
    "audio_samples/1hin_tts.wav",
    "audio_samples/2hin_tts.wav"
]
experiment 1: [
    "/content/audio_samples/1hin_cropped.wav",
    "/content/audio_samples/1eng_cropped.wav",
    "/content/audio_samples/2hin_cropped.wav",
    "/content/audio_samples/2eng_cropped.wav"
]
experiment 2: [
    "/content/audio_samples/eng2_experiment1_ishaan.wav",
    "/content/audio_samples/eng1_experiment1_google.wav",
    "/content/audio_samples/eng1_experiment1_ishaan.wav",
    "/content/audio_samples/eng2_experiment1_google.wav"
]


### ASR for 4 original and 4 tts sentence files

In [8]:
inferences_original = {}
for i, audio_sample in tqdm(list(enumerate(load_audio_dataset(audio_paths_exp0)))):
  if 'eng' in audio_paths_exp0[i]:
    lang = 'eng'
  else:
    lang = 'hin'
  change_lang(model, processor, lang)
  inputs = processor(audio_sample['audio']['array'], sampling_rate=16000, return_tensors='pt')
  with torch.no_grad():
      output = model(**inputs).logits
  ids = torch.argmax(output, dim=-1)[0]
  transcription = processor.decode(ids)
  inferences_original[audio_paths_exp0[i]] = transcription

  0%|          | 0/4 [00:00<?, ?it/s]

adapter.hin.safetensors:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

 25%|██▌       | 1/4 [00:40<02:00, 40.30s/it]

adapter.eng.safetensors:   0%|          | 0.00/9.04M [00:00<?, ?B/s]

100%|██████████| 4/4 [02:14<00:00, 33.71s/it]


In [32]:
inferences_tts = {}
for i, audio_sample in tqdm(list(enumerate(load_audio_dataset(audio_paths_tts)))):
  if 'eng' in audio_paths_tts[i]:
    lang = 'eng'
  else:
    lang = 'hin'
  change_lang(model, processor, lang)
  inputs = processor(audio_sample['audio']['array'], sampling_rate=16000, return_tensors='pt')
  with torch.no_grad():
      output = model(**inputs).logits
  ids = torch.argmax(output, dim=-1)[0]
  transcription = processor.decode(ids)
  inferences_tts[audio_paths_tts[i]] = transcription

100%|██████████| 4/4 [02:18<00:00, 34.66s/it]


In [33]:
references_original = [
    sentences.sentences['2hin'],
    sentences.sentences['2eng'],
    sentences.sentences['1hin'],
    sentences.sentences['1eng'],
]
references_tts = [
    sentences.sentences['1eng'],
    sentences.sentences['2eng'],
    sentences.sentences['1hin'],
    sentences.sentences['2hin'],
]

In [34]:
for i, key in list(enumerate(inferences_original.keys())):
  hypothesis = inferences_original[key]
  gt = references_original[i]
  cer_score = cer(gt, hypothesis)
  wer_score = wer(gt, hypothesis)
  print(f'{audio_paths_exp0[i]}:')
  print('reference: ', gt)
  print('predicted: ', hypothesis)
  print('cer score:', cer_score)
  print('wer score:', wer_score)
  print()

/content/audio_samples/2hin.wav:
reference:  शिक्षा में स्वायत्ता का अर्थ यह नहीं है कि विश्विद्यालय विशिष्ट आवश्यकताओं के प्रति ध्यान ही न दें।  वस्तुतः विश्विद्यालयों की स्थापना समाज की कुछ आवश्यकताओं को पूरा करने के लिए हुई है और इन्हें इन आवश्यकताओं को पूरा करने के लिए सजग रहना चाहिए।
predicted:  शिक्षा में स्वायत्ता का अर्थ यह नहीं है कि विद्यालय विशिष्ट आवश्यकताओं के प्रतिध्यान ही न दें वस्तुत विद्यालयों की सुथापना समाझ की कुछ आवश्यकताओं को पूरा करने के लिए हुई है और इन्हें इन आवश्यकताओं को पूरा करने के लिए सजक रहना चाहिए
cer score: 0.07035175879396985
wer score: 0.2222222222222222

/content/audio_samples/2eng.wav:
reference:  Autonomy in academic matters does not mean that universities should be oblivious of special need. In fact, universities are set up for the satisfaction of certain felt needs of society and they have to be fully sensitive and responsive to them.
predicted:  autonomy and academic maters doesnot mean that universities should be obsevious of special need in fac

In [35]:
for i, key in list(enumerate(inferences_tts.keys())):
  hypothesis = inferences_tts[key]
  gt = references_tts[i]
  cer_score = cer(gt, hypothesis)
  wer_score = wer(gt, hypothesis)
  print(f'{audio_paths_tts[i]}:')
  print('reference: ', gt)
  print('predicted: ', hypothesis)
  print('cer score:', cer_score)
  print('wer score:', wer_score)
  print()

audio_samples/1eng_tts.wav:
reference:  The moment I opened the window, I got to know that it was not a boy but a girl. She can mimic several male actors precisely. It doesn't seem/look like somebody else's voice. When I heard her mimicry for the first time, I was speechless. Ever since she has been in the industry, I'm her fan.
predicted:  the moment i opened the window i got to know that it was not a boy but a gorl she can mimit several mail actors precisely it doesn't see look like somebody els voice when i herd her mintot cry for the first time i was speachlous heve her since she has been in the industry i am her fan
cer score: 0.16170212765957448
wer score: 0.5

audio_samples/2eng_tts.wav:
reference:  Autonomy in academic matters does not mean that universities should be oblivious of special need. In fact, universities are set up for the satisfaction of certain felt needs of society and they have to be fully sensitive and responsive to them.
predicted:  that onmy necademi matters 