In [1]:
import torch
import transformers
import IPython
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import umap
import pandas as pd
import numpy as np
import umap.plot
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base", torch_dtype=torch.bfloat16).to("cuda")

In [3]:
ds = load_dataset("google/fleurs", "pt_br", split="train", streaming=True)
noise_ds = load_dataset("Myrtle/CAIMAN-ASR-BackgroundNoise")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Explore the whisper encoder embedding space 

### Explore the difference between noise and noise by whisper's UMAP reduced perception

In [4]:
# select 10 samples of noise and 10 samples of voice
n_samples = 100 
voice_meta_df = []
# voice_samples_dict = {}
keys_to_extract = ["transcription", "raw_transcription", "gender", "language"]

for i, j in zip(range(n_samples), ds):
    if i >= n_samples:
        break

    subset_dict = pd.DataFrame(dict((key, [j[key]]) for key in keys_to_extract))
    subset_dict["sample_id"] = i
    # voice_samples_dict[i] = j["audio"]["array"]
    subset_dict["array"] = [j["audio"]["array"]]
    voice_meta_df.append(subset_dict)

voice_meta_df = pd.concat(voice_meta_df)
voice_meta_df["type"] = "voice"

In [5]:
noise_samples = noise_ds["train"][:n_samples]
# noise_samples = [i["array"] for i in noise_samples["audio"]]
noise_df = pd.DataFrame({"type" : ["noise"] * n_samples, 
                         "sample_id" : list(range(n_samples,n_samples + n_samples)), 
                         "array": [i["array"] for i in noise_samples["audio"]]})
meta_df_all = pd.concat([voice_meta_df, noise_df]).reset_index()

In [6]:
def get_output_batch(features, model):
    features = features.to(device="cuda", dtype=torch.bfloat16)
    output = model.model.encoder(**features, output_hidden_states=True).last_hidden_state.to("cpu", dtype=torch.float16).detach().numpy()
    features = features.to(device="cpu", dtype=torch.bfloat16)
    torch.cuda.empty_cache()

    return output

def get_output_batch_per_batch(feat_array, model, batch_size=50):
    n_batch = int(np.ceil(len(feat_array) / batch_size))
    all_features = []
    
    for batch in list(range(n_batch)):
        start_batch = int(batch * batch_size)
        end_batch = int((batch + 1) * batch_size)
        batch_feat = feat_array[start_batch: end_batch]
        batch_feat = processor(batch_feat, sampling_rate=16000, return_tensors="pt", return_attention_mask=True)
        all_features.append(get_output_batch(batch_feat, model))
    
    all_embeddings = np.concatenate(all_features, axis=0)
    return all_embeddings


In [7]:
all_embeddings = get_output_batch_per_batch(meta_df_all["array"].to_list(), model)
all_embeddings = all_embeddings.reshape((all_embeddings.shape[0], int(all_embeddings.shape[1] * all_embeddings.shape[2])))

### Create UMAP object with all embeddings
Now we will create a UMAP object to reduce the dimmension of our embeddings to 2, this way we will be able to cluster the embeddings and visualize the clusters in a easy way.

In [None]:
# Create UMAP obejct
fit = umap.UMAP()

# Fits based in all embeddings
u = umap.UMAP().fit(all_embeddings)
umap.plot.output_notebook()

# Select columns to show as hover data
cols_to_show = ['transcription', 'raw_transcription', 'gender', 'language', 'sample_id', 'type']
p = umap.plot.interactive(u, hover_data=meta_df_all[cols_to_show], labels=meta_df_all["type"], point_size=10)
umap.plot.show(p)




#### Result analysis
Some voice samples ended up being inside the noise embedding cluster, which could mean that this voice sample has noise inside it or whisper encoder is not identifying it correctly. To test the both assumptions we will listen the audios and do an inference with whisper to check the performance. 

In [66]:
sample_id_check_list = [6, 16, 61, 53, 27, 96, 54]
samples_dict = meta_df_all.loc[meta_df_all.sample_id.isin(sample_id_check_list)]
samples_dict

Unnamed: 0,index,transcription,raw_transcription,gender,language,sample_id,array,type
6,0,hangeul é o único alfabeto inventado de propós...,Hangeul é o único alfabeto inventado de propós...,1.0,Portuguese,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice
16,0,a rm se baseia no fenômeno físico chamado ress...,A RM se baseia no fenômeno físico chamado ress...,1.0,Portuguese,16,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.9604644775390...",voice
27,0,a civilização do vale do indo foi uma civiliza...,A Civilização do Vale do Indo foi uma civiliza...,1.0,Portuguese,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice
53,0,raymond damadian médico e cientista pesquisado...,"Raymond Damadian, médico e cientista pesquisad...",1.0,Portuguese,53,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice
54,0,a primeira-ministra em exercício julia gillard...,A Primeira-Ministra em exercício Julia Gillard...,1.0,Portuguese,54,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice
61,0,o canionismo combina elementos de natação esca...,"O canionismo combina elementos de natação, esc...",1.0,Portuguese,61,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice
96,0,já comunicando em inglês e japonês a organizaç...,"Já comunicando em inglês e japonês, a organiza...",1.0,Portuguese,96,"[0.0, -5.960464477539063e-08, -2.3841857910156...",voice


In [67]:
IPython.display.Audio(samples_dict["array"].iloc[1], rate=16000)

In [68]:
# load model and processor
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files

input_features = processor(samples_dict["array"].to_list(), sampling_rate=16000, return_tensors="pt").input_features 
input_features = input_features.to(device="cuda", dtype=torch.bfloat16)
# generate token ids
predicted_ids = model.generate(input_features).to("cuda")
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
torch.cuda.empty_cache()
samples_dict["whisper_transcription"] = transcription


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_dict["whisper_transcription"] = transcription


In [69]:
samples_dict

Unnamed: 0,index,transcription,raw_transcription,gender,language,sample_id,array,type,whisper_transcription
6,0,hangeul é o único alfabeto inventado de propós...,Hangeul é o único alfabeto inventado de propós...,1.0,Portuguese,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice,R.G.U. é um único alfabeto inventado de propó...
16,0,a rm se baseia no fenômeno físico chamado ress...,A RM se baseia no fenômeno físico chamado ress...,1.0,Portuguese,16,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.9604644775390...",voice,A R.M. se basinha no fenômeno físico chamado ...
27,0,a civilização do vale do indo foi uma civiliza...,A Civilização do Vale do Indo foi uma civiliza...,1.0,Portuguese,27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice,A civilização do Vale do Indo foi uma civiliz...
53,0,raymond damadian médico e cientista pesquisado...,"Raymond Damadian, médico e cientista pesquisad...",1.0,Portuguese,53,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice,"High Monde da Mádia, Remédico e Sintista Pesq..."
54,0,a primeira-ministra em exercício julia gillard...,A Primeira-Ministra em exercício Julia Gillard...,1.0,Portuguese,54,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice,"A primeira ministra em Execício, Julia, de La..."
61,0,o canionismo combina elementos de natação esca...,"O canionismo combina elementos de natação, esc...",1.0,Portuguese,61,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",voice,"O quenionismo combina elementos de natação, e..."
96,0,já comunicando em inglês e japonês a organizaç...,"Já comunicando em inglês e japonês, a organiza...",1.0,Portuguese,96,"[0.0, -5.960464477539063e-08, -2.3841857910156...",voice,"Já comunicando inglês e japonês, a organizaçã..."


### Check whisper output when subject to only noise 

In [70]:
sample_id_check_list = list(range(100,200,20))
samples_dict = meta_df_all.loc[meta_df_all.sample_id.isin(sample_id_check_list)]
samples_dict

Unnamed: 0,index,transcription,raw_transcription,gender,language,sample_id,array,type
100,0,,,,,100,"[-0.17913818359375, -0.26080322265625, -0.1835...",noise
120,20,,,,,120,"[-0.045440673828125, -0.08660888671875, -0.094...",noise
140,40,,,,,140,"[-0.015228271484375, -0.0252685546875, -0.0219...",noise
160,60,,,,,160,"[-0.260498046875, -0.531768798828125, -0.53912...",noise
180,80,,,,,180,"[0.01220703125, 0.032440185546875, 0.058502197...",noise


In [71]:
# load model and processor
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files

input_features = processor(samples_dict["array"].to_list(), sampling_rate=16000, return_tensors="pt").input_features 
input_features = input_features.to(device="cuda", dtype=torch.bfloat16)
# generate token ids
predicted_ids = model.generate(input_features).to("cuda")
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
torch.cuda.empty_cache()
samples_dict["whisper_transcription"] = transcription

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_dict["whisper_transcription"] = transcription


In [72]:
samples_dict

Unnamed: 0,index,transcription,raw_transcription,gender,language,sample_id,array,type,whisper_transcription
100,0,,,,,100,"[-0.17913818359375, -0.26080322265625, -0.1835...",noise,1 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個 個...
120,20,,,,,120,"[-0.045440673828125, -0.08660888671875, -0.094...",noise,I'm going to go to the other side.
140,40,,,,,140,"[-0.015228271484375, -0.0252685546875, -0.0219...",noise,The
160,60,,,,,160,"[-0.260498046875, -0.531768798828125, -0.53912...",noise,1.5m de rin
180,80,,,,,180,"[0.01220703125, 0.032440185546875, 0.058502197...",noise,... ... ... ... ... ... ... ... ... ... ... ....


Whisper model do not reacts very well when subjected to only noise audio, it generated hallucination transcriptions in 4 of 5 cases.

TODO
- distinguish human voice from noise
- distinghish different human voices
- distinghish different phrases from the same human
- distinghish presense or not of noise
- LoRA whisper encoder to deal better with noise

explore:
- different embeddings
- statistical features