### **Sivakumar M (20MIA1002) VELLORE INSTITUTE OF TECHNOLOGY - CHENNAI**

## **Objective:** To build a Multilingual Speech Recognition Model for RAG without Training

# Langchain Dependencies


In [1]:
# Install the LangChain packages, the Chroma and Groq integrations, and
# sentence-transformers into the running kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases.
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-chroma langchain-groq sentence-transformers

In [2]:
import getpass
import os

In [3]:
# Turn on LangChain tracing for the runs executed in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Keep the API key out of the notebook file: reuse a key that is already
# exported in the environment, otherwise ask for it without echoing the input.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")

In [4]:
# import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Initializing LLM

In [5]:
from groq import Groq
from langchain_groq import ChatGroq

# Keep the Groq key out of the notebook file: take it from the environment,
# prompting once (without echo) when it has not been exported yet.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

# Chat model used as the generator of the RAG chain; temperature=0 asks for the
# least random decoding.
llm = ChatGroq(
    temperature=0,
    model_name="mixtral-8x7b-32768",
    api_key=os.environ["GROQ_API_KEY"],
)

# Dataset Loading



In [6]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader # To use UnstructuredLoader
# Loader restricted to the single sample file under /content/; every matching
# file is read with TextLoader.
loader = DirectoryLoader(
    "/content/",
    glob="rag sample.txt",
    loader_cls=TextLoader,
)

In [7]:
docs = loader.load() # each loaded document carries its contents as a string and its metadata as a dictionary

# Splitting

In [8]:
# Cut the loaded documents into overlapping chunks (size 1000, overlap 200) and
# ask the splitter to record where each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)


# Storing

In [9]:
# Sentence-embedding model executed on the CPU; embeddings are left un-normalised.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)
# Embed every chunk and index the vectors in a Chroma store.
vectorstore = Chroma.from_documents(documents=splits, embedding=embed_model)


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Retrieving

In [10]:
# Expose the Chroma vector store as a retriever (no search options are overridden here).
retriever = vectorstore.as_retriever()
# Download the "rlm/rag-prompt" prompt from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

In [11]:
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line, in input order
        (an empty string when ``docs`` is empty).
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# The chain input (the question) feeds two branches: the retriever, whose hits
# are flattened by format_docs into "context", and a passthrough that forwards
# the question unchanged as "question".
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
# Fill the prompt, call the LLM, then parse the model output into a plain string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# Multilingual Query

In [12]:
!pip install transformers==4.40.0 accelerate datasets[audio]



In [13]:
# !pip install git+https://github.com/huggingface/accelerate

In [14]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import accelerate

In [15]:
# Run on the first GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in single precision.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

In [16]:
model_id = "openai/whisper-large-v3"
# Checkpoint-loading options: the dtype chosen for this device, the
# low_cpu_mem_usage flag, and safetensors weights.
load_options = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_options)
model.to(device)
# The processor supplies the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# Options for the speech-recognition pipeline built around the loaded model.
asr_options = dict(
    # Model components and placement.
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    # Decoding and batching settings.
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
)
pipe = pipeline("automatic-speech-recognition", **asr_options)

In [26]:
# Run the French audio question through the ASR pipeline with the "translate" task.
result_fr = pipe("/content/question french alonso.mp3",generate_kwargs={"task": "translate"})

In [27]:
# Only the "text" entry of the pipeline result is used downstream.
question_fr = result_fr["text"]

In [28]:
question_fr # Translated text produced from the French audio question

' Why did Alonso get fired?'

In [31]:
response_fr = rag_chain.invoke(question_fr) # Run the translated question through the RAG chain to get the generated answer

In [32]:
response_fr # Answer text returned by the RAG chain

"Alonso was fired from Marvel Studios in March 2023 due to a breach of contract by serving as a producer on an Amazon Studios film, according to Disney. However, Alonso's lawyers refuted these claims, stating that Disney was aware of and agreed to her work on the film. The firing might have also been due to a disagreement with a Disney executive over censoring gay pride elements in a film for release in Kuwait. Alonso and Disney reached a multi-million dollar compensation settlement in April 2023."

# Evaluation - French


In [23]:
pip install rag-evaluator

Collecting rag-evaluator
  Downloading rag_evaluator-0.1.0-py3-none-any.whl (3.6 kB)
Collecting sacrebleu (from rag-evaluator)
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score (from rag-evaluator)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score (from rag-evaluator)
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu->rag-evaluator)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu->rag-evaluator)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?

In [24]:
from rag_evaluator import RAGEvaluator
# Ground truth
reference_fr = "Alonso was fired from her role at Marvel Studios in March 2023 for serving as a producer on the Amazon Studios film Argentina, 1985 (2022), in breach of a 2018 agreement between Alonso and Disney to not work for a competing studio."

# This evaluator instance is reused by the German and Hindi evaluation cells.
evaluator = RAGEvaluator()

# Score the generated answer against the reference for the translated question.
metrics_fr = evaluator.evaluate_all(question_fr, response_fr, reference_fr)



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [25]:
# Print every French metric as "name: value", one per line.
for metric_name, metric_value in metrics_fr.items():
    print(f"{metric_name}: {metric_value}")

BLEU: 14.14472625743315
ROUGE-1: 0.4615384615384615
BERT P: 0.732024610042572
BERT R: 0.7911508083343506
BERT F1: 0.7604401111602783
Perplexity: 32.929473876953125
Diversity: 0.9651162790697675
Racial Bias: 0.49148473143577576


# Evaluation - German

In [33]:
# Run the German audio question through the ASR pipeline with the "translate" task.
result_gr = pipe("/content/question german committe.mp3",generate_kwargs={"task": "translate"})

In [34]:
# Only the "text" entry of the pipeline result is used downstream.
question_gr = result_gr["text"]

In [35]:
question_gr # Translated text produced from the German audio question

' Who are the six members of the Marvel Committee?'

In [36]:
response_gr = rag_chain.invoke(question_gr) # Run the translated question through the RAG chain to get the generated answer

In [37]:
response_gr # Answer text returned by the RAG chain

"The six members of the Marvel creative committee are Kevin Feige, Louis D'Esposito, Dan Buckley, Joe Quesada, Brian Michael Bendis, and Alan Fine. They were tasked with making key film decisions and preserving the artistic integrity of the Marvel Cinematic Universe. The committee was formed in 2007 and consisted of individuals familiar with Marvel's comic book lore."

In [40]:
# Ground truth
reference_gr = "Kevin Feige, Louis D'Esposito, Dan Buckley, Joe Quesada, Brian Michael Bendis, and Alan Fine who oversaw the committee."

# Score the generated answer against the reference, then print every metric as
# "name: value", one per line.
metrics_gr = evaluator.evaluate_all(question_gr, response_gr, reference_gr)
for metric_name, metric_value in metrics_gr.items():
    print(f"{metric_name}: {metric_value}")



BLEU: 28.53380113326834
ROUGE-1: 0.43589743589743596
BERT P: 0.7232732772827148
BERT R: 0.8609854578971863
BERT F1: 0.786143958568573
Perplexity: 13.619458198547363
Diversity: 0.9473684210526315
Racial Bias: 0.474592000246048


# Evaluation - Hindi

In [57]:
# Run the Hindi audio question through the ASR pipeline with the "translate" task.
result_hin = pipe("/content/question hindi 2009.mp3",generate_kwargs={"task": "translate"})

In [58]:
# Only the "text" entry of the pipeline result is used downstream.
question_hin = result_hin["text"]

In [59]:
question_hin # Translated text produced from the Hindi audio question

' Who bought Marvel in 2009?'

In [60]:
response_hin = rag_chain.invoke(question_hin) # Run the translated question through the RAG chain to get the generated answer

In [61]:
response_hin # Answer text returned by the RAG chain

'The context does not explicitly state who bought Marvel in 2009, but it does mention that the Walt Disney Company purchased Marvel Entertainment for $4 billion in December 2009.'

In [62]:
# Ground truth
reference_hin = "In December 2009, the Walt Disney Company purchased Marvel Entertainment for $4 billion.."

# Score the generated answer against the reference, then print every metric as
# "name: value", one per line.
metrics_hin = evaluator.evaluate_all(question_hin, response_hin, reference_hin)
for metric_name, metric_value in metrics_hin.items():
    print(f"{metric_name}: {metric_value}")



BLEU: 35.00608308203112
ROUGE-1: 0.6190476190476191
BERT P: 0.7778234481811523
BERT R: 0.8714852333068848
BERT F1: 0.8219948410987854
Perplexity: 23.759910583496094
Diversity: 0.9655172413793104
Racial Bias: 0.4822182357311249
