In [None]:
!pip install gradio
!pip install git+https://github.com/huggingface/parler-tts.git

Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:

import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
import torch
import soundfile as sf
import os
from transformers import pipeline


# # Initialize Whisper model
# whisper_processor = WhisperProcessor.from_pretrained("yash072/Whisper-small-finetuned-hindi")
# whisper_model = WhisperForConditionalGeneration.from_pretrained("yash072/Whisper-small-finetuned-hindi")
# print("whisper loader for STT")


## loading wishper model
model_id = "yash072/Whisper_Smal_FineTuned_Hindi"  # update with your model id
pipe = pipeline("automatic-speech-recognition", model=model_id)


# Initialize IndicBERT model
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert")
indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
print("indic bert loader for Text to text ")
# Initialize Parler TTS
indic_parler_tts = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
indic_parler_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(indic_parler_tts.config.text_encoder._name_or_path)
print("text to speech")

# Device setup
device = "cuda:0" if torch.cuda.is_available() else "cpu"
indic_parler_tts = indic_parler_tts.to(device)

# def speech_to_text(audio_input):
#     # Convert speech to text using Whisper
#     audio_input, _ = sf.read(audio_input)  # Read the audio file
#     input_features = whisper_processor(audio_input, return_tensors="pt").input_features
#     predicted_ids = whisper_model.generate(input_features)
#     text = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True)
#     return text

def transcribe_speech(filepath):
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "hi",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]

def process_query(text):
    # Use IndicBERT to process the text query
    inputs = indicbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = indicbert_model(**inputs)
    response = torch.argmax(outputs.logits, dim=1).item()  # Get the most probable response class
    response_text = f"Predicted class: {response}"  # Placeholder response
    return response_text

def text_to_speech(response_text):
    # Convert text to speech using Indic Parler TTS
    prompt = response_text
    description = "A female speaker delivers a clear, natural tone in moderate speed and pitch."
    description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
    prompt_input_ids = indic_parler_tokenizer(prompt, return_tensors="pt").to(device)

    generation = indic_parler_tts.generate(
        input_ids=description_input_ids.input_ids,
        attention_mask=description_input_ids.attention_mask,
        prompt_input_ids=prompt_input_ids.input_ids,
        prompt_attention_mask=prompt_input_ids.attention_mask
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # Save the output audio to disk
    output_file = "tts_output.wav"
    sf.write(output_file, audio_arr, indic_parler_tts.config.sampling_rate)

    return output_file

# Full pipeline function
def multimodal_pipeline(audio_input):
    # Step 1: Convert speech to text
    text_query = transcribe_speech(audio_input)

    # Step 2: Process the text query to get a response
    response_text = process_query(text_query)

    # Step 3: Convert the response text to speech and save audio
    audio_response_path = text_to_speech(response_text)

    return text_query, response_text, audio_response_path

# Create a Gradio interface for the multimodal pipeline
iface = gr.Interface(
    fn=multimodal_pipeline,
    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="LLM Response"),
        gr.Audio(label="Generated Audio Response")
    ],
    title="E-commerce Query Solution",
    description="Upload or record an audio query in Hindi to get a spoken response. Intermediate outputs such as transcribed text and LLM response are displayed."
)

# Launch the Gradio interface
iface.launch()


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


indic bert loader for Text to text 


  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_dim": 8,
  "codebook_loss_weight": 1.0,
  "codebook_size": 1024,
  "commitment_loss_weight": 0.25,
  "decoder_hidden_si

text to speech
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ec3215c85ff775ae2b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
text_query = speech_to_text("/content/indic_tts_out2.wav")

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [None]:
text_query


'अरे, पिल्याच के साई पिल्याडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाडियाड'

In [None]:

from transformers import pipeline

model_id = "yash072/Whisper_Smal_FineTuned_Hindi"  # update with your model id
pipe = pipeline("automatic-speech-recognition", model=model_id)

config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.86k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
def transcribe_speech(filepath):
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "hi",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]

In [None]:
text = transcribe_speech("/content/indic_tts_out2.wav")

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


In [None]:
print(text)

अरे, तुम आच कैसे हो तुम्रे नाम क्या है?


In [None]:
pip install git+https://github.com/huggingface/parler-tts.git

Collecting git+https://github.com/huggingface/parler-tts.git
  Cloning https://github.com/huggingface/parler-tts.git to /tmp/pip-req-build-aa4o2nfn
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/parler-tts.git /tmp/pip-req-build-aa4o2nfn
  Resolved https://github.com/huggingface/parler-tts.git to commit d108732cd57788ec86bc857d99a6cabd66663d68
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting descript-audiotools@ git+https://github.com/descriptinc/audiotools (from parler_tts==0.2.2)
  Cloning https://github.com/descriptinc/audiotools to /tmp/pip-install-ukr3_s_e/descript-audiotools_e99b5d8a19ba4cd28a84cffb37184197
  Running command git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools /tmp/pip-install-ukr3_s_e/descript-audiotools_e99b5d8a19ba4cd28a84cffb37184197
  Resolved https://github.com/d

In [None]:
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import soundfile as sf
import os

# Get your Hugging Face user access token
# Visit https://huggingface.co/settings/tokens to create one if you don't have it
# Replace 'YOUR_TOKEN' with your actual token
token = os.environ.get("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")

# Load the Whisper model for Speech-to-Text (Hindi)
whisper_processor = WhisperProcessor.from_pretrained("yash072/Whisper-small-finetuned-hindi")
whisper_model = WhisperForConditionalGeneration.from_pretrained("yash072/Whisper-small-finetuned-hindi")

# Load AI4Bharat IndicBERT model for Query Processing (Ensure model supports Hindi)
indicbert_model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert")
indicbert_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

# Load TTS model (Indic Parler)
indic_parler_tts = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indic-parler-tts")
indic_parler_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

def speech_to_text(audio_input):
    # Convert speech to text using Whisper (Force Hindi processing)
    audio_input, _ = sf.read(audio_input)  # Read the audio file
    input_features = whisper_processor(audio_input, return_tensors="pt").input_features
    predicted_ids = whisper_model.generate(input_features)
    text = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True)
    return text

def process_query(text):
    # Ensure the input text for IndicBERT is in Hindi and the output is in Hindi
    inputs = indicbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = indicbert_model(**inputs)
    # Instead of predicting classes, we process the response text directly
    response = "यह आपकी क्वेरी का उत्तर है: " + text  # Placeholder for actual processing logic
    return response

def text_to_speech(response_text):
    # Convert the response text back to speech using Indic Parler TTS
    input_ids = indic_parler_tokenizer(response_text, return_tensors="pt").input_ids
    audio_output = indic_parler_tts.generate(input_ids)
    audio = audio_output.squeeze(0).numpy()  # Convert to numpy array
    return audio

# Full pipeline function
def multimodal_pipeline(audio_input):
    # Step 1: Convert speech to text (in Hindi)
    text_query = speech_to_text(audio_input)

    # Step 2: Process the text query to get a response (ensure response is in Hindi)
    response_text = process_query(text_query)

    # Step 3: Convert the response text to speech
    audio_response = text_to_speech(response_text)

    return audio_response

# Create a Gradio interface for the multimodal pipeline
iface = gr.Interface(
    fn=multimodal_pipeline,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(type="numpy"),
    live=True,
    title="E-commerce Query Solution",
    description="Speak a query in Hindi and get a spoken response in Hindi.",
)

# Launch the Gradio interface
iface.launch()


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: The checkpoint you are trying to load has model type `parler_tts` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.