In [None]:
`!pip install --upgrade pip
!pip install --upgrade transformers accelerate datasets[audio]
!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
!pip install gTTS
!pip install streamlit

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets[audio]
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.4.1-cp310-cp310-manylinux

In [None]:
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from gtts import gTTS
import os
from gramformer import Gramformer
import torch

def initialize_model():
    """Build the speech-to-text pipeline used by the notebook.

    Loads the ``distil-whisper/distil-large-v3`` checkpoint and its processor,
    places the model on the GPU (``cuda:0``, float16) when CUDA is available
    and on the CPU (float32) otherwise, and wraps both in an
    ``automatic-speech-recognition`` pipeline limited to 128 new tokens.

    Returns:
        The configured transformers pipeline object.
    """
    checkpoint = "distil-whisper/distil-large-v3"

    # Device and precision are chosen together: half precision only on GPU.
    if torch.cuda.is_available():
        device, torch_dtype = "cuda:0", torch.float16
    else:
        device, torch_dtype = "cpu", torch.float32

    speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        checkpoint,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    speech_model.to(device)

    # The processor bundles the tokenizer and the audio feature extractor.
    speech_processor = AutoProcessor.from_pretrained(checkpoint)

    return pipeline(
        "automatic-speech-recognition",
        model=speech_model,
        tokenizer=speech_processor.tokenizer,
        feature_extractor=speech_processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

def recognize_speech(pipe, audio_file):
    """Transcribe ``audio_file`` by calling ``pipe`` and return the recognized text.

    Args:
        pipe: Callable that takes the audio input and returns a mapping
            containing a ``"text"`` entry.
        audio_file: The audio input handed to ``pipe`` unchanged.

    Returns:
        The value stored under ``"text"`` in the pipeline result.
    """
    return pipe(audio_file)["text"]

def correct_grammar(text):
    """Return a grammar-corrected version of ``text``.

    The Gramformer corrector is built on the first call and then reused: it is
    stored on the function object, so repeated calls do not reload the model.

    Args:
        text: Sentence to correct.

    Returns:
        The first correction candidate, or ``text`` unchanged when the input is
        empty / whitespace-only or the corrector yields no candidate.
    """
    # Nothing to correct: skip loading and running the model.
    if not text or not text.strip():
        return text

    corrector = getattr(correct_grammar, "_corrector", None)
    if corrector is None:
        corrector = Gramformer(models=1, use_gpu=False)  # 1=corrector, 2=detector
        correct_grammar._corrector = corrector

    candidates = corrector.correct(text, max_candidates=1)
    # Take the first candidate of whatever iterable comes back, falling back
    # to the original text when there is none.
    return next(iter(candidates), text)

import IPython.display as ipd

def feedback(audio_file):
    """Transcribe an audio file, correct its grammar, and speak the result.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        Path of the generated speech file (``"feedback_file.mp3"``), or
        ``None`` when any step fails; the error is printed in that case.
    """
    try:
        speech_recognition_model = initialize_model()

        recognized_text = recognize_speech(speech_recognition_model, audio_file)

        corrected_text = correct_grammar(recognized_text)

        # No notebook cell defines `provide_feedback` (it only exists inside the
        # generated app.py), so the speech is synthesized here with gTTS directly.
        feedback_audio_file = "feedback_file.mp3"
        gTTS(text=corrected_text, lang="en", slow=False).save(feedback_audio_file)

        return feedback_audio_file  # Return the path to the audio file
    except Exception as e:
        # Best-effort: report the problem and signal failure explicitly.
        print("An error occurred:", str(e))
        return None

# Example usage: Load an audio file and call the feedback function
audio_file_path = "/content/testing.wav"  # Replace with the path to your audio file
output_audio_file = feedback(audio_file_path)

# Play the output audio file in the notebook.
# feedback() prints the error and returns None when it fails; ipd.Audio(None)
# would raise, so only build the player when a file was actually produced.
if output_audio_file is not None:
    ipd.display(ipd.Audio(output_audio_file))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

[Gramformer] Grammar error correct/highlight model loaded..
[Gramformer] Grammar error correct/highlight model loaded..


In [None]:
%%writefile app.py
import streamlit as st
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from gtts import gTTS
import os
from gramformer import Gramformer
import torch

def _load_asr_pipeline():
    """Load the distil-whisper model and wrap it in a speech-recognition pipeline."""
    # Use the GPU with half precision when CUDA is available, else CPU/float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "distil-whisper/distil-large-v3"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    # The processor supplies the tokenizer and the audio feature extractor.
    processor = AutoProcessor.from_pretrained(model_id)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )


def initialize_model():
    """Return the speech-recognition pipeline, loading it at most once.

    Streamlit re-executes this script on every user interaction, so the
    loader is routed through ``st.cache_resource``: the model is loaded on the
    first call and the cached pipeline object is returned afterwards.

    Returns:
        The ``automatic-speech-recognition`` pipeline.
    """
    return st.cache_resource(_load_asr_pipeline)()

def recognize_speech(pipe, audio_file):
    """Run ``pipe`` on ``audio_file`` and return the transcription text.

    Args:
        pipe: Callable returning a mapping with a ``"text"`` entry.
        audio_file: Audio input passed to ``pipe`` as-is.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    transcription = pipe(audio_file)
    recognized = transcription["text"]
    return recognized

def _load_grammar_corrector():
    """Build the Gramformer correction model."""
    return Gramformer(models=1, use_gpu=False)  # 1=corrector, 2=detector


def correct_grammar(text):
    """Return a grammar-corrected version of ``text``.

    The Gramformer model is obtained through ``st.cache_resource`` so it is
    built once and reused, instead of being reloaded on every call and on
    every Streamlit rerun.

    Args:
        text: Sentence to correct.

    Returns:
        The first correction candidate, or ``text`` unchanged when the input is
        empty / whitespace-only or the corrector yields no candidate.
    """
    # Nothing to correct: skip loading and running the model.
    if not text or not text.strip():
        return text

    corrector = st.cache_resource(_load_grammar_corrector)()
    candidates = corrector.correct(text, max_candidates=1)
    # Take the first candidate of whatever iterable comes back, falling back
    # to the original text when there is none.
    return next(iter(candidates), text)

def provide_feedback(input_text):
    """Grammar-correct ``input_text`` and save it as spoken English audio.

    Args:
        input_text: Text to correct and convert to speech.

    Returns:
        Path of the written audio file, always ``"feedback_file.mp3"``
        (overwritten on every call).
    """
    output_path = "feedback_file.mp3"
    speech = gTTS(text=correct_grammar(input_text), lang='en', slow=False)
    speech.save(output_path)
    return output_path

def main():
    """Streamlit page: upload audio, transcribe it, and play corrected speech."""
    st.title("Audio Feedback System")

    # Initialize the speech recognition model
    speech_recognition_model = initialize_model()

    # File upload for audio
    uploaded_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
    if uploaded_file is None:
        return

    # Persist the upload so the pipeline can read it from disk.
    audio_file = "user_audio_file.mp3"  # Saving the uploaded file
    with open(audio_file, "wb") as f:
        f.write(uploaded_file.getvalue())

    # Recognize speech from the uploaded audio file
    recognized_text = recognize_speech(speech_recognition_model, audio_file)

    # Without any recognized text there is nothing to correct or to speak.
    if not recognized_text or not recognized_text.strip():
        st.warning("No speech could be recognized in the uploaded audio.")
        return

    # provide_feedback() runs the grammar correction itself, so the raw
    # transcription is passed straight in rather than correcting it twice.
    feedback_audio_file = provide_feedback(recognized_text)

    # Display feedback audio
    st.audio(feedback_audio_file)

if __name__ == "__main__":
    main()



Overwriting app.py


In [None]:
# Start the Streamlit app (app.py) in the background (`&`) and run localtunnel
# against port 8501 so the app gets a public URL.
!streamlit run app.py & npx localtunnel --port 8501


[..................] \ fetchMetadata: sill resolveWithNewModule localtunnel@2.0[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.145.224.205:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.349s
your url is: https://small-states-hang.loca.lt
2024-05-07 21:38:23.881857: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 21:38:23.881969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 21:38:24.004860: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS fa

In [None]:

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Standalone transcription run: build the speech-recognition pipeline at
# notebook scope and transcribe one local audio file.

# Use the GPU with half precision when CUDA is available, else CPU/float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Identifier of the pretrained speech-to-text checkpoint to load.
model_id = "distil-whisper/distil-large-v3"

# Load the model weights and move them to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies the tokenizer and feature extractor used below.
processor = AutoProcessor.from_pretrained(model_id)

# Assemble the automatic-speech-recognition pipeline from the parts above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

# NOTE(review): `dataset` is loaded but nothing in this cell reads it - the line
# that took a sample from it is commented out and `sample` is a file path instead.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]
sample = '/content/output_file.mp3'
# Transcribe the file and keep only the recognized text in `test`.
result = pipe(sample)
test = result["text"]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Install gTTS, which the text-to-speech cells below import.
# NOTE(review): the first setup cell of this notebook also installs gTTS.
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.1-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.1


In [None]:
from gtts import gTTS

# Speak the grammar-corrected transcription with gTTS.
# `corrected_text` is never assigned at notebook scope (it only exists as a local
# variable inside functions), so on a fresh kernel this cell raised NameError.
# It is derived here from `test`, the transcription produced by the
# speech-recognition cell above.
# NOTE(review): confirm `test` is the text this cell was meant to speak.
corrected_text = correct_grammar(test)

# text_to_say = "This is a sample piece of text read by GTTS."
text_to_say = corrected_text
language = "en"

gtts_object = gTTS(text = text_to_say,
                  lang = language,
                  slow = False)

# NOTE(review): gTTS output is presumably MP3 data (app.py saves it as .mp3), so
# the .wav extension may be misleading - confirm before relying on it.
gtts_object.save("/content/gtts1.wav")

In [None]:
from gtts import gTTS

# Generate a test recording: synthesize a fixed sentence with gTTS and save it.
# text_to_say = "This is a sample piece of text read by GTTS."
text_to_say = "My mother and father lives are fantastic"
language = "en"

# Build the text-to-speech request (normal speed, English).
gtts_object = gTTS(text = text_to_say,
                  lang = language,
                  slow = False)

# NOTE(review): gTTS output is presumably MP3 data (app.py saves it as .mp3), so
# the .wav extension may be misleading - confirm before relying on it.
gtts_object.save("/content/testing2.wav")

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting datasets[audio]
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_