In [None]:
!pip install gradio
!pip install SpeechRecognition

Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
import gradio as gr
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import matplotlib.pyplot as plt
from tempfile import NamedTemporaryFile
import speech_recognition as sr
from huggingface_hub import login

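# Log in to Hugging Face (optional for public models).
# SECURITY: a literal access token used to be pasted here. It has been redacted and
# should be revoked. Read the token from the environment instead, for example:
# login(token=os.environ["HF_TOKEN"])  # requires `import os`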

# Checkpoint that provides both the seq2seq weights and the tokenizer.
MODEL_ID = "vinalal/speech-latex2"

# Inference runs on the CPU; the model is moved there right after loading.
device = torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)
tokenizer = T5Tokenizer.from_pretrained(MODEL_ID)
model.to(device)

# Function to convert speech to text
def recognize_speech(audio, duration=15):
    """Transcribe an audio file to text using SpeechRecognition's ``recognize_google``.

    Args:
        audio: Path to an audio file that ``sr.AudioFile`` can open.
        duration: Maximum number of seconds read from the start of the file.
            Defaults to 15, the limit that used to be hard-coded; ``None``
            is passed through to ``Recognizer.record`` unchanged.

    Returns:
        The transcript returned by ``Recognizer.recognize_google``.

    Raises:
        ValueError: If ``audio`` is ``None`` or empty (nothing was recorded
            or uploaded).
        Exceptions raised by ``speech_recognition`` propagate unchanged.
    """
    if not audio:
        raise ValueError("No audio provided: record or upload a clip first.")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source, duration=duration)
    # The clip is already buffered in audio_data, so the file is closed
    # before the (potentially slow) recognition request is made.
    return recognizer.recognize_google(audio_data)

# Convert recognized text to LaTeX
def generate_latex(input_text):
    """Translate transcribed text into LaTeX source with the module-level T5 model.

    Args:
        input_text: Text to translate; it is truncated to 512 tokens.

    Returns:
        The decoded model output with the placeholder words ``SLASH``,
        ``UNDERSCORE``, ``CAP``, ``LEFTB`` and ``RIGHTB`` replaced by
        ``\\``, ``_``, ``^``, ``{`` and ``}`` respectively.
    """
    # A single sequence needs no padding: padding to 512 tokens only made the
    # encoder process pad positions that the attention mask hides anyway.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    outputs = model.generate(**inputs, max_length=512, pad_token_id=tokenizer.pad_token_id)
    latex_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Map the placeholder words back to LaTeX characters, in the original order.
    placeholder_map = (
        ("SLASH", "\\"),
        ("UNDERSCORE", "_"),
        ("CAP", "^"),
        ("LEFTB", "{"),
        ("RIGHTB", "}"),
    )
    for placeholder, symbol in placeholder_map:
        latex_output = latex_output.replace(placeholder, symbol)
    return latex_output

def render_latex_to_image(latex_code):
    """Render a LaTeX math expression to a PNG file and return the file's path.

    The expression is wrapped in ``$...$`` and drawn as centred text on an
    axis-free matplotlib figure.

    Args:
        latex_code: Math-mode source without the surrounding dollar signs.

    Returns:
        Path of a newly created temporary ``.png`` file. The file is created
        with ``delete=False``, so it is not removed automatically.
    """
    # Reserve a unique path and close the handle immediately, so the file is
    # not held open a second time while matplotlib writes to it.
    with NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
        image_path = tmp_file.name

    fig, ax = plt.subplots()
    try:
        ax.text(0.5, 0.5, f"${latex_code}$", fontsize=20, ha='center', va='center')
        ax.axis("off")
        # Save this figure explicitly instead of pyplot's implicit current figure.
        fig.savefig(image_path, format="png", bbox_inches="tight", pad_inches=0.2)
    finally:
        # Release the figure even when rendering the expression fails.
        plt.close(fig)
    return image_path


# Combined function for Gradio interface
def process_audio(audio):
    """Run the full pipeline for one audio clip: transcript, LaTeX, preview image.

    Args:
        audio: Value passed on to ``recognize_speech``.

    Returns:
        A 3-tuple ``(recognized_text, latex_code, preview_image_path)``.
    """
    transcript = recognize_speech(audio)
    latex_source = generate_latex(transcript)
    return transcript, latex_source, render_latex_to_image(latex_source)

# Gradio interface setup
# Input: the audio clip is handed to process_audio as a file path.
audio_input = gr.Audio(type="filepath")

# Outputs, in the order process_audio returns them.
result_outputs = [
    gr.Textbox(label="Recognized Speech"),       # transcript
    gr.Textbox(label="Generated LaTeX Code"),    # LaTeX source
    gr.Image(label="Compiled LaTeX Preview"),    # rendered preview image
]

interface = gr.Interface(fn=process_audio, inputs=audio_input, outputs=result_outputs)

# Launch the Gradio interface
interface.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://47ff82f7f7b24c3b7e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
import gradio as gr
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import matplotlib.pyplot as plt
from tempfile import NamedTemporaryFile
import speech_recognition as sr
from huggingface_hub import login
import os
import io
from PIL import Image
import numpy as np

# Checkpoint that provides both the seq2seq weights and the tokenizer.
MODEL_ID = "vinalal/speech-latex2"

# Inference runs on the CPU; the model is moved there right after loading.
device = torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)
tokenizer = T5Tokenizer.from_pretrained(MODEL_ID)
model.to(device)

def latex_to_image(latex_expr, output_dir="latex_images"):
    """
    Render a LaTeX math expression to a PNG file and return the file's path.

    The expression is wrapped in ``$...$`` and drawn centred on a white,
    axis-free 12x2 inch figure that is saved at 300 DPI. The file name is the
    first 10 hex digits of the MD5 digest of ``latex_expr``, so the same
    expression always maps to the same file and overwrites it.

    Args:
        latex_expr: Math-mode source without the surrounding dollar signs.
        output_dir: Directory that receives the PNG; created if missing.

    Returns:
        Path of the written PNG file inside ``output_dir``.
    """
    import hashlib

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Name the file after a digest of the expression.
    filename = hashlib.md5(latex_expr.encode()).hexdigest()[:10] + '.png'
    filepath = os.path.join(output_dir, filename)

    # Figure with white background and a single axes covering the whole canvas.
    fig = plt.figure(figsize=(12, 2))
    try:
        fig.patch.set_facecolor('white')
        ax = fig.add_axes([0, 0, 1, 1])
        ax.set_axis_off()

        ax.text(0.5, 0.5, f"${latex_expr}$",
                horizontalalignment='center',
                verticalalignment='center',
                fontsize=20)

        # Write the PNG directly. The earlier round-trip through BytesIO and
        # PIL only re-encoded the same image, and `quality` has no effect on PNG.
        fig.savefig(filepath, format='png', bbox_inches='tight',
                    pad_inches=0.2, transparent=False, dpi=300)
    finally:
        # Close this specific figure, even when rendering fails.
        plt.close(fig)

    return filepath

def recognize_speech(audio, duration=15):
    """Transcribe an audio file to text using SpeechRecognition's ``recognize_google``.

    Args:
        audio: Path to an audio file that ``sr.AudioFile`` can open.
        duration: Maximum number of seconds read from the start of the file.
            Defaults to 15, the limit that used to be hard-coded; ``None``
            is passed through to ``Recognizer.record`` unchanged.

    Returns:
        The transcript returned by ``Recognizer.recognize_google``.

    Raises:
        ValueError: If ``audio`` is ``None`` or empty (nothing was recorded
            or uploaded).
        Exceptions raised by ``speech_recognition`` propagate unchanged.
    """
    if not audio:
        raise ValueError("No audio provided: record or upload a clip first.")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source, duration=duration)
    # The clip is already buffered in audio_data, so the file is closed
    # before the (potentially slow) recognition request is made.
    return recognizer.recognize_google(audio_data)

def generate_latex(input_text):
    """Translate transcribed text into LaTeX source with the module-level T5 model.

    Args:
        input_text: Text to translate; it is truncated to 512 tokens.

    Returns:
        The decoded model output with the placeholder words ``SLASH``,
        ``UNDERSCORE``, ``CAP``, ``LEFTB`` and ``RIGHTB`` replaced by
        ``\\``, ``_``, ``^``, ``{`` and ``}`` respectively.
    """
    # A single sequence needs no padding: padding to 512 tokens only made the
    # encoder process pad positions that the attention mask hides anyway.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    outputs = model.generate(**inputs, max_length=512, pad_token_id=tokenizer.pad_token_id)
    latex_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Map the placeholder words back to LaTeX characters, in the original order.
    placeholder_map = (
        ("SLASH", "\\"),
        ("UNDERSCORE", "_"),
        ("CAP", "^"),
        ("LEFTB", "{"),
        ("RIGHTB", "}"),
    )
    for placeholder, symbol in placeholder_map:
        latex_output = latex_output.replace(placeholder, symbol)
    return latex_output

def process_audio(audio):
    """Run the full pipeline for one audio clip and never raise.

    Args:
        audio: Value passed on to ``recognize_speech``.

    Returns:
        ``(recognized_text, latex_code, preview_image_path)`` on success.
        On any failure, ``(error_message, "", None)``, where the message
        starts with ``"Error"`` and names the exception class.
    """
    try:
        # Recognize speech
        recognized_text = recognize_speech(audio)

        # Generate LaTeX
        latex_code = generate_latex(recognized_text)

        # Generate high-quality image
        preview_image_path = latex_to_image(latex_code)

        return recognized_text, latex_code, preview_image_path
    except Exception as e:
        # Deliberate catch-all: the failure is reported through the first
        # output instead of being raised. str(e) alone can be empty for
        # exceptions raised without a message, which would yield a blank
        # result, so the exception class name is always included.
        detail = str(e)
        error_name = type(e).__name__
        message = f"Error ({error_name}): {detail}" if detail else f"Error ({error_name})"
        return message, "", None

# Create and launch the Gradio interface
# Input: the audio clip is handed to process_audio as a file path.
audio_input = gr.Audio(type="filepath", label="Record or Upload Audio")

# Outputs, in the order process_audio returns them.
result_outputs = [
    gr.Textbox(label="Recognized Speech"),
    gr.Textbox(label="Generated LaTeX Code"),
    gr.Image(label="LaTeX Preview"),
]

interface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=result_outputs,
    title="Speech to LaTeX Converter",
    description="Speak or upload audio to convert mathematical expressions to LaTeX with preview",
    examples=[],  # no example audio files are registered yet
    cache_examples=True,
)

# Start the server only when this code runs as the main module.
if __name__ == "__main__":
    interface.launch(debug=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b8ac88299b082d5dfd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b8ac88299b082d5dfd.gradio.live


In [None]:
from transformers import T5Tokenizer

# Round-trip check: encode a placeholder-style string, inspect the pieces, decode it again.
# NOTE(review): this rebinds the global `tokenizer` (to the speech-latex3 checkpoint),
# which generate_latex in the earlier cells also reads - confirm that is intended.
tokenizer = T5Tokenizer.from_pretrained("vinalal/speech-latex3")
input_text = "SLASHsqrtLEFTB 25 RIGHTB"

# Encode once and take the first (only) sequence of IDs.
tokens = tokenizer(input_text, return_tensors="pt")
input_ids = tokens["input_ids"][0]

# Derive both views of the IDs before reporting anything.
actual_tokens = tokenizer.convert_ids_to_tokens(input_ids)
decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)

# Report the IDs, the token pieces, and the detokenized text.
print("Tokenized Input (IDs):", input_ids.tolist())
print("Actual Tokens:", actual_tokens)
print("\nDetokenized Output:", decoded_text)


Tokenized Input (IDs): [3, 5629, 21337, 7, 1824, 52, 17, 3765, 6245, 279, 944, 3, 27262, 279, 1]
Actual Tokens: ['▁', 'SL', 'ASH', 's', 'q', 'r', 't', 'LE', 'FT', 'B', '▁25', '▁', 'RIGHT', 'B', '</s>']

Detokenized Output: SLASHsqrtLEFTB 25 RIGHTB
