In [1]:
pip install gradio


Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [2]:
pip install transformers gradio pypdf2


Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [3]:
# Import necessary libraries
from transformers import MarianMTModel, MarianTokenizer
import gradio as gr
from PyPDF2 import PdfReader
from sklearn.metrics import accuracy_score
import difflib
import re

# Hugging Face Hub identifier of the pretrained MarianMT checkpoint.
# The UI defined later in this cell presents it as an English-to-Hindi translator.
model_name = 'Helsinki-NLP/opus-mt-en-hi'

# Load the tokenizer and model once, at cell-execution time, so that every
# call to translate() reuses the same module-level objects. On a fresh
# runtime this downloads the checkpoint files (see the progress bars in the
# cell output).
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def clean_text(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def _token_count(text):
    """Return the number of token ids the module-level tokenizer yields for ``text``."""
    return len(tokenizer(text)["input_ids"])


def _split_into_chunks(text, max_tokens):
    """Greedily pack ``text`` into pieces of at most ``max_tokens`` tokens each.

    Splitting happens at sentence boundaries (whitespace following ``.``,
    ``!`` or ``?``). A single sentence that is itself too long is broken
    into words so it can still be packed; a single word longer than the
    budget is left as-is and will be truncated by the tokenizer.
    """
    pieces = []
    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        if not sentence:
            continue
        if _token_count(sentence) > max_tokens:
            # Run-on sentence: fall back to word granularity.
            pieces.extend(sentence.split())
        else:
            pieces.append(sentence)

    chunks = []
    current = ""
    for piece in pieces:
        candidate = f"{current} {piece}" if current else piece
        # Re-tokenise the joined candidate rather than summing per-piece
        # counts, since tokenisation of joined text need not be additive.
        if _token_count(candidate) <= max_tokens:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = piece
    if current:
        chunks.append(current)
    return chunks


def translate(text, max_tokens=512):
    """Translate ``text`` with the module-level MarianMT model.

    Input that fits within the token budget is translated in a single pass.
    Longer input is split into sentence-aligned chunks that are translated
    one after another and joined with single spaces, so text beyond the
    model's input limit is no longer silently discarded by truncation.

    Args:
        text: Source text to translate.
        max_tokens: Upper bound on tokens per chunk. The effective limit is
            the smaller of this value and ``tokenizer.model_max_length``.

    Returns:
        The translated text as a single string (``""`` if nothing was
        translatable).
    """
    limit = min(max_tokens, tokenizer.model_max_length)

    # Fast path: short input is passed through untouched, so its original
    # whitespace is preserved exactly as the tokenizer would have seen it.
    if _token_count(text) <= limit:
        chunks = [text]
    else:
        chunks = _split_into_chunks(text, limit)

    translated_parts = []
    for chunk in chunks:
        # Calling the tokenizer directly replaces the deprecated
        # prepare_seq2seq_batch(); truncation stays on as a safety net for
        # a single unsplittable over-long word.
        encoded = tokenizer([chunk], return_tensors='pt', truncation=True)
        generated = model.generate(**encoded)
        translated_parts.append(
            tokenizer.decode(generated[0], skip_special_tokens=True)
        )

    return " ".join(translated_parts)

def translate_pdf(pdf_file):
    """
    Extract the text of a PDF, translate it, and report a similarity score.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts for the PDF to read.

    Returns:
        A 3-tuple ``(original_text, translated_text, score_message)``.
        If no text could be extracted, the first element carries an error
        message and the other two are empty strings.
    """
    # Gather the text of every page, skipping pages that yield nothing,
    # and terminate each page's text with a newline.
    page_texts = (page.extract_text() for page in PdfReader(pdf_file).pages)
    raw_text = "".join(content + "\n" for content in page_texts if content)

    if raw_text.strip() == "":
        return "Error: No text extracted from PDF. It may be scanned or empty.", "", ""

    # Normalise whitespace before handing the text to the model.
    source_text = clean_text(raw_text)
    target_text = translate(source_text)

    # NOTE(review): this is the character-level SequenceMatcher ratio between
    # the source text and its translation, reported as a percentage. It
    # measures string overlap, not translation quality.
    matcher = difflib.SequenceMatcher(a=source_text, b=target_text)
    accuracy = matcher.ratio() * 100

    return source_text, target_text, f"Accuracy Score: {accuracy:.2f}%"

def normal_text_translation(text):
    """Translate free-form text and report a similarity score.

    Args:
        text: The text typed by the user.

    Returns:
        A 3-tuple ``(original_text, translated_text, score_message)``.
        For blank input the first element is a prompt asking for text and
        the other two are empty strings.
    """
    if text.strip():
        output = translate(text)
        # Character-level similarity between input and output, as a percentage.
        similarity = difflib.SequenceMatcher(a=text, b=output).ratio()
        return text, output, f"Accuracy Score: {similarity * 100:.2f}%"

    return "Please enter text to translate.", "", ""

def file_translation(pdf_file):
    """Entry point for the file-upload tab.

    Args:
        pdf_file: The uploaded file as provided by the UI; a falsy value
            means nothing was uploaded.

    Returns:
        The 3-tuple produced by ``translate_pdf`` when a file is given;
        otherwise two empty strings followed by a message asking for a file.
    """
    if not pdf_file:
        return "", "", "Please upload a valid file."
    return translate_pdf(pdf_file)

# Build the Gradio UI: a page with two tabs, one per input method.
# Components are created in the order they should appear, so the statement
# order inside each `with` block is significant.
with gr.Blocks() as iface:
    gr.Markdown("# English to Hindi Translator")
    gr.Markdown("Choose between normal text translation or file upload method. Each provides its own accuracy score.")

    # Tab 1: translate text typed directly into a textbox.
    with gr.Tab("Normal Text Translation"):
        text_input = gr.Textbox(label="Enter text to translate")
        text_submit = gr.Button("Submit")
        text_output = gr.Textbox(label="Original Content")
        text_translated = gr.Textbox(label="Translated Content")
        text_accuracy = gr.Textbox(label="Accuracy Score")

        # The handler returns a 3-tuple; its elements are routed to the
        # three output textboxes in the order listed here.
        text_submit.click(
            fn=normal_text_translation,
            inputs=text_input,
            outputs=[text_output, text_translated, text_accuracy]
        )

    # Tab 2: translate the text extracted from an uploaded PDF.
    with gr.Tab("File Upload Translation"):
        # NOTE(review): type='filepath' presumably hands the handler a path
        # string rather than a file object - confirm against the Gradio docs.
        file_input = gr.File(label="Upload a PDF file for translation", type='filepath')
        file_submit = gr.Button("Translate PDF")
        file_output = gr.Textbox(label="Original Content")
        file_translated = gr.Textbox(label="Translated Content")
        file_accuracy = gr.Textbox(label="Accuracy Score")

        # Same 3-tuple contract as the text tab.
        file_submit.click(
            fn=file_translation,
            inputs=file_input,
            outputs=[file_output, file_translated, file_accuracy]
        )

# Start the app. No arguments are passed, so Gradio's defaults apply; per the
# recorded cell output, in Colab this automatically enables a public share link.
iface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0475d489526e312a1b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


