In [1]:
import os
import getpass
import json
from langchain_openai import OpenAI
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI
import re

# Ask for the key only when it is not already configured, so re-running this
# cell (or starting the kernel with the variable exported) neither prompts
# again nor overwrites an existing key. The explicit label replaces getpass's
# generic "Password: " prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [2]:
from langchain.prompts import PromptTemplate

# Prompt text for translating one sentence. `{text}` and `{dest_language}`
# are the two placeholders filled in when the template is formatted.
SINGLE_TRANSLATION_TEMPLATE = """
You are an expert translator. Your task is to translate a given sentence into the specified target language. 

### Instructions:
1. If the input sentence is already in the destination language, return it as is.
2. Otherwise, accurately translate the sentence into the destination language.

### Input:
- Text: {text}
- Target Language: {dest_language}

### Output:
"Translated Sentence or Original Text if already in the target language."

### Example:
Input:
- Text: "Hello"
- Target Language: "vi"
Output:
"Xin chào"

Make sure to evaluate whether the text is already in the target language before translating.
"""

single_translation_prompt = PromptTemplate(
    input_variables=["text", "dest_language"],
    template=SINGLE_TRANSLATION_TEMPLATE,
)

In [3]:
# Prompt text for translating a list of sentences in one request. `{texts}`
# and `{dest_language}` are the two placeholders filled in when the template
# is formatted; the model is told to answer with a plain JSON list.
MULTI_TRANSLATION_TEMPLATE = """
You are a professional translator. Your task is to translate a list of sentences into the specified target language.

### Instructions:
1. Analyze each sentence in the list individually.
2. Retain sentences that are already in the destination language as is.
3. Translate sentences that are not in the destination language.
4. **Return output as a plain JSON list without any formatting codes, comments, or explanations.**

### Input:
- Sentences: {texts}
- Target Language: {dest_language}

### Output Format:
["Translated Sentence 1", "Translated Sentence 2", ...]

### Example:
Input:
- Sentences: ["Hello", "I am Peter", "Tôi là sinh viên"]
- Target Language: "vi"
Output:
["Xin chào", "Tôi là Peter", "Tôi là sinh viên"]
"""

multi_translation_prompt = PromptTemplate(
    input_variables=["texts", "dest_language"],
    template=MULTI_TRANSLATION_TEMPLATE,
)

In [4]:
# Chat model settings, kept as named constants so they are easy to tune
MODEL_NAME = "gpt-4o"
TEMPERATURE = 0.7

# Initialize the OpenAI chat model (gpt-4o)
model = ChatOpenAI(model=MODEL_NAME, temperature=TEMPERATURE)

# Output file path
output_file = "generated_responses.json"

In [5]:
# Single-sentence example queries: each sentence is paired with the same
# target language code ("vi"). Some sentences are English and some are
# already Vietnamese.
single_inputs = [
    {"text": sentence, "dest_language": "vi"}
    for sentence in (
        "Hello",
        "Xin chào",
        "Good morning",
        "Chào buổi sáng",
        "Thank you",
    )
]

In [6]:
# Multi-sentence example queries: each batch of sentences becomes one query
# with the same target language code ("vi"). Batches mix English and
# Vietnamese sentences.
multi_inputs = [
    {"texts": list(batch), "dest_language": "vi"}
    for batch in (
        ("Hello", "I am Peter"),
        ("Xin chào", "Tôi là sinh viên"),
        ("Good evening", "How are you?", "Tôi đang học tiếng Anh"),
        ("What is your name?", "Tên tôi là An"),
        ("Thank you", "Welcome"),
    )
]

In [7]:
import re


def clean_response(text):
    """Strip display headers, markdown bold and LaTeX markup from a reply.

    The rules below are applied one after another, in the listed order, and
    the result is finally stripped of leading/trailing whitespace.

    Args:
        text (str): Raw response text.

    Returns:
        str: The cleaned text.
    """
    # (pattern, replacement) pairs; each rule runs on the output of the
    # previous one, so the order is significant.
    substitutions = (
        # Banner-style headers such as '== AI Message ==' up to the newline
        (r"=+.*?=+\n", ""),
        # Markdown bold markers '**'
        (r"\*\*", ""),
        # LaTeX display-math delimiters \[ and \]
        (r"\\\[|\\\]", ""),
        # \text{...} -> keep only the inner content
        (r"\\text\{(.*?)\}", r"\1"),
        # Runs of two or more newlines -> a single newline
        (r"\n{2,}", "\n"),
        # Runs of two or more whitespace characters -> a single space
        (r"\s{2,}", " "),
        # Doubled backslashes are dropped
        (r"\\\\", ""),
        # LaTeX \times -> plain "x"
        (r"\\times", "x"),
        # \textbf{...} -> keep only the inner content
        (r"\\textbf\{(.*?)\}", r"\1"),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [8]:
import json
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage


def _build_translation_prompt(query, single_prompt, multi_prompt):
    """Format the prompt that matches the shape of one query.

    A query containing a "text" key is formatted with ``single_prompt``; a
    query containing a "texts" key is formatted with ``multi_prompt``.

    Raises:
        ValueError: If the query has neither a "text" nor a "texts" key.
        KeyError: If the query has no "dest_language" key.
    """
    if "text" in query:  # Single sentence case
        return single_prompt.format(
            text=query["text"],
            dest_language=query["dest_language"],
        )
    if "texts" in query:  # Multiple sentences case
        # Serialize as JSON rather than relying on Python's list repr: repr
        # switches between single and double quotes depending on the string
        # content, while JSON always yields a double-quoted list.
        return multi_prompt.format(
            texts=json.dumps(query["texts"], ensure_ascii=False),
            dest_language=query["dest_language"],
        )
    raise ValueError(
        "Invalid query format! Must contain 'text' or 'texts'.")


def invoke_and_save_translation(queries, model, output_file, single_prompt, multi_prompt):
    """
    Invokes the model for translation tasks and saves the results in JSON format.

    Every prompt is built before the first model call, so a malformed query
    raises immediately instead of after earlier queries have already been
    sent (and their results lost, since the file is only written at the end).

    Args:
        queries (iterable): Queries (dicts) holding either a "text" or a
            "texts" entry, plus a "dest_language" entry.
        model: Language model instance exposing ``invoke(prompt)``. The reply
            may be a message object with a ``content`` attribute or a plain
            string.
        output_file (str): Path to save the output file.
        single_prompt (PromptTemplate): Template for single text translation.
        multi_prompt (PromptTemplate): Template for multiple texts translation.

    Raises:
        ValueError: If a query contains neither "text" nor "texts".
    """
    # Materialize once so generators can be iterated for both passes below.
    queries = list(queries)
    prompts = [
        _build_translation_prompt(query, single_prompt, multi_prompt)
        for query in queries
    ]

    results = []  # List to store responses
    for query, prompt in zip(queries, prompts):
        # Invoke the model
        response = model.invoke(prompt)

        # Read the reply text directly instead of going through
        # pretty_repr(), which is a display helper that prepends a banner.
        # Message objects carry the text in `.content`; a plain string reply
        # is used as is.
        content = getattr(response, "content", response)
        reply_text = content if isinstance(content, str) else str(content)

        # Append the results
        results.append({
            "input": query,
            "response": clean_response(reply_text)
        })
        print(f"Completed: {query}")

    # Save results to JSON file (ensure_ascii=False keeps accented
    # characters readable in the file)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)
    print(f"Responses saved to {output_file}.")

In [9]:
# Translate the single-sentence examples and store them in their own file
invoke_and_save_translation(
    queries=single_inputs,
    model=model,
    output_file="translations_output.json",
    single_prompt=single_translation_prompt,
    multi_prompt=multi_translation_prompt,
)

Completed: {'text': 'Hello', 'dest_language': 'vi'}
Completed: {'text': 'Xin chào', 'dest_language': 'vi'}
Completed: {'text': 'Good morning', 'dest_language': 'vi'}
Completed: {'text': 'Chào buổi sáng', 'dest_language': 'vi'}
Completed: {'text': 'Thank you', 'dest_language': 'vi'}
Responses saved to translations_output.json.


In [10]:
# Translate the multi-sentence examples and store them in their own file
invoke_and_save_translation(
    queries=multi_inputs,
    model=model,
    output_file="translations_output_multi.json",
    single_prompt=single_translation_prompt,
    multi_prompt=multi_translation_prompt,
)

Completed: {'texts': ['Hello', 'I am Peter'], 'dest_language': 'vi'}
Completed: {'texts': ['Xin chào', 'Tôi là sinh viên'], 'dest_language': 'vi'}
Completed: {'texts': ['Good evening', 'How are you?', 'Tôi đang học tiếng Anh'], 'dest_language': 'vi'}
Completed: {'texts': ['What is your name?', 'Tên tôi là An'], 'dest_language': 'vi'}
Completed: {'texts': ['Thank you', 'Welcome'], 'dest_language': 'vi'}
Responses saved to translations_output_multi.json.
