# Translation Pipeline Demo
## Step 1: Load Environment
## Step 2: Test Translation


In [1]:
from dotenv import load_dotenv
import os

load_dotenv(r"E:\LLM_Translation_Pipeline\.env", override=True)

api_key = os.getenv("OPENAI_API_KEY")

print("Loaded key starts with:", api_key[:10])
print("Key length:", len(api_key))


Loaded key starts with: sk-proj-LX
Key length: 164


In [2]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

print(client.models.list())  # should return available models without error


SyncPage[Model](data=[Model(id='gpt-3.5-turbo', created=1677610602, object='model', owned_by='openai'), Model(id='sora-2-pro', created=1759708663, object='model', owned_by='system'), Model(id='gpt-5-pro', created=1759469822, object='model', owned_by='system'), Model(id='gpt-audio-mini', created=1759512027, object='model', owned_by='system'), Model(id='gpt-audio-mini-2025-10-06', created=1759512137, object='model', owned_by='system'), Model(id='sora-2', created=1759708615, object='model', owned_by='system'), Model(id='davinci-002', created=1692634301, object='model', owned_by='system'), Model(id='babbage-002', created=1692634615, object='model', owned_by='system'), Model(id='gpt-3.5-turbo-instruct', created=1692901427, object='model', owned_by='system'), Model(id='gpt-3.5-turbo-instruct-0914', created=1694122472, object='model', owned_by='system'), Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'), Model(id='dall-e-2', created=1698798177, object='model', owned_

In [3]:
from openai import OpenAI

client = OpenAI(api_key=api_key)

# Simple test: list available models
models = client.models.list()
print("✅ API connection successful. Total models available:", len(models.data))
print("First few models:", [m.id for m in models.data[:5]])


✅ API connection successful. Total models available: 73
First few models: ['gpt-3.5-turbo', 'sora-2-pro', 'gpt-5-pro', 'gpt-audio-mini', 'gpt-audio-mini-2025-10-06']


In [4]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

print("✅ Environment ready. Key loaded:", api_key[:10], "...")


✅ Environment ready. Key loaded: sk-proj-LX ...


In [5]:
def translate_text(text, target_lang):
    """Basic translation using OpenAI chat model."""
    prompt = f"Translate the following text into {target_lang}:\n\n{text}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content.strip()


In [6]:
test_text = "My name is Shivaji, Working as a CPM in Thebigword, inshort form it is called as TBW"
translated = translate_text(test_text, "French")
print("Original:", test_text)
print("French:", translated)

Original: My name is Shivaji, Working as a CPM in Thebigword, inshort form it is called as TBW
French: Mon nom est Shivaji, je travaille en tant que CPM chez Thebigword, que l'on appelle en abrégé TBW.


In [7]:
# ============================================================
# 🌐 TRANSLATION PIPELINE WITH GLOSSARY SUPPORT
# ============================================================
# Author: Shivaji Gaikwad
# Description:
#   This notebook demonstrates how to perform translations using
#   the OpenAI API with an optional glossary for term consistency.
#
# Features:
#   ✅ Loads API key from .env file
#   ✅ Translates text to a target language
#   ✅ Retrieves glossary terms and enforces translation consistency
#   ✅ Demonstrates both normal and glossary-assisted translations
# ============================================================

from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd

# ------------------------------
# STEP 1: Load environment and API key
# ------------------------------
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("❌ API key not found in .env file.")
else:
    print("✅ API key loaded successfully (hidden for security).")

client = OpenAI(api_key=api_key)

# ------------------------------
# STEP 2: Define helper functions
# ------------------------------

def translate_text(text: str, target_lang: str) -> str:
    """
    Translates plain text into the specified target language.
    """
    prompt = f"Translate the following text into {target_lang}:\n\n{text}"
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()

def retrieve_glossary(source_text: str, glossary_df: pd.DataFrame, k: int = 5):
    """
    Retrieves relevant glossary entries based on term matches in the text.
    Returns a list of term pairs.
    """
    matches = []
    for _, row in glossary_df.iterrows():
        if row['source_term'].lower() in source_text.lower():
            matches.append(row)
    return matches[:k]

def translate_with_glossary(text: str, target_lang: str, glossary_df: pd.DataFrame):
    """
    Translates text while enforcing glossary consistency.
    """
    glossary_terms = retrieve_glossary(text, glossary_df)
    glossary_prompt = "\n".join(
        [f"{row['source_term']} → {row['target_term']}" for _, row in pd.DataFrame(glossary_terms).iterrows()]
    )
    if not glossary_prompt:
        glossary_prompt = "(no glossary matches found)"
        
    prompt = f"""
You are a professional translator. Translate the following text into {target_lang}.
Ensure these glossary terms are used consistently:
{glossary_prompt}

Text:
{text}
"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()

# ------------------------------
# STEP 3: Load glossary and test translation
# ------------------------------
glossary_df = pd.read_csv("glossary.csv")
print(f"✅ Loaded glossary with {len(glossary_df)} entries.")

source_text = "My name is Shivaji, Working as a CPM in Thebigword, in short form it is called as TBW."
target_lang = "French"

print("\n--- Without Glossary ---")
plain_translation = translate_text(source_text, target_lang)
print(plain_translation)

print("\n--- With Glossary ---")
gloss_translation = translate_with_glossary(source_text, target_lang, glossary_df)
print(gloss_translation)

✅ API key loaded successfully (hidden for security).
✅ Loaded glossary with 42 entries.

--- Without Glossary ---
Mon nom est Shivaji, je travaille comme CPM chez Thebigword, que l'on appelle en abrégé TBW.

--- With Glossary ---
Mon nom est Shivaji, je travaille en tant que CPM chez THEBIGWORD, que l'on abrège en TBW.


In [8]:
import csv

results = [
    {"Mode": "Without Glossary", "Translation": plain_translation},
    {"Mode": "With Glossary", "Translation": gloss_translation},
]

with open("translation_results.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["Mode", "Translation"])
    writer.writeheader()
    writer.writerows(results)

print("✅ Results saved to translation_results.csv")


✅ Results saved to translation_results.csv


In [9]:
for lang in ["French", "Spanish", "German"]:
    print(f"\n🌍 Translating into {lang}")
    print(translate_with_glossary(source_text, lang, glossary_df))



🌍 Translating into French
Mon nom est Shivaji, je travaille en tant que CPM chez THEBIGWORD, en abrégé on l'appelle TBW.

🌍 Translating into Spanish
Mi nombre es Shivaji, trabajo como CPM en THEBIGWORD, en su forma abreviada se le llama TBW.

🌍 Translating into German
Mein Name ist Shivaji. Ich arbeite als CPM bei THEBIGWORD, kurz gesagt TBW.


In [10]:
# ====================================================
# 🧾 TRANSLATION COMPARISON TABLE
# ====================================================
import pandas as pd

# Create a summary list
summary_data = [
    {
        "Scenario": "Without Glossary",
        "Translation": plain_translation
    },
    {
        "Scenario": "With Glossary",
        "Translation": gloss_translation
    }
]

# Convert to DataFrame for better display
df_summary = pd.DataFrame(summary_data)

# Display table
print("✅ Translation Comparison Table:")
df_summary


✅ Translation Comparison Table:


Unnamed: 0,Scenario,Translation
0,Without Glossary,"Mon nom est Shivaji, je travaille comme CPM ch..."
1,With Glossary,"Mon nom est Shivaji, je travaille en tant que ..."


In [11]:
df_summary.to_csv("translation_comparison_summary.csv", index=False, encoding="utf-8")
print("💾 Saved summary as translation_comparison_summary.csv")


💾 Saved summary as translation_comparison_summary.csv


In [12]:
# ====================================================
# 🧾 TRANSLATION COMPARISON WITH DIFFERENCE HIGHLIGHTS
# ====================================================
import pandas as pd
from difflib import ndiff

def highlight_differences(base, updated):
    """Highlight the words changed between baseline and glossary-enhanced translation."""
    diff = ndiff(base.split(), updated.split())
    changes = [word for word in diff if word.startswith('+ ') or word.startswith('- ')]
    return " | ".join(changes) if changes else "No difference"

# Create summary table with differences
summary_data = [
    {
        "Scenario": "Without Glossary",
        "Translation": plain_translation
    },
    {
        "Scenario": "With Glossary",
        "Translation": gloss_translation
    },
    {
        "Scenario": "🧩 Highlighted Difference",
        "Translation": highlight_differences(plain_translation, gloss_translation)
    }
]

df_summary = pd.DataFrame(summary_data)

print("✅ Translation Comparison with Differences:")
df_summary


✅ Translation Comparison with Differences:


Unnamed: 0,Scenario,Translation
0,Without Glossary,"Mon nom est Shivaji, je travaille comme CPM ch..."
1,With Glossary,"Mon nom est Shivaji, je travaille en tant que ..."
2,🧩 Highlighted Difference,- comme | + en | + tant | + que | - Thebigword...


In [13]:
import csv

results = [
    {"Mode": "Without Glossary", "Translation": plain_translation},
    {"Mode": "With Glossary", "Translation": gloss_translation},
]

with open("translation_results.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["Mode", "Translation"])
    writer.writeheader()
    writer.writerows(results)

print("✅ Results saved to translation_results.csv")


✅ Results saved to translation_results.csv
