In [None]:
# Mount Google Drive at /content/drive/ so later cells can read the input
# documents from it. `google.colab` is only importable inside Colab.
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
cd /content/drive/MyDrive/AI Alignment

/content/drive/MyDrive/AI Alignment


In [None]:
!pip install -q python-docx lxml sentence-transformers opencc-python-reimplemented googletrans==4.0.0rc1

In [None]:
import os
import zipfile
import shutil
from docx import Document
from lxml import etree
from sentence_transformers import SentenceTransformer, util
from opencc import OpenCC
from googletrans import Translator
from google.colab import files

In [None]:
# Load improved multilingual sentence alignment model and tools
from sentence_transformers import SentenceTransformer
from googletrans import Translator
from opencc import OpenCC

# Shared module-level tools used by the function cells below.
# Multilingual sentence-embedding model; align_changes() encodes both the
# English change text and the Chinese paragraphs with it.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
# Translation client; extract_tracked_changes() calls translator.translate(..., src='en', dest='zh-cn').
translator = Translator()
converter = OpenCC('s2t')  # Converts simplified Chinese to traditional

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Extract plain text paragraphs
def extract_text_from_docx(docx_file):
    """Return the stripped text of every non-blank paragraph in a DOCX file.

    Args:
        docx_file: Path (or file-like object) handed straight to ``Document``.

    Returns:
        A list of strings in document order. Paragraphs whose text is empty
        or whitespace-only are left out.
    """
    paragraphs = []
    for para in Document(docx_file).paragraphs:
        cleaned = para.text.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

In [None]:
# Extract <w:ins> and <w:del> tracked changes + translate
def extract_tracked_changes(docx_file):
    """Collect tracked insertions and deletions from a DOCX and translate them.

    ``word/document.xml`` is parsed directly from the archive, so nothing is
    extracted to (or deleted from) the working directory.

    Args:
        docx_file: Path to the English DOCX containing ``<w:ins>`` / ``<w:del>``
            revision markup.

    Returns:
        A list of ``(change_type, zh_text, eng_text)`` tuples where
        ``change_type`` is ``"insert"`` or ``"delete"``, ``eng_text`` is the
        text found inside the revision element and ``zh_text`` is its
        translation. All insertions are listed before all deletions.
        Changes whose translation fails or comes back empty are skipped and a
        message is printed instead.
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    with zipfile.ZipFile(docx_file, 'r') as zip_ref:
        with zip_ref.open('word/document.xml') as xml_file:
            tree = etree.parse(xml_file)

    # (change label, revision element, text element carrying the content)
    revision_kinds = (
        ("insert", "w:ins", "w:t"),
        ("delete", "w:del", "w:delText"),
    )

    changes = []
    for change_type, revision_tag, text_tag in revision_kinds:
        for revision in tree.findall(f".//{revision_tag}", ns):
            # Keep whitespace-only text nodes: dropping them would glue the
            # words of neighbouring runs together ("stable" + "income").
            text = ''.join(t.text for t in revision.findall(f".//{text_tag}", ns) if t.text)
            if not text.strip():
                continue  # nothing translatable in this revision

            zh = _translate_change_text(text, change_type)
            if zh:
                changes.append((change_type, zh, text))

    return changes


def _translate_change_text(text, change_type):
    """Translate one change from English to zh-cn; return None (and print why) on failure."""
    try:
        translated_result = translator.translate(text, src='en', dest='zh-cn')
        zh = translated_result.text if translated_result and translated_result.text else None
    except Exception as e:
        # Best effort: one failed translation must not abort the whole extraction.
        print(f"Error translating {change_type} text '{text}': {e}")
        return None

    if not zh:
        print(f"Warning: Translation failed for {change_type} text: '{text}'")
    return zh

In [None]:
# Improved sentence alignment logic
def align_changes(changes, eng_paras, ch_paras, min_similarity=0.55, top_k=3):
    """Pair each translated tracked change with the best-matching Chinese paragraph.

    Args:
        changes: Iterable of ``(change_type, zh_text, eng_text)`` tuples.
        eng_paras: English paragraphs. Currently unused; kept so existing
            callers keep working.
        ch_paras: Chinese paragraph strings to align against.
        min_similarity: Cosine similarity at or above which a candidate is
            accepted outright.
        top_k: Number of highest-scoring paragraphs inspected per change.

    Returns:
        ``(aligned_changes, accuracy)``. ``aligned_changes`` holds one
        ``(change_type, zh_text, matched_paragraph, score)`` tuple per change;
        unmatched changes get ``""`` and ``0.0``. ``accuracy`` is the fraction
        of changes that found a match (``0`` when there are no changes).
    """
    if not changes:
        return [], 0
    if not ch_paras:
        # Nothing to align against; report every change as unmatched without
        # asking the model to encode an empty batch.
        return [(change_type, zh_text, "", 0.0) for change_type, zh_text, _ in changes], 0.0

    aligned_changes = []
    matched = 0

    # Encode all Chinese paragraphs once
    ch_embeddings = model.encode(ch_paras, convert_to_tensor=True)

    for change_type, zh_text, eng_text in changes:
        # Encode the English change and score it against every paragraph
        eng_embedding = model.encode(eng_text, convert_to_tensor=True)
        sim_scores = util.cos_sim(eng_embedding, ch_embeddings)[0]
        top_indices = sim_scores.argsort(descending=True)[:top_k]

        # A 6-character prefix hit is the textual fallback. (A full-text hit
        # always implies a prefix hit, so only the prefix needs checking.)
        # An empty prefix is ignored because "" is contained in every string.
        prefix = zh_text[:6]

        best_match = None
        best_score = 0.0
        for idx in top_indices:
            idx = int(idx)
            candidate = ch_paras[idx]
            score = float(sim_scores[idx])
            if score >= min_similarity or (prefix and prefix in candidate):
                best_match = candidate
                best_score = score
                break  # candidates are visited best-first

        if best_match is not None:
            matched += 1
            aligned_changes.append((change_type, zh_text, best_match, best_score))
        else:
            aligned_changes.append((change_type, zh_text, "", 0.0))

    return aligned_changes, matched / len(changes)


In [None]:
# Wrap into Track Changes XML tags
def wrap_change_xml(change_type, text, author="Translation Alignment", change_id=None):
    """Build a WordprocessingML tracked-change element for ``text``.

    The result has the structure Word expects for a run-level revision:
    ``<w:ins w:id w:author><w:r><w:t>text</w:t></w:r></w:ins>`` for an
    insertion, or the same with ``<w:del>`` / ``<w:delText>`` for a deletion.
    It is meant to be placed as a direct child of a ``<w:p>`` paragraph.

    Args:
        change_type: ``"insert"`` produces ``<w:ins>``; any other value
            produces ``<w:del>``.
        text: Text carried by the revision.
        author: Value written to the ``w:author`` attribute.
        change_id: Value for the ``w:id`` attribute. When omitted, a counter
            stored on this function supplies a fresh number per call.

    Returns:
        The ``<w:ins>`` or ``<w:del>`` element.
    """
    ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    is_insert = change_type == "insert"

    if change_id is None:
        change_id = wrap_change_xml._next_id
        wrap_change_xml._next_id += 1

    elem = etree.Element(f"{{{ns}}}{'ins' if is_insert else 'del'}")
    elem.set(f"{{{ns}}}id", str(change_id))
    elem.set(f"{{{ns}}}author", author)

    # Revision text must live inside a run, not directly under w:ins / w:del.
    run = etree.SubElement(elem, f"{{{ns}}}r")
    t = etree.SubElement(run, f"{{{ns}}}{'t' if is_insert else 'delText'}")
    # Keep leading/trailing spaces of the change text significant.
    t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
    t.text = text
    return elem


# Next automatically assigned w:id; re-running this cell restarts the numbering.
wrap_change_xml._next_id = 1

In [None]:
# Apply tracked changes to the Chinese DOCX and save output
def apply_changes_to_chinese_docx(ch_docx_path, aligned_changes, min_score=0.6,
                                  output_docx="Chinese_Tracked_Output.docx"):
    """Write a copy of the Chinese DOCX with the aligned changes added as revisions.

    For every paragraph whose combined run text contains a change's matched
    paragraph text, the change is appended to that paragraph as a tracked
    insertion/deletion. The paragraph's existing text is left untouched; the
    exact position of the change inside the paragraph is not known, so the
    revision is placed at the end of the paragraph.

    The archive is rewritten in memory: no directory is extracted to disk.

    Args:
        ch_docx_path: Path to the Chinese DOCX to annotate.
        aligned_changes: Iterable of ``(change_type, zh_text, ch_match, score)``
            tuples as produced by ``align_changes``.
        min_score: Only changes with ``score > min_score`` are applied.
        output_docx: Path of the DOCX to write.

    Returns:
        ``output_docx``.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    ns = {'w': w_ns}
    document_part = 'word/document.xml'

    # Load every part first so the source archive is closed before the output
    # is opened for writing.
    with zipfile.ZipFile(ch_docx_path, 'r') as src:
        parts = [(info.filename, src.read(info.filename))
                 for info in src.infolist() if not info.is_dir()]

    root = etree.fromstring(dict(parts)[document_part])

    # An empty match string would be "contained" in every paragraph, so drop it.
    usable = [(change_type, zh_text, ch_match)
              for change_type, zh_text, ch_match, score in aligned_changes
              if ch_match and score > min_score]

    fallback_id = 1
    for para in root.findall('.//w:p', ns):
        # Compare against the whole paragraph: its text is usually split over
        # several <w:t> runs, none of which holds the full paragraph alone.
        para_text = ''.join(t.text or '' for t in para.findall('.//w:t', ns))
        if not para_text:
            continue
        for change_type, zh_text, ch_match in usable:
            if ch_match in para_text:
                change_elem = wrap_change_xml(change_type, zh_text)
                para.append(_as_paragraph_level_change(change_elem, w_ns, fallback_id))
                fallback_id += 1

    new_document_xml = etree.tostring(root, xml_declaration=True, encoding='utf-8')

    with zipfile.ZipFile(output_docx, 'w', zipfile.ZIP_DEFLATED) as docx:
        for name, data in parts:
            docx.writestr(name, new_document_xml if name == document_part else data)

    return output_docx


def _as_paragraph_level_change(change_elem, w_ns, fallback_id):
    """Ensure a w:ins/w:del element wraps its text in a w:r run and has w:id / w:author."""
    text_tags = (f'{{{w_ns}}}t', f'{{{w_ns}}}delText')
    loose_text = [child for child in change_elem if child.tag in text_tags]
    if loose_text:
        run = etree.SubElement(change_elem, f'{{{w_ns}}}r')
        for child in loose_text:
            run.append(child)  # lxml re-parents the node into the run

    if change_elem.get(f'{{{w_ns}}}id') is None:
        change_elem.set(f'{{{w_ns}}}id', str(fallback_id))
    if change_elem.get(f'{{{w_ns}}}author') is None:
        change_elem.set(f'{{{w_ns}}}author', 'Translation Alignment')
    return change_elem

In [None]:
# Input documents, given as bare file names and therefore resolved against the
# current working directory: `english_file` is the DOCX whose tracked changes
# are extracted, `chinese_file` is the DOCX the changes are applied to.
english_file = "[Track Changes] KFS - AXA Global Strategic Bonds_E.docx"
chinese_file = "KFS - AXA Global Strategic Bonds_C 8.53.14 AM.docx"

In [None]:
# Step 2: Pull the non-blank paragraph text out of both documents.
eng_paragraphs = extract_text_from_docx(english_file)
# The Chinese paragraphs are passed through the OpenCC s2t converter.
# NOTE(review): apply_changes_to_chinese_docx later looks these converted
# strings up in the raw document text; confirm the Chinese source is already
# Traditional, otherwise the converted paragraphs cannot be found there.
ch_paragraphs = list(map(converter.convert, extract_text_from_docx(chinese_file)))

In [None]:
# Step 3: Extract and align changes
# `tracked_changes` holds (change_type, zh_text, eng_text) tuples; `aligned`
# pairs each of them with a Chinese paragraph and `accuracy` is the fraction
# of changes for which a paragraph was found.
tracked_changes = extract_tracked_changes(english_file)
aligned, accuracy = align_changes(tracked_changes, eng_paragraphs, ch_paragraphs)

In [None]:
# Step 4: Apply changes and save output
# `output_file` receives the path of the DOCX written by the function.
output_file = apply_changes_to_chinese_docx(chinese_file, aligned)

In [None]:
# Print results
# Uses the names actually assigned in Steps 3 and 4 (`accuracy`, `output_file`);
# the previous `alignment_accuracy` / `output_docx` are never defined in this
# notebook and raise NameError on a fresh kernel.
print(f"✅ Changes applied and saved to: {output_file}")
print(f"🎯 Alignment Accuracy: {accuracy * 100:.2f}%")

✅ Changes applied and saved to: Chinese_Tracked_Output.docx
🎯 Alignment Accuracy: 32.50%


In [None]:
# Download the final output with track changes in Chinese.
# Uses the path returned in Step 4 rather than repeating the file name.
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# Install necessary packages
!pip install -q sentence-transformers opencc-python-reimplemented googletrans==4.0.0rc1

# Imports
from sentence_transformers import SentenceTransformer
from opencc import OpenCC
from googletrans import Translator
import difflib

# Load tools
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
translator = Translator()
converter = OpenCC('s2t')  # Simplified to Traditional Chinese

# Simulated English tracked changes
tracked_changes = [
    ("delete", translator.translate("stable income", src='en', dest='zh-cn').text, "stable income"),
    ("insert", translator.translate("consistent income", src='en', dest='zh-cn').text, "consistent income")
]

# Simulated original Chinese text
ch_paragraphs = ["该基金旨在在长期内提供稳定的收益。"]  # This matches the old version (pre-edit)
matched_output = []

# Match and apply logic
for change_type, zh_text, original_eng in tracked_changes:
    best_match = ""
    best_score = 0.0

    for para in ch_paragraphs:
        ratio = difflib.SequenceMatcher(None, zh_text, para).ratio()
        if ratio > best_score:
            best_score = ratio
            best_match = para

    # Apply inline simulated track changes
    if best_score > 0.6:
        modified = best_match.replace(zh_text, f"[{'DEL' if change_type=='delete' else 'INS'}:{zh_text}]")
    else:
        modified = f"(No match found for: {zh_text})"
    matched_output.append(modified)

# Display output
print("🔍 Original Chinese Paragraph:")
print(ch_paragraphs[0])
print("\n📌 After Applying Tracked Changes:")
for line in matched_output:
    print(line)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔍 Original Chinese Paragraph:
该基金旨在在长期内提供稳定的收益。

📌 After Applying Tracked Changes:
(No match found for: 稳定收入)
(No match found for: 一致的收入)
