<a href="https://colab.research.google.com/github/praj-pawar/Aligning-Cross-Lingual-Embeddings/blob/main/Cross_Lingual_Embedding_Alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1) Data Preparation

## 1a - Download Pre-Trained FastText Word Embeddings

In [None]:
# -nc (no-clobber) skips the download when the file already exists, so re-running
# this cell does not fetch the ~1 GB archives again or create ".1" duplicates.
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz  # English
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz  # Hindi

--2024-09-18 17:44:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.15, 13.226.210.25, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-09-18 17:45:09 (106 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]

--2024-09-18 17:45:09--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.15, 13.226.210.25, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1118942272 (1.0G) [binary/octet-stream]
Saving to: ‘cc.hi.300.vec.gz’


2024-09-18 17:45:20 (96.3 MB/s) - ‘cc.hi.300.vec.gz’ saved 

## 1b - Load the Embeddings

In [None]:
import gzip
import numpy as np

def load_fasttext_embeddings(file_path, top_n=100000):
    """
    Load unit-length word vectors from a gzipped FastText ``.vec`` text file.

    Parameters:
    file_path (str): Path to the ``.vec.gz`` file (first line is a header)
    top_n (int or None): Number of entries to read after the header;
        None reads the whole file

    Returns:
    embeddings (dict): Maps each word to its L2-normalised float32 vector.
        Entries whose vector has zero (or NaN) norm are skipped, because they
        cannot be normalised and would otherwise be stored as NaN.
    """
    embeddings = {}
    with gzip.open(file_path, 'rb') as f:
        for i, line in enumerate(f):
            if i == 0:  # Skip the header line in FastText files
                continue
            if top_n is not None and i > top_n:
                break
            # Only strip on the right: the token itself may be a Unicode
            # whitespace character (e.g. U+00A0), and a full strip() would
            # remove it and shift the first number into the word slot.
            tokens = line.decode('utf-8').rstrip().split(' ')
            word = tokens[0]
            vector = np.array(tokens[1:], dtype=np.float32)
            norm = np.linalg.norm(vector)
            if not norm > 0:  # also false for NaN
                continue
            embeddings[word] = vector / norm
    return embeddings

# Load the first 100,000 entries of the English and Hindi embedding files.
# FastText lists words in decreasing order of frequency, so this keeps the
# most frequent words of each language.
en_embeddings = load_fasttext_embeddings('cc.en.300.vec.gz', top_n=100000)
hi_embeddings = load_fasttext_embeddings('cc.hi.300.vec.gz', top_n=100000)

# Report how many vectors ended up in each dictionary.
print(f"Loaded {len(en_embeddings)} English embeddings")
print(f"Loaded {len(hi_embeddings)} Hindi embeddings")


Loaded 100000 English embeddings
Loaded 100000 Hindi embeddings


## 1c - Download the English-Hindi Dictionary from MUSE


In [None]:
# -nc (no-clobber) keeps an already-downloaded copy instead of saving "en-hi.txt.1" on re-run.
!wget -nc https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt

--2024-09-18 17:46:25--  https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.155.173.116, 18.155.173.80, 18.155.173.40, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.155.173.116|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 930856 (909K) [text/x-c++]
Saving to: ‘en-hi.txt’


2024-09-18 17:46:25 (7.84 MB/s) - ‘en-hi.txt’ saved [930856/930856]



## 1c - Load and Display the Bilingual Word Pairs

In [None]:
def load_bilingual_lexicon(file_path):
    """
    Read a whitespace-separated bilingual dictionary file.

    Parameters:
    file_path (str): Path to a UTF-8 text file with one "<english> <hindi>" pair per line

    Returns:
    bilingual_dict (list): (en_word, hi_word) tuples in file order. Lines that
        do not contain exactly two tokens (blank or malformed lines) are
        skipped instead of raising ValueError.
    """
    bilingual_dict = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            if len(tokens) != 2:
                continue
            bilingual_dict.append((tokens[0], tokens[1]))
    return bilingual_dict

# Load every (English, Hindi) pair from the dictionary file 'en-hi.txt'.
en_hi_pairs = load_bilingual_lexicon('en-hi.txt')

# Preview the first 10 pairs.
print(en_hi_pairs[:10])


[('and', 'और'), ('was', 'था'), ('was', 'थी'), ('for', 'लिये'), ('that', 'उस'), ('that', 'कि'), ('with', 'साथ'), ('from', 'से'), ('from', 'इससे'), ('this', 'ये')]


## 1c - Extract Word Embeddings for Bilingual Word Pairs

In [None]:
import numpy as np

# Function to extract word embeddings for the bilingual word pairs
def extract_word_embeddings(bilingual_pairs, en_embeddings, hi_embeddings):
    """
    Collect the vectors of every dictionary pair that both vocabularies cover.

    Parameters:
    bilingual_pairs (iterable): (en_word, hi_word) tuples
    en_embeddings (dict): English word -> vector
    hi_embeddings (dict): Hindi word -> vector

    Returns:
    en_vecs, hi_vecs (numpy arrays): Row i of each array belongs to the i-th
        pair whose two words were both found; pairs with a missing word are
        dropped. Both arrays are empty when no pair matches.
    """
    # Keep only pairs where both sides have an embedding, in input order.
    matched = [
        (en_embeddings[src], hi_embeddings[tgt])
        for src, tgt in bilingual_pairs
        if src in en_embeddings and tgt in hi_embeddings
    ]

    en_rows = [pair[0] for pair in matched]
    hi_rows = [pair[1] for pair in matched]
    return np.array(en_rows), np.array(hi_rows)

# Build the two row-aligned matrices used to learn the mapping: row i of en_vecs
# and row i of hi_vecs are the vectors of one dictionary pair.
en_vecs, hi_vecs = extract_word_embeddings(en_hi_pairs, en_embeddings, hi_embeddings)

# Number of dictionary pairs for which both words had an embedding.
print(f"Extracted {en_vecs.shape[0]} aligned word vectors.")


Extracted 18972 aligned word vectors.


# 2) Embedding Alignment

## 2a & 2b - Implement Orthogonal Procrustes Alignment

In [None]:
def orthogonal_procrustes(X, Y):
    """
    Learn an orthogonal matrix that maps the rows of X onto the rows of Y.

    Both inputs are row-normalised to unit length first. The result is
    W = U @ Vt, where U and Vt come from the SVD of X^T Y; it is applied to a
    row vector x as ``x @ W``.

    Parameters:
    X (numpy array): Source language word embeddings (English), one per row
    Y (numpy array): Target language word embeddings (Hindi), one per row,
        in the same order as X

    Returns:
    W (numpy array): Orthogonal transformation matrix
    """
    def _unit_rows(mat):
        # Scale every row to length 1 without modifying the caller's array.
        return mat / np.linalg.norm(mat, axis=1, keepdims=True)

    src = _unit_rows(X)
    tgt = _unit_rows(Y)

    # SVD of the cross-covariance between source and target rows.
    left, _singular, right_t = np.linalg.svd(src.T @ tgt)

    return left @ right_t

# Learn the English -> Hindi mapping from the dictionary-aligned vector pairs.
W = orthogonal_procrustes(en_vecs, hi_vecs)

print("Orthogonal mapping matrix learned.")


Orthogonal mapping matrix learned.


In [None]:
def apply_mapping(embeddings, W):
    """
    Project every source-language vector through W and rescale it to unit length.

    Parameters:
    embeddings (dict): Source language embeddings (English), word -> vector
    W (numpy array): Orthogonal transformation matrix, applied as ``vec @ W``

    Returns:
    mapped_embeddings (dict): Transformed embeddings, same keys and order as the input
    """
    def _project(vec):
        projected = np.dot(vec, W)
        # Re-normalise so the mapped vector has length 1.
        return projected / np.linalg.norm(projected)

    return {word: _project(vec) for word, vec in embeddings.items()}

# Map every loaded English vector with W; the original en_embeddings dict is left unchanged.
aligned_en_embeddings = apply_mapping(en_embeddings, W)

print(f"Aligned {len(aligned_en_embeddings)} English embeddings into the Hindi space.")


Aligned 100000 English embeddings into the Hindi space.


# 3) Evaluation


## 3a - Perform Word Translation
### Translating the full English vocabulary ran for ~2 hours before the session terminated :(
### Hence, the number of English words to translate (`limit_size`) is limited to 2000.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def translate_words(aligned_en_embeddings, hi_embeddings, top_k=5, limit_size=None, batch_size=128):
    """
    Translate English words to Hindi by cosine nearest-neighbour search.

    The Hindi matrix is normalised once, and English words are scored in
    batches with a single matrix product per batch, instead of re-normalising
    the whole Hindi matrix for every English word.

    Parameters:
    aligned_en_embeddings (dict): Aligned English embeddings (after Procrustes alignment)
    hi_embeddings (dict): Hindi embeddings
    top_k (int): Number of top nearest neighbors to return
    limit_size (int, optional): Translate only the first limit_size English words
    batch_size (int): Number of English words scored per matrix product;
        memory use grows with batch_size * number of Hindi words

    Returns:
    translations (dict): English word -> list of up to top_k Hindi words,
        most similar first. The lists are empty when top_k <= 0 or there are
        no Hindi embeddings.
    """
    # Limit the number of words to translate if limit_size is provided
    en_words = list(aligned_en_embeddings.keys())
    if limit_size is not None:
        en_words = en_words[:limit_size]

    hi_words = list(hi_embeddings.keys())
    k = min(top_k, len(hi_words))
    if not en_words or k <= 0:
        # Nothing to rank. This also avoids the `[-0:]` slice, which would
        # select every Hindi word instead of none.
        return {en_word: [] for en_word in en_words}

    def _unit_rows(mat):
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero rows as zeros instead of NaN
        return mat / norms

    # Loop-invariant: normalise the Hindi vectors once.
    hi_unit = _unit_rows(np.asarray(list(hi_embeddings.values())))

    translations = {}
    step = max(1, int(batch_size))
    for start in range(0, len(en_words), step):
        batch_words = en_words[start:start + step]
        en_unit = _unit_rows(np.asarray([aligned_en_embeddings[w] for w in batch_words]))

        # Rows are unit length, so the dot product is the cosine similarity.
        similarities = en_unit @ hi_unit.T

        # Indices of the k largest similarities per row, best first.
        ranked = np.argsort(similarities, axis=1)[:, -k:][:, ::-1]
        for en_word, idxs in zip(batch_words, ranked):
            translations[en_word] = [hi_words[i] for i in idxs]

    return translations

# Translate only the first `limit_size` English words to keep the runtime manageable.
limit_size = 2000  # Set the limit size as needed
translations = translate_words(aligned_en_embeddings, hi_embeddings, top_k=5, limit_size=limit_size)

# Print a few translations
for en_word, hi_words in list(translations.items())[:10]:
    print(f"English: {en_word} -> Hindi: {hi_words}")


English: , -> Hindi: ['और', 'वे', '?', ',', 'था']
English: the -> Hindi: ['में', 'जिस', 'अपने', 'पहले', 'उसी']
English: . -> Hindi: ['¤', 'la', '…', '?', 'हर']
English: and -> Hindi: ['साथ', 'तथा', 'करती', 'लिए', 'करके']
English: to -> Hindi: ['करके', 'करें', 'करते', 'करना', 'करने']
English: of -> Hindi: ['में', 'आने', 'सबसे', 'जिसके', 'जाने']
English: a -> Hindi: ['ऐसा', 'बना', 'बड़ा', 'नया', 'अपना']
English: </s> -> Hindi: ['.', 'ik', '📝', 'QF', '▲']
English: in -> Hindi: ['में', 'सामने', 'क्षेत्र', 'बाहर', 'जहाँ']
English: is -> Hindi: ['है', 'यह', 'होता', 'करता', 'माना']


## 3a - Optimization Attempts

1.   `cKDTree` from `scipy.spatial` builds a spatial index over the Hindi word vectors, which enables efficient nearest-neighbor search.

2.   The `query` method of `cKDTree` returns the k nearest neighbors directly. This avoids computing cosine similarities manually and can be faster, especially for large datasets (batch processing).

3.   The number of words to translate was also limited, because the two changes above on their own still took longer than expected.

## Although faster, this approach showed a massive drop in accuracy.
### (0.3722 for size=1000)
### (0.35 for size=2000)



In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np
# from scipy.spatial import cKDTree

# def translate_words_limited(aligned_en_embeddings, hi_embeddings, top_k=5, limit_size=None):
#     """
#     Translate a limited number of words from English to Hindi using aligned embeddings.

#     Parameters:
#     aligned_en_embeddings (dict): Aligned English embeddings (after Procrustes alignment)
#     hi_embeddings (dict): Hindi embeddings
#     top_k (int): Number of top nearest neighbors to return
#     limit_size (int, optional): Limit the number of words to translate

#     Returns:
#     translations (dict): Dictionary with English words as keys and top_k Hindi translations as values
#     """
#     translations = {}
#     hi_words = list(hi_embeddings.keys())
#     hi_vecs = np.array(list(hi_embeddings.values()))

#     # Build a KD-tree for the Hindi embeddings
#     hi_tree = cKDTree(hi_vecs)

#     # Limit the number of words to translate else takes up too much time
#     en_words = list(aligned_en_embeddings.keys())
#     if limit_size:
#         en_words = en_words[:limit_size]

#     for en_word in en_words:
#         en_vec = aligned_en_embeddings[en_word]
#         # Query the KD-tree to find the top_k nearest neighbors
#         distances, indices = hi_tree.query([en_vec], k=top_k)

#         # Get the nearest words based on indices
#         nearest_words = [hi_words[i] for i in indices[0]]

#         translations[en_word] = nearest_words

#     return translations


# limit_size = 2000
# translations = translate_words_limited(aligned_en_embeddings, hi_embeddings, top_k=5, limit_size=limit_size)

# # Print a few translations
# for en_word, hi_words in list(translations.items())[:10]:
#     print(f"English: {en_word} -> Hindi: {hi_words}")


English: , -> Hindi: ['और', ',', 'हैं', 'जहाँ', 'जबकि']
English: the -> Hindi: ['में', 'अपने', 'पहले', 'किसी', 'दूसरे']
English: . -> Hindi: ['across', 'इतना', 'उनका', 'desde', 'दनकौरी41']
English: and -> Hindi: ['दोनों', 'लेकिन', 'जिससे', 'जैसे', 'तथा']
English: to -> Hindi: ['करें', 'करके', 'सकें', 'करना', 'करते']
English: of -> Hindi: ['में', 'आने', 'सबसे', 'जाने', 'वाले']
English: a -> Hindi: ['बना', 'नया', 'ऐसा', 'बड़ा', 'एक']
English: </s> -> Hindi: ['.', 'ik', 'nx', 'neq', '50s1']
English: in -> Hindi: ['में', 'सामने', 'बाहर', 'उभरकर', 'जहाँ']
English: is -> Hindi: ['है', 'यह', 'जो', 'हुआ', 'होता']


## 3b & 3c - Evaluate P@1, P@5 and Translation Accuracy Using the MUSE Test Dictionary

In [None]:
def load_muse_test_dict(file_path, keep_all=False):
    """
    Read a whitespace-separated "<english> <hindi>" dictionary file for evaluation.

    Lines that do not contain exactly two tokens are ignored.

    Parameters:
    file_path (str): Path to the UTF-8 dictionary file
    keep_all (bool): If False (default), each English word maps to a single
        Hindi string, and when a word occurs on several lines the last line
        wins. If True, each English word maps to the list of all its distinct
        Hindi translations in file order; code consuming that form must test
        membership in the list rather than equality with one string.

    Returns:
    test_dict (dict): English word -> Hindi word (or list of Hindi words when keep_all=True)
    """
    test_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            if len(words) != 2:
                continue
            en_word, hi_word = words
            if keep_all:
                golds = test_dict.setdefault(en_word, [])
                if hi_word not in golds:
                    golds.append(hi_word)
            else:
                test_dict[en_word] = hi_word
    return test_dict

In [37]:
def evaluate_translation(translations, test_dict, top_k=5):
    """
    Score predicted translations against a reference dictionary.

    Only reference words that have at least one prediction are scored; all
    three scores share that count as their denominator.

    Parameters:
    translations (dict): Dictionary with English words as keys and ranked Hindi translations as values
    test_dict (dict): Dictionary with English words as keys and, as values,
        either one correct Hindi word (str) or a collection (list/set/tuple)
        of acceptable Hindi words
    top_k (int): Number of top nearest neighbors considered for Precision@k evaluation

    Returns:
    precision_at_1 (float): Fraction of scored words whose first prediction is correct
    precision_at_5 (float): Fraction of scored words with a correct word among the first top_k predictions
    accuracy (float): Fraction of scored words that are correct at rank 1 or
        within the first top_k predictions; equal to precision_at_5 whenever top_k >= 1
    All three are 0 when no reference word has a prediction.
    """
    evaluated = 0
    hits_at_1 = 0
    hits_at_k = 0
    hits_any = 0

    for en_word, gold in test_dict.items():
        predicted_hi_words = translations.get(en_word, [])
        if len(predicted_hi_words) == 0:
            continue
        evaluated += 1

        # Accept a single gold word or several acceptable gold words.
        gold_words = {gold} if isinstance(gold, str) else set(gold)

        top1_hit = predicted_hi_words[0] in gold_words
        topk_hit = any(word in gold_words for word in predicted_hi_words[:top_k])

        hits_at_1 += top1_hit
        hits_at_k += topk_hit
        hits_any += (top1_hit or topk_hit)

    if evaluated == 0:
        return 0, 0, 0

    precision_at_1 = hits_at_1 / evaluated
    precision_at_5 = hits_at_k / evaluated
    accuracy = hits_any / evaluated

    return precision_at_1, precision_at_5, accuracy

# NOTE(review): 'en-hi.txt' is the same file the alignment lexicon (en_hi_pairs) was
# loaded from, so these scores are measured on pairs that were also used to learn W.
# A held-out test dictionary would be needed for an unbiased estimate - confirm intent.
# test_dict also keeps only one Hindi word per English word (later lines overwrite earlier ones).
test_dict = load_muse_test_dict('en-hi.txt')
precision_at_1, precision_at_5, accuracy = evaluate_translation(translations, test_dict)

print(f"Precision@1: {precision_at_1:.4f}")
print(f"Precision@5: {precision_at_5:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Precision@1: 0.3644
Precision@5: 0.6547
Accuracy: 0.6547



## 3d - Cosine Similarity Computation

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarities(en_word_pairs, en_embeddings, hi_embeddings, num_pairs=50):
    """
    Compute cosine similarities between English and Hindi word pairs.

    Pairs are processed in order; a pair is skipped when either word has no
    embedding. Processing stops after num_pairs pairs have been scored.

    Parameters:
    en_word_pairs (list): List of tuples containing (English word, Hindi word) pairs
    en_embeddings (dict): Dictionary containing English word embeddings
    hi_embeddings (dict): Dictionary containing Hindi word embeddings
    num_pairs (int or None): Number of word pairs to compute similarities for;
        None scores every pair, and a value <= 0 returns an empty dict

    Returns:
    similarities (dict): Dictionary with word pairs as keys and cosine similarity as values.
        A pair involving an all-zero vector gets similarity 0.
    """
    similarities = {}
    if num_pairs is not None and num_pairs <= 0:
        return similarities

    count = 0
    for en_word, hi_word in en_word_pairs:
        if en_word not in en_embeddings or hi_word not in hi_embeddings:
            continue

        en_vec = en_embeddings[en_word]
        hi_vec = hi_embeddings[hi_word]

        dot = np.dot(en_vec, hi_vec)
        denom = np.linalg.norm(en_vec) * np.linalg.norm(hi_vec)
        # If either vector is all zeros, the dot product is already 0; using
        # it directly avoids a 0/0 division that would yield NaN.
        similarities[(en_word, hi_word)] = dot / denom if denom > 0 else dot

        count += 1
        if num_pairs is not None and count >= num_pairs:
            break

    return similarities

# Score the first 50 dictionary pairs whose words both have embeddings.
# NOTE(review): this passes en_embeddings (the vectors before the mapping W is applied),
# not aligned_en_embeddings. If the goal is to measure similarity after alignment,
# aligned_en_embeddings should be passed instead - confirm intent.
cosine_similarities = compute_cosine_similarities(en_hi_pairs, en_embeddings, hi_embeddings, num_pairs=50)

# Print some cosine similarities
for (en_word, hi_word), similarity in cosine_similarities.items():
    print(f"English: {en_word}, Hindi: {hi_word}, Similarity: {similarity:.4f}")


English: and, Hindi: और, Similarity: 0.0755
English: was, Hindi: था, Similarity: -0.0464
English: was, Hindi: थी, Similarity: 0.0072
English: for, Hindi: लिये, Similarity: -0.0317
English: that, Hindi: उस, Similarity: -0.0120
English: that, Hindi: कि, Similarity: -0.0811
English: with, Hindi: साथ, Similarity: 0.0568
English: from, Hindi: से, Similarity: 0.1069
English: from, Hindi: इससे, Similarity: 0.0309
English: this, Hindi: ये, Similarity: -0.1552
English: this, Hindi: यह, Similarity: -0.1453
English: this, Hindi: इस, Similarity: -0.2058
English: his, Hindi: उसकी, Similarity: 0.0269
English: his, Hindi: उसका, Similarity: 0.0216
English: his, Hindi: उसके, Similarity: 0.0578
English: not, Hindi: नही, Similarity: 0.0132
English: not, Hindi: नहीं, Similarity: 0.0316
English: are, Hindi: हैं, Similarity: -0.0422
English: talk, Hindi: बात, Similarity: -0.0482
English: which, Hindi: जिससे, Similarity: 0.0348
English: also, Hindi: भी, Similarity: -0.0750
English: has, Hindi: रै, Similarity

## 3e - Ablation Study

In [None]:
def perform_ablation_study(en_embeddings, hi_embeddings, lexicon_sizes=(5000, 10000),
                           lexicon_path='en-hi.txt', test_path='en-hi.txt',
                           top_k=5, limit_size=2000):
    """
    Repeat alignment, translation and evaluation for several training-lexicon sizes.

    For each size, the first `size` lines of the lexicon are used to learn the
    mapping; the mapped English embeddings are then translated and scored.

    Parameters:
    en_embeddings (dict): English word embeddings
    hi_embeddings (dict): Hindi word embeddings
    lexicon_sizes (iterable of int): Numbers of leading lexicon pairs to train on
    lexicon_path (str): Bilingual dictionary used for training
    test_path (str): Dictionary used for evaluation. It defaults to the same
        file as lexicon_path, in which case the scores are measured on pairs
        that overlap the training pairs; pass a held-out file for an unbiased estimate
    top_k (int): Number of nearest neighbours retrieved and scored per word
    limit_size (int or None): Number of English words to translate per run

    Returns:
    results (dict): lexicon size -> (precision_at_1, precision_at_5, accuracy)
    """
    results = {}
    all_pairs = load_bilingual_lexicon(lexicon_path)
    # Loop-invariant: the evaluation dictionary is the same for every size.
    test_dict = load_muse_test_dict(test_path)

    for size in lexicon_sizes:
        print(f"Performing alignment with lexicon size: {size}")
        # Use a subset of the bilingual lexicon
        train_pairs = all_pairs[:size]

        # Extract embeddings for the word pairs
        train_en_vecs, train_hi_vecs = extract_word_embeddings(train_pairs, en_embeddings, hi_embeddings)

        # Perform Procrustes alignment
        mapping = orthogonal_procrustes(train_en_vecs, train_hi_vecs)

        # Apply the learned mapping to all English word embeddings
        mapped_en_embeddings = apply_mapping(en_embeddings, mapping)

        # Perform translation
        predicted = translate_words(mapped_en_embeddings, hi_embeddings, top_k=top_k, limit_size=limit_size)

        # Evaluate translation
        precision_at_1, precision_at_5, accuracy = evaluate_translation(predicted, test_dict, top_k=top_k)

        results[size] = (precision_at_1, precision_at_5, accuracy)

    return results

# Run the ablation with the function's default lexicon sizes.
ablation_results = perform_ablation_study(en_embeddings, hi_embeddings)

# One block of scores per lexicon size.
for size, (p1, p5, acc) in ablation_results.items():
    print(f"Lexicon size: {size}")
    print(f"  Precision@1: {p1:.4f}")
    print(f"  Precision@5: {p5:.4f}")
    print(f"  Accuracy: {acc:.4f}")

Performing alignment with lexicon size: 5000
Performing alignment with lexicon size: 10000
Lexicon size: 5000
  Precision@1: 0.4121
  Precision@5: 0.7208
  Accuracy: 0.7208
Lexicon size: 10000
  Precision@1: 0.3962
  Precision@5: 0.7080
  Accuracy: 0.7080
