# Install Libraries

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m649.6 kB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━

# Load and Process Data

In [None]:
from gensim.models import KeyedVectors

def load_top_words(path, top_k=100000):
    """Load at most ``top_k`` vectors from a word2vec-format embedding file.

    Args:
        path: Location of the embedding file; handed unchanged to
            ``KeyedVectors.load_word2vec_format``.
        top_k: Forwarded as the ``limit`` argument, capping how many vectors
            are read from the file.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for that file.
    """
    return KeyedVectors.load_word2vec_format(path, limit=top_k)

In [None]:
def load_lexicon(path, src_model, tgt_model):
    """Read a bilingual lexicon file and keep the pairs both models know.

    Each usable line holds exactly two whitespace-separated tokens: a source
    word followed by its target translation.

    Args:
        path: Path of the UTF-8 encoded lexicon file.
        src_model: Container supporting ``word in src_model`` for source words.
        tgt_model: Container supporting ``word in tgt_model`` for target words.

    Returns:
        A list of ``(src, tgt)`` tuples, in file order, for which the source
        word is in ``src_model`` and the target word is in ``tgt_model``.
    """
    pairs = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # Blank lines (e.g. a trailing newline) and lines that do not hold
            # exactly two tokens cannot be unpacked into a (src, tgt) pair;
            # skip them instead of aborting the whole load with a ValueError.
            if len(fields) != 2:
                continue
            src, tgt = fields
            if src in src_model and tgt in tgt_model:
                pairs.append((src, tgt))
    return pairs

In [None]:
import numpy as np

def get_embedding_matrices(pairs, src_model, tgt_model):
    """Stack the vectors of aligned word pairs into two column-oriented matrices.

    Args:
        pairs: Iterable of ``(src_word, tgt_word)`` tuples.
        src_model: Mapping-like object indexed by source word.
        tgt_model: Mapping-like object indexed by target word.

    Returns:
        A tuple ``(src, tgt)`` of NumPy arrays. Each array is built with one
        row per pair and then transposed, so column ``i`` of both arrays
        belongs to the ``i``-th pair.
    """
    # Look up both sides of each pair in a single pass so `pairs` is iterated once
    # and lookups happen in the same order as the pairs themselves.
    looked_up = [(src_model[s_word], tgt_model[t_word]) for s_word, t_word in pairs]
    src_rows = [row[0] for row in looked_up]
    tgt_rows = [row[1] for row in looked_up]
    return np.array(src_rows).T, np.array(tgt_rows).T

In [None]:
def procrustes(X, Y):
    """Solve the orthogonal Procrustes problem for two column-oriented matrices.

    Args:
        X: Array whose columns are the source vectors.
        Y: Array whose columns are the matching target vectors.

    Returns:
        The matrix ``W = U @ Vt``, where ``U`` and ``Vt`` come from the SVD
        of ``Y @ X.T``.
    """
    cross_product = Y @ X.T
    # The singular values are not needed: only the two orthogonal factors form W.
    left_vectors, _singular_values, right_vectors_t = np.linalg.svd(cross_product)
    return left_vectors @ right_vectors_t

In [None]:
def apply_mapping(src_model, W):
    """Multiply every vector of a model by the mapping matrix ``W``.

    Args:
        src_model: Object exposing ``key_to_index`` (iterated for its words)
            and item access by word.
        W: Matrix applied on the left of each word vector.

    Returns:
        A dict from each word in ``src_model.key_to_index`` to ``W @ vector``.
    """
    mapped_vectors = {}
    for word in src_model.key_to_index:
        mapped_vectors[word] = W @ src_model[word]
    return mapped_vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def translate_words(src_words, mapped_src_vectors, tgt_model, top_k=5):
    """Find the nearest target-language words for each mapped source word.

    Args:
        src_words: Iterable of source words to translate.
        mapped_src_vectors: Mapping from source word to its mapped vector.
        tgt_model: Object exposing ``key_to_index`` and item access by word.
        top_k: Number of candidates to keep per source word.

    Returns:
        A dict from each source word found in ``mapped_src_vectors`` to a list
        of up to ``top_k`` target words, ordered by decreasing cosine
        similarity. Words missing from ``mapped_src_vectors`` are left out.
    """
    # Fix the vocabulary order once so similarity indices map back to words.
    tgt_words = list(tgt_model.key_to_index.keys())
    tgt_matrix = np.array([tgt_model[tgt_word] for tgt_word in tgt_words])

    results = {}
    for word in src_words:
        if word not in mapped_src_vectors:
            continue
        query = mapped_src_vectors[word].reshape(1, -1)
        similarities = cosine_similarity(query, tgt_matrix).flatten()
        # Negate so argsort's ascending order puts the most similar first.
        best_indices = np.argsort(-similarities)[:top_k]
        results[word] = [tgt_words[idx] for idx in best_indices]
    return results

In [None]:
def precision_at_k(results, gold_dict, k=1):
    """Fraction of evaluated words with a reference translation in their top ``k``.

    Args:
        results: Mapping from source word to its ranked list of candidate
            translations (best first).
        gold_dict: Mapping from source word to its reference translation.
            The value may be a single string or a list/tuple/set/frozenset of
            acceptable translations.
        k: How many leading candidates to inspect for each word.

    Returns:
        ``correct / total`` over the words of ``results`` that also appear in
        ``gold_dict``, or ``0.0`` when no such word exists.
    """
    correct = 0
    total = 0
    for word in results:
        if word not in gold_dict:
            continue
        gold = gold_dict[word]
        # A bare value is one reference; a collection lists several acceptable ones.
        if isinstance(gold, (list, tuple, set, frozenset)):
            references = gold
        else:
            references = (gold,)
        top_candidates = results[word][:k]
        total += 1
        correct += int(any(ref in top_candidates for ref in references))
    # With nothing to evaluate there is no meaningful ratio; report 0.0 rather
    # than raising ZeroDivisionError.
    return correct / total if total else 0.0

# Load Text Embeddings

In [None]:
# Load the English and Hindi embedding files; load_top_words is called with its default top_k.
# NOTE(review): both paths are absolute and point under /content/drive/MyDrive —
# presumably a mounted Colab Drive; confirm the location before running elsewhere.
en_model = load_top_words('/content/drive/MyDrive/cc.en.300.vec')
hi_model = load_top_words('/content/drive/MyDrive/cc.hi.300.vec')

In [None]:
# Read the English-Hindi lexicon, keeping only pairs whose words exist in both models.
# 'en-hi.txt' is a relative path, so it is resolved against the current working directory.
bilingual_lexicon = load_lexicon('en-hi.txt', en_model, hi_model)

In [None]:
# X holds the English vectors and Y the Hindi vectors of the lexicon pairs;
# get_embedding_matrices transposes its stacks, so column i of X and Y is the i-th pair.
X, Y = get_embedding_matrices(bilingual_lexicon, en_model, hi_model)

# Apply the Procrustes Method to Align the Embedding Spaces

In [None]:
# Import under an alias: a plain `from scipy.spatial import procrustes` would rebind the
# name `procrustes`, replacing the notebook's own orthogonal-Procrustes function that
# later cells call to obtain the mapping matrix W.
from scipy.spatial import procrustes as scipy_spatial_procrustes

def scipy_procrustes(X, Y):
    """Run SciPy's Procrustes analysis on two column-oriented matrices.

    Args:
        X: Array with one column per point; transposed before being passed
            as SciPy's first (reference) data set.
        Y: Array with one column per point; transposed before being passed
            as SciPy's second data set.

    Returns:
        A tuple ``(X_standardized, disparity)``. ``X_standardized`` is the
        first matrix returned by ``scipy.spatial.procrustes``, transposed
        back to column orientation. Per SciPy's documentation that matrix is
        the standardized version of the first input, and the second input is
        the one transformed to fit it. ``disparity`` is the residual SciPy
        reports after the fit.
    """
    # SciPy expects one point per row, hence the transposes on the way in and out.
    X_standardized, _Y_fitted, disparity = scipy_spatial_procrustes(X.T, Y.T)
    return X_standardized.T, disparity
# Run the SciPy-based analysis on the lexicon matrices; keeps the first returned
# matrix and the disparity value.
X_procrustes, disparity = scipy_procrustes(X, Y)

In [None]:
# Compute the mapping matrix W from the lexicon matrices.
# NOTE(review): `procrustes` here must be the notebook's own SVD-based function that
# returns a single matrix. scipy.spatial.procrustes returns a 3-tuple, so confirm no
# earlier import has rebound this name.
W = procrustes(X, Y)

In [None]:
# Apply W to every word vector in en_model; the result is a dict keyed by English word.
mapped_en_vectors = apply_mapping(en_model, W)

In [None]:
# Small set of English words used as a qualitative check of the mapping.
test_words = ["cat", "house", "river"]
# For each test word, retrieve the 5 Hindi words most similar to its mapped vector.
translations = translate_words(test_words, mapped_en_vectors, hi_model, top_k=5)

# Print each source word followed by its ranked candidate list.
for word, trans in translations.items():
    print(f"{word} -> {trans}")

cat -> ['बिल्ली', 'कुत्ता', 'कुत्ते', 'पालतू', 'कुत्तों']
house -> ['मकान', 'फ़्लैट', 'घर', 'कमरे', 'हाउस']
river -> ['नदी', 'किनारे', 'गंगा', 'चनाब', 'नहर']


# Results

In [None]:
# Build a source -> target lookup from the lexicon pairs.
# NOTE(review): a dict keeps one value per key, so if a source word occurs in several
# pairs only its last target survives — confirm that is acceptable for scoring.
gold_dict = {src: tgt for src, tgt in bilingual_lexicon}

# Compute Precision@1 and Precision@5
# `translations` only holds entries for words from `test_words`, so both scores are
# computed over that handful of words rather than over the whole lexicon.
p1 = precision_at_k(translations, gold_dict, k=1)
p5 = precision_at_k(translations, gold_dict, k=5)

print(f"Precision@1: {p1:.4f}")
print(f"Precision@5: {p5:.4f}")

Precision@1: 0.6667
Precision@5: 1.0000


In [None]:
# Repeat the alignment with lexicon prefixes of different lengths and report precision.
sizes = [5000, 10000, 20000]
for size in sizes:
    # Slicing never fails: if the lexicon has fewer than `size` pairs, all of them are used.
    small_lexicon = bilingual_lexicon[:size]
    X_small, Y_small = get_embedding_matrices(small_lexicon, en_model, hi_model)
    W_small = procrustes(X_small, Y_small)
    # Every word in en_model is re-mapped with the matrix learned from this prefix.
    mapped_small = apply_mapping(en_model, W_small)
    # Scoring uses the same `test_words` each time, so each precision value below is
    # computed over at most len(test_words) words.
    translations_small = translate_words(test_words, mapped_small, hi_model, top_k=5)
    p1_small = precision_at_k(translations_small, gold_dict, k=1)
    p5_small = precision_at_k(translations_small, gold_dict, k=5)
    print(f"Dictionary Size: {size}, Precision@1: {p1_small:.4f}, Precision@5: {p5_small:.4f}")


Dictionary Size: 5000, Precision@1: 0.6667, Precision@5: 1.0000
Dictionary Size: 10000, Precision@1: 0.6667, Precision@5: 0.6667
Dictionary Size: 20000, Precision@1: 0.6667, Precision@5: 1.0000
