In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import kagglehub
from torch.utils.data import Dataset , DataLoader
import os
torch.manual_seed(42)

<torch._C.Generator at 0x7b89787097f0>

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
path = kagglehub.dataset_download("sujalkumarsahni/fasttext-embeddings")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/sujalkumarsahni/fasttext-embeddings?dataset_version_number=1...


100%|██████████| 4.18G/4.18G [00:37<00:00, 121MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1


In [4]:
device = torch.device('cuda')

In [5]:
en_emb_np = np.load("/root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1/en_emb_fasttext.npy" , allow_pickle = True).item()
hi_emb_np = np.load("/root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1/hi_emb_fasttext.npy" , allow_pickle=True).item()

In [6]:
en_words = []
en_emb = []
for i in en_emb_np:
  en_words.append(i)
  en_emb.append(en_emb_np[i])

en_words = en_words[:200000]

In [7]:
hi_words = []
hi_emb = []
for i in hi_emb_np:
  hi_words.append(i)
  hi_emb.append(hi_emb_np[i])

hi_words = hi_words[:200000]

In [8]:
en_emb = torch.tensor(en_emb[:200000] , dtype=torch.float32 , device = device)
hi_emb = torch.tensor(hi_emb[:200000] , dtype = torch.float32 , device = device)

# Normalize embeddings
en_emb = en_emb / torch.norm(en_emb, dim=1, keepdim=True)
hi_emb = hi_emb / torch.norm(hi_emb, dim=1, keepdim=True)

  en_emb = torch.tensor(en_emb[:200000] , dtype=torch.float32 , device = device)


In [9]:
class Discriminator(nn.Module):

  def __init__(self , input_dim):
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(input_dim , 2048),
        nn.LeakyReLU(0.2),
        nn.Dropout(0.4),
        nn.Linear(2048 , 1),
        nn.Sigmoid()
    )

  def forward(self , x):
    return self.model(x)

In [10]:
W = nn.Parameter(torch.randn(300,300 , device=device) )

In [18]:
model_d = Discriminator(300)
model_d = model_d.to(device)
learning_rate = 0.01
optimizer_D = optim.Adam(model_d.parameters() , lr = learning_rate)
optimizer_M = optim.Adam([W] , lr = learning_rate)
loss_function = nn.BCELoss()
beta = 0.01
epochs = 150

In [19]:
# creating Real Label
real_label = torch.ones((en_emb.shape[0], 1), device=device)
fake_label = torch.zeros((en_emb.shape[0], 1), device=device)

In [20]:
for epoch in range(epochs):
    # Discriminator step
    fake_emb = en_emb @ W.detach()
    real_pred = model_d(en_emb)
    loss_d = loss_function(real_pred, real_label)

    mapped_pred = model_d(fake_emb)
    loss_w = loss_function(mapped_pred, fake_label)

    total_loss = loss_w + loss_d
    optimizer_D.zero_grad()
    total_loss.backward()
    optimizer_D.step()

    mapped_emb = en_emb @ W
    fake_pred = model_d(mapped_emb)
    loss_m = loss_function(fake_pred , real_label)

    optimizer_M.zero_grad()
    loss_m.backward()
    optimizer_M.step()

    with torch.no_grad():
      W.copy_((1 + beta) * W - beta * W @ W.T @ W)

    print(f'epoch : {epoch+1} , Loss : {total_loss} , real_pred = {real_pred.detach().mean().item()} , fake_pred = {fake_pred.detach().mean().item()}')
    print('-'*150)

epoch : 1 , Loss : 1.383894920349121 , real_pred = 0.5081536769866943 , fake_pred = 0.3782235085964203
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 2 , Loss : 1.128270149230957 , real_pred = 0.5226536393165588 , fake_pred = 0.2251812368631363
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 3 , Loss : 0.7769675254821777 , real_pred = 0.6059057712554932 , fake_pred = 0.11131540685892105
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 4 , Loss : 0.459958016872406 , real_pred = 0.736073911190033 , fake_pred = 0.06173664331436157
-------------------------------------------------------------------------------------------------------------------------------------

In [21]:
import torch

def procrustes_align(X, Y):
    """Core Procrustes alignment implementation"""

    X_centered = X - X.mean(0)
    Y_centered = Y - Y.mean(0)


    U, S, Vt = torch.svd(Y_centered.T @ X_centered)
    W_ortho = U @ Vt.T

    return W_ortho

def batch_csls_sim(X, Y, k=10, batch_size=1024):
    """Memory-efficient Cross-Domain Similarity Local Scaling
    Computes similarity matrix in batches to avoid memory issues"""
    device = X.device
    n = X.shape[0]
    m = Y.shape[0]


    r_y = torch.zeros(m, device=device)
    for i in range(0, m, batch_size):
        end = min(i + batch_size, m)
        batch_Y = Y[i:end]
        sim_batch = batch_Y @ Y.T
        r_y[i:end] = torch.topk(sim_batch, min(k, m)).values.mean(1)


    indices = []
    for i in range(0, n, batch_size):
        end = min(i + batch_size, n)
        batch_X = X[i:end]


        sim_batch = batch_X @ Y.T  # Shape: [batch_size, m]


        r_x_batch = torch.topk(sim_batch, min(k, m)).values.mean(1)


        csls_batch = 2 * sim_batch - r_x_batch.unsqueeze(1) - r_y.unsqueeze(0)


        batch_indices = csls_batch.argmax(dim=1)
        indices.append(batch_indices)

    return torch.cat(indices)

def align_embeddings(en_emb, hi_emb, num_iterations=5, k=10, batch_size=1024):
    """Align source embeddings (en_emb) to target space (hi_emb)"""
    d = en_emb.shape[1]
    W = torch.eye(d, device=en_emb.device)

    for i in range(num_iterations):
        mapped_en = en_emb @ W

        nn_indices = batch_csls_sim(mapped_en, hi_emb, k=k, batch_size=batch_size)

        W_new = procrustes_align(en_emb, hi_emb[nn_indices])

        W = W_new



    return W


W = align_embeddings(en_emb, hi_emb, num_iterations=5)
mapped_en = en_emb @ W

In [22]:
import torch

def predict_hindi_translation(english_word, en_words, en_emb, hi_words, hi_emb, W, k=5):
    """
    Predicts Hindi translations for a given English word

    Parameters:
    -----------
    english_word : str
        The English word to translate
    en_words : list
        List of all English words in the vocabulary
    en_emb : torch.Tensor
        Embeddings for all English words
    hi_words : list
        List of all Hindi words in the vocabulary
    hi_emb : torch.Tensor
        Embeddings for all Hindi words
    W : torch.Tensor
        Transformation matrix from English to Hindi embedding space
    k : int
        Number of translation candidates to return

    Returns:
    --------
    translations : list
        List of (hindi_word, confidence_score) tuples
    """
    # Check if the word is in our vocabulary
    if english_word not in en_words:
        return [("Word not in vocabulary", 0.0)]

    # Get the word's index and embedding
    word_idx = en_words.index(english_word)
    word_emb = en_emb[word_idx].unsqueeze(0)  # Add batch dimension

    # Apply the transformation
    with torch.no_grad():
        mapped_emb = word_emb @ W

        # Compute similarity with all Hindi embeddings
        similarities = mapped_emb @ hi_emb.T

        # Get top-k matches
        top_k_values, top_k_indices = torch.topk(similarities.squeeze(), k=min(k, len(hi_words)))

        # Convert to list of (word, score) tuples
        translations = []
        for i in range(len(top_k_indices)):
            hi_word = hi_words[top_k_indices[i].item()]
            confidence = top_k_values[i].item()
            translations.append((hi_word, confidence))

    return translations

translations = predict_hindi_translation("hello", en_words, en_emb, hi_words, hi_emb, W)
print(f"Translations for 'hello':")
for i, (word, score) in enumerate(translations):
    print(f"{i+1}. {word} (confidence: {score:.4f})")



Translations for 'hello':
1. ब्लूज़ (confidence: 0.3063)
2. शावेज़ (confidence: 0.2731)
3. antidepressants (confidence: 0.2704)
4. दोहरेपन (confidence: 0.2635)
5. Teamsव्यवसाय (confidence: 0.2553)


In [23]:
word = "house"
translations = predict_hindi_translation(word, en_words, en_emb, hi_words, hi_emb, W, k=5)

print(f"Translations for '{word}':")
for i, (hi_word, score) in enumerate(translations):
    print(f"{i+1}. {hi_word} (confidence: {score:.4f})")

Translations for 'house':
1. पहुंचायी (confidence: 0.2394)
2. ज़ख़्मी (confidence: 0.2307)
3. UPW (confidence: 0.2179)
4. रोंग (confidence: 0.2177)
5. धधकती (confidence: 0.2114)
