In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import kagglehub
from torch.utils.data import Dataset , DataLoader
import os
torch.manual_seed(42)

<torch._C.Generator at 0x7aff8df1d6f0>

In [2]:
device = torch.device('cuda')

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot stat 'kaggle.json': No such file or directory


In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sujalkumarsahni/fasttext-embeddings")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1


In [5]:
en_emb_np = np.load("/root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1/en_emb_fasttext.npy" , allow_pickle = True).item()

In [6]:
hi_emb_np = np.load("/root/.cache/kagglehub/datasets/sujalkumarsahni/fasttext-embeddings/versions/1/hi_emb_fasttext.npy" , allow_pickle=True).item()

In [7]:
en_words = []
en_emb = []
for i in en_emb_np:
  en_words.append(i)
  en_emb.append(en_emb_np[i])

en_words = en_words[:50000]

In [8]:
hi_words = []
hi_emb = []
for i in hi_emb_np:
  hi_words.append(i)
  hi_emb.append(hi_emb_np[i])

hi_words = hi_words[:50000]

In [9]:
en_emb = torch.tensor(en_emb[:50000] , dtype=torch.float32 , device = device)
hi_emb = torch.tensor(hi_emb[:50000] , dtype = torch.float32 , device = device)

# Normalize embeddings
en_emb = en_emb / torch.norm(en_emb, dim=1, keepdim=True)
hi_emb = hi_emb / torch.norm(hi_emb, dim=1, keepdim=True)

  en_emb = torch.tensor(en_emb[:50000] , dtype=torch.float32 , device = device)


In [10]:
class Discriminator(nn.Module):

  def __init__(self , input_dim):
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(input_dim , 2048),
        nn.LeakyReLU(0.2),
        nn.Dropout(0.4),
        nn.Linear(2048 , 1),
        nn.Sigmoid()
    )

  def forward(self , x):
    return self.model(x)

In [11]:
W = nn.Parameter(torch.randn(300,300 , device=device) )

In [12]:
W

Parameter containing:
tensor([[ 0.1940,  2.1614, -0.1721,  ..., -0.5308, -1.7914, -0.2863],
        [ 0.0805, -1.3001,  1.3404,  ...,  0.3699,  0.7433, -0.1199],
        [-0.9188,  1.3789,  0.0839,  ...,  0.3610, -0.9032, -0.3624],
        ...,
        [ 0.3048,  0.0817,  0.1234,  ..., -0.7819, -1.9529, -0.5169],
        [-1.6966, -1.3123,  0.8327,  ...,  0.1246,  0.3610,  0.4237],
        [-0.4136,  0.3002, -1.2926,  ...,  0.3381,  1.1602,  2.1106]],
       device='cuda:0', requires_grad=True)

In [14]:
model_d = Discriminator(300)
model_d = model_d.to(device)
learning_rate = 0.1
optimizer_D = optim.SGD(model_d.parameters() , lr = 0.1 )
optimizer_M = optim.SGD([W] , lr = 0.1 )
loss_function = nn.BCELoss()
epochs = 150
beta = 0.01

In [15]:
# creating  Label
real_label = torch.ones((en_emb.shape[0], 1), device=device)
fake_label = torch.zeros((en_emb.shape[0], 1), device=device)

real_label = real_label - 0.1
fake_label = real_label - 0.9

In [16]:
for epoch in range(epochs):
    # Discriminator step
    fake_emb = en_emb @ W.detach()

    x = torch.cat([fake_emb , hi_emb] , 0)

    y = torch.FloatTensor(2*fake_emb.shape[0]).zero_()
    y = y.to(device)
    y[:fake_emb.shape[0]] = real_label.squeeze()
    y[fake_emb.shape[0]:] = fake_label.squeeze()
    y = y.view(-1, 1)

    preds = model_d(x)

    total_loss = loss_function(preds , y)
    optimizer_D.zero_grad()
    total_loss.backward()
    optimizer_D.step()

    mapped_emb = en_emb @ W
    src_tgt = torch.cat([mapped_emb , hi_emb] , 0)
    preds_m = model_d(src_tgt)
    loss_m = loss_function(preds_m , 1-y)

    optimizer_M.zero_grad()
    loss_m.backward()
    optimizer_M.step()

    with torch.no_grad():

      W.copy_((1 + beta) * W - beta * W.mm(W.transpose(0, 1).mm(W)))

    print(f'epoch : {epoch+1} , Discriminator loss : {total_loss} , mapper loss : {loss_m} ')
    print('-'*150)

epoch : 1 , Discriminator loss : 0.7063919901847839 , mapper loss : 1.1315871477127075 
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 2 , Discriminator loss : 0.5512380003929138 , mapper loss : 1.1834408044815063 
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 3 , Discriminator loss : 0.5369592308998108 , mapper loss : 1.2120674848556519 
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 4 , Discriminator loss : 0.5240294933319092 , mapper loss : 1.233568787574768 
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch : 5 , Discriminator loss : 0.5119518041

In [17]:
import torch

def procrustes_align(X, Y):
    """Core Procrustes alignment implementation"""
    # Center the embeddings
    X_centered = X - X.mean(0)
    Y_centered = Y - Y.mean(0)

    # SVD decomposition
    U, S, Vt = torch.svd(Y_centered.T @ X_centered)
    W_ortho = U @ Vt.T

    return W_ortho

def batch_csls_sim(X, Y, k=10, batch_size=1024):
    """Memory-efficient Cross-Domain Similarity Local Scaling
    Computes similarity matrix in batches to avoid memory issues"""
    device = X.device
    n = X.shape[0]
    m = Y.shape[0]

    r_y = torch.zeros(m, device=device)
    for i in range(0, m, batch_size):
        end = min(i + batch_size, m)
        batch_Y = Y[i:end]
        sim_batch = batch_Y @ Y.T
        r_y[i:end] = torch.topk(sim_batch, min(k, m)).values.mean(1)

    indices = []
    for i in range(0, n, batch_size):
        end = min(i + batch_size, n)
        batch_X = X[i:end]

        sim_batch = batch_X @ Y.T

        r_x_batch = torch.topk(sim_batch, min(k, m)).values.mean(1)

        csls_batch = 2 * sim_batch - r_x_batch.unsqueeze(1) - r_y.unsqueeze(0)

        batch_indices = csls_batch.argmax(dim=1)
        indices.append(batch_indices)

    return torch.cat(indices)

def align_embeddings(en_emb, hi_emb,W ,  num_iterations=5, k=10, batch_size=1024 ):
    """Align source embeddings (en_emb) to target space (hi_emb)"""
    # Initialize transformation matrix
    d = en_emb.shape[1]


    for i in range(num_iterations):
        mapped_en = en_emb @ W

        nn_indices = batch_csls_sim(mapped_en, hi_emb, k=k, batch_size=batch_size)

        W_new = procrustes_align(en_emb, hi_emb[nn_indices])


        W = W_new



    return W

# Example usage:
W_new = align_embeddings(en_emb, hi_emb,W ,  num_iterations=5 )
mapped_en = en_emb @ W_new

Completed iteration 5/5


In [21]:
# Example: Translate a single English word
word = "house"
translations = predict_hindi_translation(word, en_words, en_emb, hi_words, hi_emb, W, k=5)

print(f"Translations for '{word}':")
for i, (hi_word, score) in enumerate(translations):
    print(f"{i+1}. {hi_word} (confidence: {score:.4f})")

# If you want to run the interactive app
# create_translation_app(en_words, en_emb, hi_words, hi_emb, W)

Translations for 'house':
1. पहचानता (confidence: 4.1560)
2. इष्ट (confidence: 4.1004)
3. मनाया (confidence: 4.0103)
4. 1847 (confidence: 3.9866)
5. दिष्ट (confidence: 3.9625)
