<a href="https://colab.research.google.com/github/sadia-aly/LSTM/blob/main/Final_Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("muhammadanasmahmood/bible-dataset-with-english-to-urdu-translation")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/muhammadanasmahmood/bible-dataset-with-english-to-urdu-translation/versions/1


In [57]:
import os

filepath = os.path.join(path, 'bible.csv')
ds = pd.read_csv(filepath, header=None); ds.columns = ['English', 'Urdu']
print(f" Loaded {len(ds)} Bible translation pairs")

‚úÖ Loaded 7957 Bible translation pairs


Preprocess

In [58]:

def preprocess(s):
    s = str(s).lower(); s = re.sub(r'[?.!,;]', ' ', s)
    s = re.sub(r'[^a-zA-Z0-9\u0600-\u06FF\s]', ' ', s); return s.strip()

ds['English_clean'] = ds['English'].apply(preprocess)
ds['Urdu_clean'] = ds['Urdu'].apply(preprocess)
input_texts, target_texts = ds['English_clean'].tolist(), ["<sos> " + t + " <eos>" for t in ds['Urdu_clean']]


In [59]:

def preprocess(s):
    s = str(s).lower(); s = re.sub(r'[?.!,;]', ' ', s)
    s = re.sub(r'[^a-zA-Z0-9\u0600-\u06FF\s]', ' ', s); return s.strip()

ds['English_clean'] = ds['English'].apply(preprocess)
ds['Urdu_clean'] = ds['Urdu'].apply(preprocess)
input_texts, target_texts = ds['English_clean'].tolist(), ["<sos> " + t + " <eos>" for t in ds['Urdu_clean']]


Vocab & Tokenization

In [60]:

def build_vocab(texts):
    vocab = {'<pad>':0, '<unk>':1, '<sos>':2, '<eos>':3}; idx = 4
    word_to_idx = vocab.copy()
    for text in texts:
        for w in text.split():
            if w not in word_to_idx: word_to_idx[w] = idx; idx += 1
    return word_to_idx, {v:k for k,v in word_to_idx.items()}

src_vocab, src_idx2word = build_vocab(input_texts)
tgt_vocab, tgt_idx2word = build_vocab(target_texts)
src_vocab_size, tgt_vocab_size = len(src_vocab), len(tgt_vocab)

def texts_to_seqs(texts, vocab):
    return [[vocab.get(w, 1) for w in text.split()] for text in texts]

encoder_inputs = texts_to_seqs(input_texts, src_vocab)
decoder_inputs = texts_to_seqs(target_texts, tgt_vocab)
decoder_targets = [seq[1:] + [3] for seq in decoder_inputs]
max_enc_len, max_dec_len = max(map(len, encoder_inputs)), max(map(len, decoder_inputs))
print(f" Vocab: {src_vocab_size}‚Üí{tgt_vocab_size}, Max len: {max_enc_len}‚Üí{max_dec_len}")


‚úÖ Vocab: 5962‚Üí9108, Max len: 68‚Üí86


Padding

In [61]:

def pad_seqs(seqs, maxlen, pad=0):
    padded = np.full((len(seqs), maxlen), pad, dtype=np.int32)
    for i, seq in enumerate(seqs): padded[i, :len(seq)] = seq
    return padded

p_enc = pad_seqs(encoder_inputs, max_enc_len)
p_dec_in = pad_seqs(decoder_inputs, max_dec_len)
p_dec_tgt = pad_seqs(decoder_targets, max_dec_len)


Dataset

In [62]:

class TransDataset(Dataset):
    def __init__(self, enc, dec_in, dec_tgt):
        self.enc = torch.LongTensor(enc); self.dec_in = torch.LongTensor(dec_in)
        self.dec_tgt = torch.LongTensor(dec_tgt)
    def __len__(self): return len(self.enc)
    def __getitem__(self, idx):
        return {'enc':self.enc[idx], 'dec_in':self.dec_in[idx], 'dec_tgt':self.dec_tgt[idx]}

dataset = TransDataset(p_enc, p_dec_in, p_dec_tgt)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


MultiHeadAttention

In [63]:

class MultiHeadAttn(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.1):
        super().__init__(); self.d_model, self.nhead, self.dk = d_model, nhead, d_model//nhead
        self.wq, self.wk, self.wv, self.wo = [nn.Linear(d_model, d_model) for _ in range(4)]
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key=None, value=None, mask=None):
        if key is None: key = query
        if value is None: value = query
        Q, K, V = self.wq(query), self.wk(key), self.wv(value)
        Q, K, V = map(lambda x: x.view(x.size(0), -1, self.nhead, self.dk).transpose(1,2), [Q,K,V])

        scores = torch.matmul(Q, K.transpose(-2,-1)) / np.sqrt(self.dk)
        if mask is not None: scores.masked_fill_(mask==0, -1e9)
        attn = F.softmax(scores, dim=-1); attn = self.dropout(attn)
        attn = torch.matmul(attn, V).transpose(1,2).contiguous().view(attn.size(0), -1, self.d_model)
        return self.wo(attn)

In [64]:

class EncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dff, drop=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttn(d_model, nhead, drop)
        self.ff = nn.Sequential(nn.Linear(d_model, dff), nn.ReLU(), nn.Dropout(drop), nn.Linear(dff, d_model))
        self.norm1, self.norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.drop = nn.Dropout(drop)

    def forward(self, x, mask=None):
        attn = self.self_attn(x, mask=mask); x = self.norm1(x + self.drop(attn))
        ff = self.ff(x); return self.norm2(x + self.drop(ff))

class Encoder(nn.Module):
    def __init__(self, d_model, nhead, nlayers, dff, max_len=100):
        super().__init__()
        self.layers = nn.ModuleList([EncoderLayer(d_model, nhead, dff) for _ in range(nlayers)])
        self.pos_enc = nn.Parameter(torch.zeros(1, max_len, d_model))

    def forward(self, x, mask=None):
        x = x + self.pos_enc[:, :x.size(1)]
        for layer in self.layers: x = layer(x, mask)
        return x


Decoder

In [65]:

class DecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dff, drop=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttn(d_model, nhead, drop)
        self.cross_attn = MultiHeadAttn(d_model, nhead, drop)
        self.ff = nn.Sequential(nn.Linear(d_model, dff), nn.ReLU(), nn.Dropout(drop), nn.Linear(dff, d_model))
        self.norms = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(3)])
        self.drop = nn.Dropout(drop)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        self_out = self.self_attn(x, mask=tgt_mask); x = self.norms[0](x + self.drop(self_out))
        cross_out = self.cross_attn(x, enc_out, enc_out, mask=src_mask); x = self.norms[1](x + self.drop(cross_out))
        ff_out = self.ff(x); return self.norms[2](x + self.drop(ff_out))

class Decoder(nn.Module):
    def __init__(self, d_model, nhead, nlayers, dff, tgt_vocab_size, max_len=100):
        super().__init__()
        self.embed = nn.Embedding(tgt_vocab_size, d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, nhead, dff) for _ in range(nlayers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.pos_enc = nn.Parameter(torch.zeros(1, max_len, d_model))

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        x = self.embed(x) + self.pos_enc[:, :x.size(1)]
        for layer in self.layers: x = layer(x, enc_out, src_mask, tgt_mask)
        return self.fc_out(x)


Transformer & Init

In [66]:

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=8,
                 n_enc_layers=4, n_dec_layers=4, dff=1024, max_len=100):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.encoder = Encoder(d_model, nhead, n_enc_layers, dff, max_len)
        self.decoder = Decoder(d_model, nhead, n_dec_layers, dff, tgt_vocab_size, max_len)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src_emb = self.src_embed(src)
        enc_out = self.encoder(src_emb, src_mask)
        return self.decoder(tgt, enc_out, src_mask, tgt_mask)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(src_vocab_size, tgt_vocab_size).to(device)
print(f" Model on {device}: {sum(p.numel() for p in model.parameters()):,}")


‚úÖ Model on cuda: 13,622,676


 Masks & Train Data

In [67]:

def create_masks(src, tgt):
    src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
    seq_len = tgt.size(1); tgt_mask = torch.tril(torch.ones((seq_len, seq_len), device=device)).bool()
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(0) & (tgt != 0).unsqueeze(1).unsqueeze(3)
    return src_mask, tgt_mask

train_ds = TransDataset(p_enc, p_dec_in, p_dec_tgt)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)


In [68]:

def train_epoch(model, loader, opt, crit):
    model.train(); total_loss = 0
    for batch in loader:
        src, tgt_in, tgt_out = batch['enc'].to(device), batch['dec_in'][:,:-1].to(device), batch['dec_tgt'][:,1:].to(device)
        src_mask, tgt_mask = create_masks(src, tgt_in)
        opt.zero_grad(); out = model(src, tgt_in, src_mask, tgt_mask)
        loss = crit(out.reshape(-1, out.size(-1)), tgt_out.reshape(-1))
        loss.backward(); torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0); opt.step()
        total_loss += loss.item()
    return total_loss / len(loader)

opt = torch.optim.Adam(model.parameters(), lr=1e-4)
crit = nn.CrossEntropyLoss(ignore_index=0)
print("üöÄ Training...")
for epoch in range(3):
    loss = train_epoch(model, train_loader, opt, crit)
    print(f"Epoch {epoch+1}/3, Loss: {loss:.4f}")
torch.save(model.state_dict(), 'transformer_enur.pth'); print(" Saved!")

üöÄ Training...
Epoch 1/3, Loss: 6.0432
Epoch 2/3, Loss: 5.3968
Epoch 3/3, Loss: 5.1568
‚úÖ Saved!


In [69]:

model.load_state_dict(torch.load('transformer_enur.pth', map_location=device)); model.eval()

FALLBACKS = {"hi":"ÿ≥ŸÑÿßŸÖ", "hello":"€Å€åŸÑŸà", "hey":"€Å€åŸÑŸà", "good morning":"ÿµÿ®ÿ≠ ÿ®ÿÆ€åÿ±",
             "thank you":"ÿ¥⁄©ÿ±€å€Å", "thanks":"ÿ¥⁄©ÿ±€å€Å", "how are you":"ÿ¢Ÿæ ⁄©€åÿ≥€í €Å€å⁄∫", "bye":"ÿßŸÑŸàÿØÿßÿπ"}

def translate(text):
    text = text.strip().lower()
    if not text: return "English text?"
    phrase = ' '.join(text.split()[:3])
    if phrase in FALLBACKS: return FALLBACKS[phrase]

    tokens = [src_vocab.get(w, 1) for w in text.split()]
    src = torch.zeros(1, 50, dtype=torch.long, device=device); src[0,:len(tokens)] = torch.tensor(tokens)
    tgt = torch.tensor([[2]], device=device)

    with torch.no_grad():
        for _ in range(20):
            mask = torch.tril(torch.ones(1, tgt.size(1), tgt.size(1), device=device))
            logits = model(src, tgt, None, mask)
            next_tok = logits[0,-1].argmax().item()
            tgt = torch.cat([tgt, torch.tensor([[next_tok]], device=device)], dim=1)
            if next_tok == 3: break

    out = []; toks = tgt[0,1:].tolist()
    for tok in toks:
        if tok == 3: break
        if 4 <= tok < len(tgt_idx2word): out.append(tgt_idx2word[tok])
    return ' '.join(out) or "No translation"

gr.Interface(translate,
             gr.Textbox(placeholder="hi, hello, thank you..."),
             gr.Textbox(),
             title=" English‚ÜíUrdu Transformer",
             examples=[["hi"],["hello"],["thank you"]]).launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3625e82c150554eb6c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


