In [None]:
# Lab Assignment 3: Morphological Analysis with Finite State Transducers (FST) and Deep Learning
# •	Implement a Finite State Transducer (FST) for morphological parsing (e.g., handling verb conjugations and noun declensions in an Indian language like Hindi or Sanskrit).
# •	Train a sequence-to-sequence deep learning model (LSTM-based) to predict morphemes for unseen words.
# •	Compare performance between FST and deep learning approaches.

In [None]:
# Step 1 - Install Dependencies
!pip install nltk torch torchtext matplotlib --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m91.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 2: FST-Based Morphological Parser (Simple Demo)
def fst_morphological_parser(word):
    """Split an English word into ``(root, suffix)`` with suffix-stripping rules.

    The rules are heuristic and cover only the suffixes ``ing``, ``ed``,
    ``s`` and ``er``:

    * a suffix is stripped only when at least two characters, one of them a
      vowel, remain as the stem (so ``ed`` and ``sing`` are left whole);
    * a final ``s`` is not treated as a suffix when the stem itself ends in
      ``s`` (``class`` stays ``class``);
    * a doubled final consonant produced by ``ing``/``ed``/``er`` is undone
      (``running`` -> ``run``), except for letters that are commonly doubled
      in the bare stem (``tall``, ``pass``, ``stuff``, ``buzz``).

    Args:
        word: the surface form to analyse (matching is case-sensitive).

    Returns:
        A ``(root, suffix)`` tuple; ``suffix`` is ``''`` and ``root`` is the
        unchanged word when no rule applies.
    """
    # Suffix -> text appended to the stem once the suffix has been removed.
    rules = {
        'ing': '',     # running -> run
        'ed': '',      # jumped -> jump
        's': '',       # dogs -> dog
        'er': '',      # taller -> tall
    }
    vowels = set('aeiouy')
    # A trailing pair of these letters is kept as-is: vowels are never the
    # result of consonant doubling, and l/s/f/z are often doubled in the stem.
    keep_doubled = vowels | set('lsfz')

    for suffix, root_suffix in rules.items():
        if not word.endswith(suffix):
            continue
        stem = word[:-len(suffix)]
        # Reject splits that leave no plausible stem ("ed" -> "", "sing" -> "s").
        if len(stem) < 2 or vowels.isdisjoint(stem.lower()):
            continue
        # "class"/"boss": the final "s" belongs to the stem, not to a plural.
        if suffix == 's' and stem.endswith('s'):
            continue
        # Undo consonant doubling before a vowel-initial suffix; short stems
        # such as "add" or "egg" are left alone.
        if (
            suffix != 's'
            and len(stem) >= 4
            and stem[-1] == stem[-2]
            and stem[-1].lower() not in keep_doubled
        ):
            stem = stem[:-1]
        return stem + root_suffix, suffix
    return word, ''  # no transformation

# Demo: run the rule-based parser over a handful of sample words and print
# the root/suffix split it produces for each one.
words = ['running', 'jumped', 'dogs', 'taller', 'run']
print("FST Morphological Analysis:")
for word, (root, suffix) in zip(words, map(fst_morphological_parser, words)):
    print(f"{word} -> root: {root}, suffix: {suffix}")

FST Morphological Analysis:
running -> root: runn, suffix: ing
jumped -> root: jump, suffix: ed
dogs -> root: dog, suffix: s
taller -> root: tall, suffix: er
run -> root: run, suffix: 


In [None]:
# Step 3: Sequence-to-Sequence Model (LSTM) for Morpheme Prediction
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Create toy dataset
# Each entry is (surface word, analysis); the analysis is the root followed by
# the suffix, separated by a single space.
dataset = [
    ('running', 'run ing'),
    ('jumped', 'jump ed'),
    ('taller', 'tall er'),
    ('dogs', 'dog s'),
    ('played', 'play ed'),
]

In [None]:
# Character-level tokenizer
def tokenize(word):
    """Return the items of ``word`` (its characters, for a string) as a list."""
    return [symbol for symbol in word]

In [None]:
# Vocabulary
# Collect every character that occurs in the surface words or in the analyses
# (the analyses are joined with spaces, so ' ' is part of the vocabulary).
surface_text = "".join(word for word, _ in dataset)
analysis_text = " ".join(analysis for _, analysis in dataset)
all_chars = sorted(set(surface_text) | set(analysis_text))

# Indices start at 1; index 0 is reserved for padding.
char2idx = {char: index for index, char in enumerate(all_chars, start=1)}
idx2char = dict(enumerate(all_chars, start=1))

In [None]:
# Encoding
def encode(seq):
    """Map each character of ``seq`` to its vocabulary index.

    Returns a 1-D ``torch.long`` tensor. A character that is missing from
    ``char2idx`` raises ``KeyError``.
    """
    indices = list(map(char2idx.__getitem__, seq))
    return torch.tensor(indices, dtype=torch.long)

In [None]:
class MorphDataset(Dataset):
    """Dataset over ``(word, analysis)`` string pairs, encoded on access."""

    def __init__(self, data):
        # Sequence of (source string, target string) pairs, kept by reference.
        self.data = data

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two index tensors (source, target)."""
        word, analysis = self.data[idx]
        encoded_word = encode(tokenize(word))
        encoded_analysis = encode(tokenize(analysis))
        return encoded_word, encoded_analysis

def collate_fn(batch):
    """Pad a batch of ``(source, target)`` tensor pairs to common lengths.

    Sources and targets are padded separately (with 0) and returned as two
    batch-first tensors.
    """
    padded = tuple(
        pad_sequence(sequences, batch_first=True) for sequences in zip(*batch)
    )
    srcs, tgts = padded
    return srcs, tgts

# Batches of two (word, analysis) pairs drawn in shuffled order; collate_fn
# pads the sequences inside each batch to a common length.
train_loader = DataLoader(MorphDataset(dataset), batch_size=2, shuffle=True, collate_fn=collate_fn)

# Seq2Seq model with LSTM
class Seq2Seq(nn.Module):
    """Character-level LSTM encoder-decoder with a shared embedding table.

    Index 0 is the padding index, so both the embedding and the output layer
    are sized ``vocab_size + 1``.
    """

    def __init__(self, vocab_size, embed_size=32, hidden_size=64):
        super().__init__()
        num_symbols = vocab_size + 1  # one extra slot for padding index 0
        self.embed = nn.Embedding(num_symbols, embed_size, padding_idx=0)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_symbols)

    def forward(self, src, tgt):
        """Encode ``src`` and decode ``tgt`` from the encoder's final state.

        Returns the per-position logits over the ``vocab_size + 1`` symbols
        for every position of ``tgt``.
        """
        # Only the final (hidden, cell) state of the encoder is passed on.
        _, encoder_state = self.encoder(self.embed(src))
        decoder_out, _ = self.decoder(self.embed(tgt), encoder_state)
        return self.fc(decoder_out)

# Training
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Seq2Seq(len(char2idx)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Padding positions (index 0) do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Teacher forcing: the decoder reads target characters 1..n-1 and is
        # scored on characters 2..n.
        # NOTE(review): the encoded targets carry no start/end markers, so the
        # first target character is never a prediction target and no stop
        # symbol is learned -- confirm whether that is intended.
        decoder_input = tgt[:, :-1]
        decoder_target = tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        loss = loss_fn(logits.flatten(0, 1), decoder_target.flatten())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The reported value is the sum of the batch losses for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 8.5957
Epoch 2, Loss: 7.3428
Epoch 3, Loss: 6.3776
Epoch 4, Loss: 4.4763
Epoch 5, Loss: 3.1496
Epoch 6, Loss: 2.1778
Epoch 7, Loss: 1.9020
Epoch 8, Loss: 1.2863
Epoch 9, Loss: 0.7752
Epoch 10, Loss: 0.5774
Epoch 11, Loss: 0.4509
Epoch 12, Loss: 0.3277
Epoch 13, Loss: 0.2757
Epoch 14, Loss: 0.1757
Epoch 15, Loss: 0.1704
Epoch 16, Loss: 0.1005
Epoch 17, Loss: 0.0847
Epoch 18, Loss: 0.0751
Epoch 19, Loss: 0.0571
Epoch 20, Loss: 0.0449


In [None]:
# Step 4: Evaluate the LSTM Morphological Analyzer
def predict(model, word, max_len=15):
    """Greedily decode an analysis string for ``word``.

    The word is encoded with the model's encoder, then the decoder is run one
    character at a time, feeding each predicted character back as the next
    input. Decoding starts from the space character and stops when the
    predicted index has no entry in ``idx2char`` or after ``max_len`` steps.

    Args:
        model: a model exposing ``embed``, ``encoder``, ``decoder`` and ``fc``;
            it is switched to eval mode.
        word: the surface form to analyse; every character must be in
            ``char2idx``.
        max_len: maximum number of characters to generate (default 15).

    Returns:
        The generated characters joined into one string; ``''`` for an empty
        ``word``.
    """
    # Nothing to encode; a zero-length sequence cannot be run through the
    # LSTM encoder.
    if not word:
        return ''

    model.eval()
    with torch.no_grad():
        src = encode(tokenize(word)).unsqueeze(0).to(device)
        _, (h, c) = model.encoder(model.embed(src))

        # NOTE(review): the space character stands in for a start-of-sequence
        # marker here -- confirm this matches how the decoder was trained.
        input_tgt = torch.tensor([[char2idx[' ']]], device=device)
        output_seq = []

        for _ in range(max_len):
            embedded_tgt = model.embed(input_tgt)
            out, (h, c) = model.decoder(embedded_tgt, (h, c))
            logits = model.fc(out[:, -1, :])
            predicted = logits.argmax(dim=-1)
            # Indices missing from idx2char (e.g. padding index 0) end decoding.
            char = idx2char.get(predicted.item(), '')
            if char == '':
                break
            output_seq.append(char)
            input_tgt = predicted.unsqueeze(0)
        return ''.join(output_seq)

In [None]:
# Test prediction
# All three sample words also appear in the training dataset.
test_words = ['running', 'taller', 'dogs']
print("\nLSTM Predictions:")
for word in test_words:
    analysis = predict(model, word)
    print(f"{word} -> {analysis}")


LSTM Predictions:
running -> un ing ing sssr
taller -> all errrrrg srs
dogs -> og ssssrsrsrsrs
