In [None]:
import random
import nltk
from nltk.corpus import words
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence


In [None]:
# Download NLTK word list
# nltk.download fetches the "words" corpus into the local NLTK data directory;
# the recorded cell output shows it reports "already up-to-date" when the
# package is present.
nltk.download('words')
# Load every corpus entry into memory as a list of word strings.
word_list = words.words()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Limit dataset for demonstration
# The corpus is alphabetically ordered, so slicing the first 5000 entries would
# keep only words that start with "a"/"A". Draw a random subset instead so the
# classifier sees words from the whole alphabet. A dedicated, seeded generator
# keeps the subset reproducible and leaves the global `random` state (used by
# the noise functions) untouched.
SAMPLE_SIZE = 5000
word_list = random.Random(42).sample(list(word_list), min(SAMPLE_SIZE, len(word_list)))

In [None]:
# Noise Functions
def add_noise(word, max_attempts=10):
    """Introduce a spelling mistake in a word.

    One of the noise operations (omit, swap, insert, replace) is picked at
    random and applied. Some operations leave certain inputs untouched (for
    example single-character words), which would create a "misspelled" sample
    identical to the correct word. The draw is therefore repeated until the
    result differs from the input.

    Args:
        word: The correctly spelled word.
        max_attempts: Maximum number of random operations to try before
            falling back to a guaranteed modification.

    Returns:
        A string that differs from ``word``.
    """
    operations = [omit_character, swap_adjacent, insert_random, replace_character]
    for _ in range(max_attempts):
        noisy = random.choice(operations)(word)
        if noisy != word:
            return noisy
    # insert_random always lengthens the word, so its result can never equal the input.
    return insert_random(word)


In [None]:
def omit_character(word):
    """Drop one randomly selected character from the word.

    Words with fewer than two characters are returned unchanged.
    """
    if len(word) <= 1:
        return word
    drop_at = random.randint(0, len(word) - 1)
    # Rebuild the word from every character except the one at drop_at
    return "".join(ch for pos, ch in enumerate(word) if pos != drop_at)

def swap_adjacent(word):
    """Swap two adjacent characters.

    Only positions whose two neighbouring characters differ are considered,
    because swapping identical letters would return the word unchanged.

    Args:
        word: The word to perturb.

    Returns:
        The word with one adjacent pair transposed, or ``word`` itself when no
        transposition can change it (fewer than two characters, or all
        characters identical).
    """
    candidates = [i for i in range(len(word) - 1) if word[i] != word[i + 1]]
    if not candidates:
        return word
    idx = random.choice(candidates)
    return word[:idx] + word[idx+1] + word[idx] + word[idx+2:]

In [None]:
def insert_random(word):
    """Add one random lowercase letter at a random position of the word.

    The position may be anywhere from before the first character to after the
    last one, so the result is always one character longer than the input.
    """
    position = random.randint(0, len(word))
    letter = random.choice("abcdefghijklmnopqrstuvwxyz")
    head, tail = word[:position], word[position:]
    return f"{head}{letter}{tail}"

def replace_character(word):
    """Replace a random character with a different lowercase letter.

    The replacement is drawn from the lowercase alphabet excluding the
    character currently at the chosen position, so the returned word always
    differs from the input in exactly one position.

    Args:
        word: The word to perturb.

    Returns:
        The perturbed word. Words with fewer than two characters are returned
        unchanged.
    """
    if len(word) > 1:
        idx = random.randint(0, len(word) - 1)
        # Exclude the current letter; otherwise the "replacement" could be a no-op.
        candidates = [c for c in "abcdefghijklmnopqrstuvwxyz" if c != word[idx]]
        return word[:idx] + random.choice(candidates) + word[idx+1:]
    return word

In [None]:
# Dataset Preparation
class SpellingDataset(Dataset):
    """Binary spelling dataset built from a list of correct words.

    Every input word contributes two samples: the word itself (label 1) and a
    noisy variant produced by ``add_noise`` (label 0). Words are encoded as
    sequences of character indices.

    Vocabulary layout: ``<pad>`` is 0, the letters a-z are 1-26 and ``<unk>``
    is 27. Characters are lowercased before lookup; anything that is still not
    a lowercase ASCII letter maps to ``<unk>`` rather than to the padding
    index, so real characters are never confused with padding.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"

    def __init__(self, word_list, max_len=10):
        """Build the samples and the character vocabulary.

        Args:
            word_list: Iterable of correctly spelled words.
            max_len: Stored on the instance; sequences are not truncated or
                padded to this length anywhere in this class.
        """
        self.data = []
        self.labels = []
        self.max_len = max_len

        for word in word_list:
            self.data.append(word)  # Correct word
            self.labels.append(1)  # Label for correct

            noisy_word = add_noise(word)  # Introduce noise
            self.data.append(noisy_word)  # Noisy word
            self.labels.append(0)  # Label for noisy

        self.char_to_idx = self.build_vocab(self.data)
        self.vocab_size = len(self.char_to_idx)

    def build_vocab(self, words):
        """Build a character vocabulary.

        The vocabulary is fixed (padding, a-z, unknown); ``words`` is accepted
        for interface compatibility but is not inspected.
        """
        vocab = {self.PAD_TOKEN: 0}  # Padding token
        for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz", start=1):
            vocab[char] = idx  # Letters keep indices 1..26
        vocab[self.UNK_TOKEN] = len(vocab)  # Dedicated index for any other character
        return vocab

    def word_to_sequence(self, word):
        """Convert word to a list of character indices (case-insensitive)."""
        # Fall back to 0 only if an externally supplied vocabulary lacks <unk>.
        unk_idx = self.char_to_idx.get(self.UNK_TOKEN, 0)
        return [self.char_to_idx.get(char, unk_idx) for char in word.lower()]

    def __len__(self):
        """Number of samples (two per input word)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` as a long tensor and a float tensor."""
        word = self.data[idx]
        label = self.labels[idx]
        seq = self.word_to_sequence(word)
        return torch.tensor(seq, dtype=torch.long), torch.tensor(label, dtype=torch.float)


In [None]:
def collate_fn(batch):
    """Merge (sequence, label) samples into one right-padded batch.

    Sequences are padded with zeros up to the longest sequence in the batch
    and stacked batch-first; labels are gathered into one float tensor.
    """
    sequences, targets = map(list, zip(*batch))
    label_tensor = torch.tensor(targets, dtype=torch.float)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, label_tensor


In [None]:
# Dataset and DataLoader
# Build the labelled samples once, then serve them in shuffled, padded mini-batches.
dataset = SpellingDataset(word_list)
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=32,
)

In [None]:
# Define RNN Model
class RNN(nn.Module):
    """Character-level RNN that scores whether a word is spelled correctly.

    Args:
        vocab_size: Number of rows in the embedding table (index 0 is padding).
        embed_dim: Size of each character embedding.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embed_dim=16, hidden_dim=32):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of index sequences.

        The hidden state is read at each row's last non-padding position
        instead of at the last time step of the padded batch. Otherwise the
        recurrence keeps running over the padding, so the score of a word
        would depend on how long the other words in its batch are and would
        differ from the score of the same word evaluated on its own.

        Args:
            x: Long tensor of character indices, shape (batch, seq_len), right
               padded with the padding index. A 1-D tensor is treated as a
               single unbatched sequence.

        Returns:
            Probabilities of shape (batch, 1), or (1,) for unbatched input.
        """
        if x.dim() == 1:
            # Preserve support for a single unbatched sequence.
            return self.forward(x.unsqueeze(0)).squeeze(0)

        embedded = self.embedding(x)  # Embed characters
        outputs, _ = self.rnn(embedded)  # Hidden state at every time step

        # Length of each row = 1-based position of its last non-padding token.
        # Rows consisting only of padding are clamped to length 1.
        steps = torch.arange(1, x.size(1) + 1, device=x.device)
        is_token = (x != self.embedding.padding_idx).long()
        lengths = (is_token * steps).max(dim=1).values.clamp(min=1)

        rows = torch.arange(x.size(0), device=x.device)
        last_hidden = outputs[rows, lengths - 1]  # (batch, hidden_dim)
        out = self.fc(last_hidden)  # Fully connected layer
        return self.sigmoid(out)

In [None]:
# Model, Loss, Optimizer
model = RNN(vocab_size=dataset.vocab_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 200
for epoch in range(epochs):
    model.train()
    batch_losses = []
    for inputs, labels in dataloader:
        # Clear gradients left over from the previous step before this batch
        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels arrive as (batch,); the model emits (batch, 1)
        targets = labels[:, None]
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Reported value is the sum of the per-batch losses for the epoch
    epoch_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

NameError: name 'RNN' is not defined

In [None]:
# Testing the Model
def test_word(word, model, dataset):
    """Test if a word is correctly or incorrectly spelled."""
    model.eval()
    seq = dataset.word_to_sequence(word)
    seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        prediction = model(seq).item()
    return "Correct" if prediction > 0.5 else "Incorrect"

# Test Examples
# Classify a handful of sample strings and report the verdict for each
test_words = ["actor", "acclimate", "bsaj", "ple"]
for word in test_words:
    verdict = test_word(word, model, dataset)
    print(f"Word: {word}, Prediction: {verdict}")

Word: actor, Prediction: Correct
Word: acclimate, Prediction: Correct
Word: bsaj, Prediction: Incorrect
Word: ple, Prediction: Incorrect


In [None]:
# Preview only a few entries; echoing thousands of words bloats the saved notebook
word_list[:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'aba',
 'Ababdeh',
 'Ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'Abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'Abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'Abanic',
 'Abantes',
 'abaptiston',
 'Abarambo',
 'Abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'Abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'Abassin',
 'abastardize',
 'abatable',
 'abate