In [None]:
%pip install torch torchvision torchaudio transformers datasets nltk numpy pandas


In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertForMaskedLM
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict



In [23]:
# Load dataset
with open("wikipedia.dat", "r") as file:
    raw_data = file.readlines()

# Process data
corrections = {}
current_correct_word = None

for line in raw_data:
    word = line.strip()
    if word.startswith("$"):  # Correct words start with $
        current_correct_word = word[1:]
        corrections[current_correct_word] = []
    else:
        if current_correct_word:
            corrections[current_correct_word].append(word)

# Convert into a DataFrame
df = pd.DataFrame([(k, v) for k, values in corrections.items() for v in values],
                  columns=["Correct_Word", "Misspelled_Word"])

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Training samples:", len(train_df))
print("Testing samples:", len(test_df))
df.head()


Training samples: 1964
Testing samples: 491


Unnamed: 0,Correct_Word,Misspelled_Word
0,Apennines,Apenines
1,Apennines,Appenines
2,Athenian,Athenean
3,Athenians,Atheneans
4,Bernoulli,Bernouilli


In [24]:
# Function to calculate Levenshtein Distance
def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

# Function to find the closest correct word
def find_closest_word(misspelled_word, correct_words):
    distances = [(word, levenshtein_distance(misspelled_word, word)) for word in correct_words]
    closest_word = min(distances, key=lambda x: x[1])[0]
    return closest_word

# Evaluate the model on the test set
correct_words = train_df["Correct_Word"].unique()
test_df["Predicted_Correct_Word"] = test_df["Misspelled_Word"].apply(lambda x: find_closest_word(x, correct_words))

# Calculate accuracy
accuracy = np.mean(test_df["Correct_Word"] == test_df["Predicted_Correct_Word"])
print("Accuracy:", accuracy)

test_df.head()

Accuracy: 0.23421588594704684


Unnamed: 0,Correct_Word,Misspelled_Word,Predicted_Correct_Word
1598,patented,pattented,attempted
620,contributors,contributers,contributor
1266,keratin,ceratin,erratic
649,correspondents,correspondants,correspondent
1908,responsibility,responnsibilty,responsible


In [25]:
# Build n-gram frequency models
def build_ngram_model(words, n=2):
    ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
    ngram_freq = Counter(ngrams)
    return ngram_freq

correct_words = train_df["Correct_Word"].tolist()
unigram_freq = Counter(correct_words)
bigram_freq = build_ngram_model(correct_words, n=2)

# Function to calculate word likelihood
def word_likelihood(word, unigram_freq, bigram_freq):
    likelihood = unigram_freq[word] / sum(unigram_freq.values())
    for i in range(len(word)-1):
        bigram = (word[i], word[i+1])
        likelihood *= (bigram_freq[bigram] + 1) / (unigram_freq[word[i]] + len(unigram_freq))
    return likelihood

# Function to find the most likely correct word
def find_most_likely_word(misspelled_word, correct_words, unigram_freq, bigram_freq):
    likelihoods = [(word, word_likelihood(word, unigram_freq, bigram_freq)) for word in correct_words]
    most_likely_word = max(likelihoods, key=lambda x: x[1])[0]
    return most_likely_word

# Evaluate the model on the test set
test_df["Predicted_Correct_Word"] = test_df["Misspelled_Word"].apply(lambda x: find_most_likely_word(x, correct_words, unigram_freq, bigram_freq))

# Calculate accuracy
accuracy = np.mean(test_df["Correct_Word"] == test_df["Predicted_Correct_Word"])
print("Accuracy:", accuracy)

test_df.head()

Accuracy: 0.002036659877800407


Unnamed: 0,Correct_Word,Misspelled_Word,Predicted_Correct_Word
1598,patented,pattented,the
620,contributors,contributers,the
1266,keratin,ceratin,the
649,correspondents,correspondants,the
1908,responsibility,responnsibilty,the


In [26]:
# Build a model of common spelling mistakes
def build_error_model(df):
    error_model = defaultdict(Counter)
    for _, row in df.iterrows():
        correct_word = row["Correct_Word"]
        misspelled_word = row["Misspelled_Word"]
        for i in range(min(len(correct_word), len(misspelled_word))):
            if correct_word[i] != misspelled_word[i]:
                error_model[correct_word[i]][misspelled_word[i]] += 1
    return error_model

error_model = build_error_model(train_df)

# Function to calculate word probability given the noisy channel model
def word_probability(word, correct_word, error_model):
    probability = 1.0
    for i in range(min(len(word), len(correct_word))):
        if word[i] != correct_word[i]:
            denominator = sum(error_model[correct_word[i]].values()) + len(error_model[correct_word[i]])
            if denominator == 0:
                denominator = 1  # Avoid dividing by zero
            probability *= (error_model[correct_word[i]][word[i]] + 1) / denominator
    return probability

# Function to find the most likely correct word
def find_most_likely_word(misspelled_word, correct_words, error_model):
    probabilities = [(word, word_probability(misspelled_word, word, error_model)) for word in correct_words]
    most_likely_word = max(probabilities, key=lambda x: x[1])[0]
    return most_likely_word

# Evaluate the model on the test set
correct_words = train_df["Correct_Word"].unique()
test_df["Predicted_Correct_Word"] = test_df["Misspelled_Word"].apply(lambda x: find_most_likely_word(x, correct_words, error_model))

# Calculate accuracy
accuracy = np.mean(test_df["Correct_Word"] == test_df["Predicted_Correct_Word"])
print("Accuracy:", accuracy)

test_df.head()

Accuracy: 0.06313645621181263


Unnamed: 0,Correct_Word,Misspelled_Word,Predicted_Correct_Word
1598,patented,pattented,just
620,contributors,contributers,wont
1266,keratin,ceratin,certainty
649,correspondents,correspondants,correspond
1908,responsibility,responnsibilty,was


In [4]:
class SpellingDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.vocab = set("".join(data["Misspelled_Word"]) + "".join(data["Correct_Word"]))
        self.char2idx = {ch: i+1 for i, ch in enumerate(self.vocab)}  # +1 to reserve 0 for padding
        self.idx2char = {i: ch for ch, i in self.char2idx.items()}
        self.max_len = max(data["Misspelled_Word"].apply(len))  # Max word length
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        misspelled = [self.char2idx[ch] for ch in row["Misspelled_Word"]]
        correct = [self.char2idx[ch] for ch in row["Correct_Word"]]
        
        # Pad sequences
        misspelled += [0] * (self.max_len - len(misspelled))
        correct += [0] * (self.max_len - len(correct))
        
        return torch.tensor(misspelled), torch.tensor(correct)

# Load dataset
train_dataset = SpellingDataset(train_df)
test_dataset = SpellingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [5]:
class Seq2SeqModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32, hidden_dim=64):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)  # +1 for padding
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size + 1)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.encoder(embedded)
        out, _ = self.decoder(embedded, (hidden, torch.zeros_like(hidden)))
        return self.fc(out)

# Initialize model
vocab_size = len(train_dataset.vocab)
model = Seq2SeqModel(vocab_size)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
def collate_fn(batch):
    misspelled, correct = zip(*batch)
    misspelled = pad_sequence(misspelled, batch_first=True, padding_value=0)
    correct = pad_sequence(correct, batch_first=True, padding_value=0)
    return misspelled, correct

batch_size = 32  # Define batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [8]:
for misspelled, correct in train_loader:
    optimizer.zero_grad()
    
    outputs = model(misspelled)  # Forward pass
    
    # Debugging shape mismatch issue
    print(f"Output shape: {outputs.shape}")  # Expected: (batch_size, seq_len, vocab_size)
    print(f"Target shape: {correct.shape}")  # Expected: (batch_size, seq_len)

    batch_size, seq_len, vocab_size = outputs.shape  # Unpack output dimensions

    # Ensure the correct tensor is the same shape as the output tensor
    correct = correct[:, :seq_len]

    # Reshape tensors correctly for loss computation
    loss = criterion(
        outputs.reshape(batch_size * seq_len, vocab_size),  # Flatten output for cross-entropy
        correct.reshape(-1)  # Flatten target labels
    )

    loss.backward()
    optimizer.step()
    
    print(f"Loss: {loss.item()}")  # Monitor loss


Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 4.0567402839660645
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 4.001291275024414
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.950040578842163
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.8977274894714355
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.8472204208374023
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.7829954624176025
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.731424570083618
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.686861753463745
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.6226205825805664
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 3.5372884273529053
Output shape: 

Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 2.022690773010254
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.9857796430587769
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.8689550161361694
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 2.012312889099121
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.9905897378921509
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.7614903450012207
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.86761474609375
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.892130732536316
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.8248835802078247
Output shape: torch.Size([32, 18, 55])
Target shape: torch.Size([32, 18])
Loss: 1.8501707315444946
Output shape: t

In [12]:
model.eval()
with torch.no_grad():
    for misspelled, correct in test_loader:
        outputs = model(misspelled)
        predictions = torch.argmax(outputs, dim=-1)
        print("Misspelled:", "".join([train_dataset.idx2char[idx.item()] for idx in misspelled[0] if idx.item() != 0]))
        print("Predicted:", "".join([train_dataset.idx2char[idx.item()] for idx in predictions[0] if idx.item() != 0]))
        print("Correct:", "".join([train_dataset.idx2char[idx.item()] for idx in correct[0] if idx.item() != 0]))
        break


Misspelled: zF-qdydjsp
Predicted: oreedyddsp
Correct: zF-qydjsp
