<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/neural_network_from_torch_spelling_correction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [7]:
# Colab-only step: mount Google Drive at /content/drive so that the CSV paths
# used further down (under /content/drive/MyDrive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Word-level correction network: an embedding lookup followed by a linear layer
class SpellingCorrectionModel(nn.Module):
    """Map vocabulary indices to one score per vocabulary entry.

    Args:
        num_embeddings: Number of rows in the embedding table; also the width
            of the output layer.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_embeddings)

    def forward(self, x):
        """Embed the indices in ``x`` and project them with ``fc``.

        The last dimension of the result has size ``num_embeddings``.
        """
        return self.fc(self.embedding(x))

In [17]:
# Dataset of raw (misspelled, correct) word pairs read from a CSV file
class SpellingCorrectionDataset(Dataset):
    """Expose the rows of a spelling-correction CSV as word pairs.

    The CSV is loaded once with pandas and must provide the columns
    ``incorrect_word`` and ``correct_word``.
    """

    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Number of rows in the loaded frame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(incorrect_word, correct_word)`` for the row labelled ``idx``."""
        frame = self.data
        return frame.loc[idx, 'incorrect_word'], frame.loc[idx, 'correct_word']

In [18]:
# Training parameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001
embedding_dim = 100  # Adjust based on your embedding size

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable after "Restart & Run All"; without it the reported accuracy
# changes from run to run.
seed = 42
torch.manual_seed(seed)

In [19]:
# Paths to CSV files. Both files live in the same Google Drive folder, so the
# folder is named once; change data_dir to point the notebook at another copy
# of the data.
data_dir = '/content/drive/MyDrive/research/spelling-correction/data'
train_csv_file = f'{data_dir}/6000random-3error-types-lugandaApost.csv'
test_csv_file = f'{data_dir}/600-3error-test-set-lugandaApost.csv'



In [20]:
# Load both splits as raw word pairs and chain them end to end, so the
# vocabulary can be gathered from train and test in a single pass.
train_dataset = SpellingCorrectionDataset(train_csv_file)
test_dataset = SpellingCorrectionDataset(test_csv_file)
combined_dataset = torch.utils.data.ConcatDataset([train_dataset, test_dataset])

In [21]:
# Collect every distinct word, misspelled or correct, that the combined
# dataset yields.
vocabulary = set()
for incorrect_word, correct_word in combined_dataset:
    vocabulary.update((incorrect_word, correct_word))

In [22]:
# Create vocabulary mapping
# Iteration order of a set of strings can differ between interpreter sessions
# (string hash randomisation), so sort before numbering to give every word the
# same index after a kernel restart. key=str keeps the sort from raising if a
# non-string entry (e.g. NaN from an empty CSV cell) ended up in the set.
word_to_index = {
    word: index
    for index, word in enumerate(sorted(vocabulary, key=str))
}
num_embeddings = len(word_to_index)

In [23]:
# Update SpellingCorrectionDataset to return indices instead of words
class SpellingCorrectionDataset(Dataset):
    """Spelling-correction pairs encoded as vocabulary indices.

    Args:
        csv_file: CSV providing ``incorrect_word`` and ``correct_word`` columns.
        vocab: Optional word -> index mapping. When omitted, the notebook-level
            ``word_to_index`` is looked up each time an item is fetched, which
            is the original behaviour.
    """

    def __init__(self, csv_file, vocab=None):
        self.data = pd.read_csv(csv_file)
        self.vocab = vocab

    def __len__(self):
        """Number of rows in the loaded frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(incorrect_index, correct_index)`` for the row labelled ``idx``."""
        # Resolve the mapping lazily so the global fallback still picks up
        # whatever ``word_to_index`` is bound to at access time.
        mapping = word_to_index if self.vocab is None else self.vocab

        incorrect_word = self.data.loc[idx, 'incorrect_word']
        correct_word = self.data.loc[idx, 'correct_word']

        # NOTE: a word missing from the mapping falls back to index 0. Unless
        # the mapping reserves index 0 for unknown words, that is also the
        # index of a real vocabulary entry.
        incorrect_index = mapping.get(incorrect_word, 0)
        correct_index = mapping.get(correct_word, 0)

        return incorrect_index, correct_index

In [24]:
# Rebuild both splits with the index-returning dataset and wrap each in a
# DataLoader; only the training loader shuffles.
train_dataset = SpellingCorrectionDataset(train_csv_file)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = SpellingCorrectionDataset(test_csv_file)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [25]:
# Loss function, model and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
model = SpellingCorrectionModel(num_embeddings, embedding_dim)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [26]:
# Training loop
total_steps = len(train_dataloader)
# Report every `log_every` steps and also on the last step of each epoch.
# With fewer than `log_every` batches per epoch the fixed interval alone would
# never fire and the loop would run silently.
log_every = 100

model.train()  # make sure the model is in training mode before optimising
for epoch in range(num_epochs):
    for i, (incorrect_words, correct_words) in enumerate(train_dataloader):
        step = i + 1

        # Forward pass: scores over the vocabulary for each input index
        outputs = model(incorrect_words)
        loss = criterion(outputs, correct_words)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % log_every == 0 or step == total_steps:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {loss.item():.4f}')

In [27]:
# Test the model
model.eval()  # switch to inference mode for any mode-dependent layers
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for incorrect_words, correct_words in test_dataloader:
        outputs = model(incorrect_words)
        # Index of the highest score per row is the predicted vocabulary entry.
        # argmax on the tensor itself replaces torch.max(outputs.data, 1); the
        # .data detour is unnecessary inside no_grad.
        predicted = outputs.argmax(dim=1)
        total += correct_words.size(0)
        correct += (predicted == correct_words).sum().item()

print(f'Accuracy of the model on the test dataset: {(correct / total) * 100:.2f}%')
    

Accuracy of the model on the test dataset: 98.33%
