<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/spellingCorrectionBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [9]:
# Select the compute device: use the GPU when CUDA is available and fall back
# to the CPU otherwise, so later `.to(device)` calls do not fail on a runtime
# without an accelerator.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Shell escape: print the NVIDIA driver/GPU status table for this runtime.
# The output is only displayed below the cell; nothing is captured in Python.
!nvidia-smi


Tue Apr  4 11:11:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    27W /  70W |    601MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
# Mount Google Drive inside the Colab runtime so files stored there can be
# read through the local path /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Load dataset
# The training CSV is read from the mounted Google Drive into a DataFrame.
# NOTE(review): judging by the file name this is a Luganda spelling-error
# training set; the column layout is not visible here -- confirm.
train_csv_path = "/content/drive/MyDrive/research/spelling-correction/data/1000random-3error-train-set-luganda.csv"
df = pd.read_csv(train_csv_path)

In [12]:
# Split dataset into train and validation sets
# 20% of the rows are held out for validation; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


In [13]:
# Tokenize train and validation sets
# Both splits are encoded with the same settings (pad, truncate, return
# PyTorch tensors) and the resulting batches are moved to `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): this column name looks like a data value rather than a header;
# presumably the CSV has no header row and its first record was consumed as
# the column name -- confirm against the file.
word_column = "n'ebimgli"
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt')

train_tokens, val_tokens = (
    tokenizer(list(frame[word_column]), **encode_kwargs).to(device)
    for frame in (train_df, val_df)
)


In [None]:
# Train model
# Fine-tunes bert-base-uncased (masked-LM head) on the tokenized training
# words. Every iteration of the loop is one full-batch gradient step over the
# entire training set, so "epoch" and "optimizer step" coincide here.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
model.train()

# The batch is identical on every step, so inputs and labels are built once.
input_ids = train_tokens['input_ids']
# The batch was padded to a common length; the attention mask keeps the model
# from attending to [PAD] positions as if they were real tokens.
attention_mask = train_tokens['attention_mask']
labels = input_ids.clone()
# Label -100 is ignored by the loss, so padding does not contribute to it.
labels[labels == tokenizer.pad_token_id] = -100
# NOTE(review): no input token is replaced by [MASK]; the labels are the
# unmodified inputs, so the objective is to reproduce the input. Confirm this
# is the intended training signal for spelling correction.

for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print("Epoch:", epoch, "Loss:", loss.item())

In [None]:
# Evaluate model on test set
# Each misspelled word is tokenized on its own and fed through the model. A
# row counts as correct when the highest-scoring token at every word-piece
# position equals the tokenization of the expected correct word.
# NOTE(review): predictions are made position by position, so a correction
# whose tokenization has a different length than the misspelling can never be
# counted as correct -- confirm this metric is acceptable.
test_df = pd.read_csv("test_word_pairs.csv")
model.eval()
# Inputs must live on the same device as the model weights.
model_device = next(model.parameters()).device
correct_count = 0
total_count = len(test_df)
# astype(str) guards against non-string cells (e.g. missing values) reaching
# the tokenizer.
word_pairs = zip(test_df['incorrect'].astype(str), test_df['correct'].astype(str))
with torch.no_grad():  # inference only: no gradients needed
    for incorrect, correct in word_pairs:
        encoded = tokenizer(incorrect, truncation=True, return_tensors='pt').to(model_device)
        logits = model(**encoded).logits[0]
        # Drop the first and last positions, which hold [CLS] and [SEP].
        predicted_ids = logits.argmax(dim=-1)[1:-1].tolist()
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_ids)
        # Tokenizing the target applies the same normalization (lower-casing,
        # word-piece splitting) as was applied to the model input.
        if predicted_tokens == tokenizer.tokenize(correct):
            correct_count += 1
if total_count:
    print("Test set accuracy:", correct_count / total_count)
else:
    # Avoid ZeroDivisionError when the test file has no rows.
    print("Test set accuracy: undefined (test set is empty)")
