# Cook Islands Maori Spelling Checker

Samuel Peter (samuel.peter.25@dartmouth.edu)<br>
Accelerated Computational Linguistics (Spring 2024)

## Step 1: Import the necessary packages

In [1]:
import re
from collections import Counter

## Step 2: Read the corpus .txt file and count its words

In [2]:
def words(text):
    """Return the lowercase word tokens of `text`, in order of appearance.

    A token is a maximal run of regex word characters (`\\w+`); everything
    else (spaces, punctuation) only acts as a separator.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

#Change filename here
# Read the corpus inside a context manager so the file handle is closed as soon
# as the text has been tokenised, instead of lingering until garbage collection.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

## Step 3: Sample code from https://norvig.com/spell-correct.html

In [3]:
def P(word, N=None):
    """Probability of `word`: its count in WORDS divided by the corpus size.

    Parameters:
        word - the word to look up in WORDS.
        N    - total number of tokens to divide by. When omitted it is computed
               from the current WORDS on every call, so the result stays
               correct after WORDS is rebound to a different corpus (a default
               of `sum(WORDS.values())` would be evaluated only once, when the
               function is defined).

    Returns 0.0 when the total is zero (empty corpus) instead of raising
    ZeroDivisionError.
    """
    if N is None:
        N = sum(WORDS.values())
    if not N:
        return 0.0
    return WORDS[word] / N

## Function modified to return the top suggestion and list of possible suggestions
def correction(word):
    """Return `(suggestions, best)` for `word`.

    `suggestions` is whatever `candidates(word)` produced and `best` is the
    member of it with the highest probability according to `P`.
    """
    suggestions = candidates(word)
    return suggestions, max(suggestions, key=P)

def candidates(word):
    """Generate possible spelling corrections for `word`.

    Tries progressively looser matches and returns the first non-empty result:
    the word itself if known, then known words one edit away, then known words
    two edits away. If nothing is known, falls back to the one-element list
    `[word]`.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    # Only reached when nothing closer matched, so the expensive two-edit
    # expansion is not paid for words that are already (nearly) correct.
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return the set of entries in `words` that occur in the WORDS dictionary."""
    recognised = set()
    for entry in words:
        if entry in WORDS:
            recognised.add(entry)
    return recognised

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return the set of all strings one edit away from `word`.

    An edit is a single deletion, a transposition of two adjacent characters,
    a replacement of one character, or an insertion of one character.

    Parameters:
        word    - the string to edit.
        letters - the characters used for replacements and insertions. It
                  defaults to lowercase a-z, which is the original behaviour;
                  pass a longer alphabet (for example one that adds macron
                  vowels) so candidates containing those characters can be
                  generated too.
    """
    # Every way of cutting the word in two, including the empty head and tail.
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    # A set removes the duplicates that different edits produce.
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Lazily yield every string two edits away from `word`.

    The result is a generator and may contain duplicates; each element is
    produced by applying `edits1` to one of the one-edit variants of `word`.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

## Step 4: Test the sample code

In [4]:
# Sanity check on the English corpus: 'speling' is a single insertion away
# from 'spelling'. Only the top suggestion is printed.
candidates_list, max_candidate = correction('speling')
print(max_candidate)

spelling


In [5]:
# Second sanity check: 'korrectud' needs two replacements (k->c, u->e) to
# become 'corrected', so this exercises the two-edit fallback.
candidates_list, max_candidate = correction('korrectud')
print(max_candidate)

corrected


## Step 5: Switch to the Cook Islands Maori text file for training the model

In [6]:
# Rebuild the word counts from the Cook Islands Maori corpus. The context
# manager closes the file handle as soon as the text has been read.
with open('cim-sentences.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

## Step 6: Test the model with Maori words

In [7]:
# 'kotoo' is one replacement away from 'kotou'; print the top suggestion.
candidates_list, max_candidate = correction('kotoo')
print(max_candidate)

kotou


In [8]:
# Print the top suggestion and the full candidate set, to see which
# alternatives the model considered for 'aere'.
candidates_list, max_candidate = correction('aere')
print(max_candidate)
print(candidates_list)

qaere
{'qaere', 'rere', 'tere', 'mere'}


## Step 7: Method to go through each token in the sentence and correct any misspellings while accounting for punctuation

In [9]:
# Opening quotes/brackets that may be glued to the front of a word. Apostrophe-like
# marks are deliberately NOT listed: they stay attached to the word and are passed
# to the spelling lookup.
# NOTE(review): presumably such marks can stand for a glottal stop in user input -
# confirm against the corpus encoding.
_OPENING_PUNCTUATION = '"([{\u201c\u00ab'


def _split_token(token):
    "Split `token` into (opening punctuation, core word, trailing non-letters)."
    start = 0
    while start < len(token) and token[start] in _OPENING_PUNCTUATION:
        start += 1
    end = len(token)
    while end > start and not token[end - 1].isalpha():
        end -= 1
    return token[:start], token[start:end], token[end:]


def _match_case(original, replacement):
    "Give `replacement` the capitalisation (all caps or initial capital) of `original`."
    if len(original) > 1 and original.isupper():
        return replacement.upper()
    if original[0].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def autocorrect(line, raw_tokens):
    """Check the spelling of Maori words from user input and print the result.

    Parameters:
        line       - the user's input, echoed back unchanged before processing.
        raw_tokens - list of whitespace-separated tokens from `line`, with any
                     punctuation still attached.

    Each token is split into opening punctuation, a core word and a trailing
    run of non-letters. The lowercased core is passed to `correction`; when the
    best candidate differs from it, the token is reported together with the
    candidate set and replaced, keeping the surrounding punctuation and the
    original capitalisation. Tokens without a core (empty, or only punctuation
    or digits) and tokens whose core is already the best candidate are copied
    through untouched.

    Returns None; all results are printed.
    """
    print("Sentence before processing the tokens: ")
    print(line)
    print("")

    corrected_tokens = []  #tokens of the corrected sentence, in order

    #Loop for each token
    for token in raw_tokens:
        prefix, core, suffix = _split_token(token)

        #Nothing to spell-check: keep the token exactly as typed
        if not core:
            corrected_tokens.append(token)
            continue

        #Look the word up in lower case, as the corpus was lower-cased too
        lowered = core.lower()
        candidates_list, max_candidate = correction(lowered)

        #Compare in lower case so capitalisation alone is never reported as an error
        if max_candidate == lowered:
            corrected_tokens.append(token)
            continue

        print("=== Possible misspelling ===")
        print(token, ": ", candidates_list)
        corrected_tokens.append(prefix + _match_case(core, max_candidate) + suffix)

    print()
    print("Autocorrected sentence after processing the tokens: ")
    #join() avoids the stray leading space that repeated concatenation produced
    print(" ".join(corrected_tokens))


## Step 8: Get user input in Cook Islands Maori to check for spelling

In [10]:
# Prompt for one sentence, split it on whitespace (punctuation stays attached
# to the tokens) and run the spelling checker on it.
print("Please write a sentence in Cook Islands Maori and press ENTER to check the spelling: ")
line = input()  # example input: Kia orana kotoo mai i Rarotoga!
print()
raw_tokens = line.split()
autocorrect(line, raw_tokens)

Please write a sentence in Cook Islands Maori and press ENTER to check the spelling: 
Kia orana kotoo mai i Rarotoga!

Sentence before processing the tokens: 
Kia orana kotoo mai i Rarotoga!

=== Possible misspelling ===
kotoo :  {'kotou'}
=== Possible misspelling ===
Rarotoga! :  {'rarotonga'}

Autocorrected sentence after processing the tokens: 
 Kia orana kotou mai i Rarotonga!


In [11]:
# Same interactive check as the previous cell, run on a second sentence.
print("Please write a sentence in Cook Islands Maori and press ENTER to check the spelling: ")
line = input()  # example input: Kua aere au ki Mauke.
print()
raw_tokens = line.split()
autocorrect(line, raw_tokens)

Please write a sentence in Cook Islands Maori and press ENTER to check the spelling: 
Kua aere au ki Mauke.

Sentence before processing the tokens: 
Kua aere au ki Mauke.

=== Possible misspelling ===
aere :  {'qaere', 'rere', 'tere', 'mere'}
=== Possible misspelling ===
Mauke. :  {'maquke'}

Autocorrected sentence after processing the tokens: 
 Kua qaere au ki Maquke.
