## Student details
Student name: **Siddharth Prince**  
Student id: **23052058**

# Task 1

## A. Downloading Peter Norvig's unigram corpus

In [1]:
# Housekeeping - imports and utility functions (taken from my previous code for Etivity 3)
import pandas as pd
import jellyfish

pd.set_option('display.max_rows', None)

def processNGramData(ngramContent):
    """Parse tab-separated n-gram lines into a DataFrame.

    Parameters
    ----------
    ngramContent : iterable of str
        Lines of the form ``<ngram><TAB><count>``. Only the first two
        tab-separated fields of each line are used. Blank or
        whitespace-only lines (such as a trailing empty line in the file)
        are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Word'`` and ``'Count'``, one row per non-blank input
        line, in input order. ``'Count'`` holds the parsed integers.

    Raises
    ------
    IndexError
        If a non-blank line contains no tab separator.
    ValueError
        If the count field cannot be parsed as an integer.
    """
    data = {'Word': [], 'Count': []}
    for line in ngramContent:
        # An empty line has no count field; skip it rather than crash on fields[1].
        if not line.strip():
            continue
        fields = line.split('\t')
        data['Word'].append(fields[0])
        data['Count'].append(int(fields[1]))
    return pd.DataFrame(data)

# Method to search a data frame column for a term (exact word or regex) and return all matching rows.
def searchDataFrame(df, searchTerm: str, columnToSearch: str, matchWord: bool=True):
    """Return the rows of ``df`` whose ``columnToSearch`` value matches ``searchTerm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    searchTerm : str
        With ``matchWord=True`` this is a literal word that must equal the
        whole cell value. With ``matchWord=False`` it is used as a regular
        expression that may match anywhere in the cell value.
    columnToSearch : str
        Name of the string column to search.
    matchWord : bool, default True
        Whether to require a whole-value literal match.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``. Rows whose cell value is missing never match.
    """
    import re  # local import so this helper does not depend on another cell's imports

    if matchWord:
        # Escape the word so metacharacters such as '.' or '+' are matched literally.
        searchRegex = f"^{re.escape(searchTerm)}$"
    else:
        searchRegex = searchTerm
    # na=False: missing values count as "no match" instead of yielding a NaN
    # entry in the mask, which .loc refuses to index with.
    matches = df[columnToSearch].str.contains(searchRegex, regex=True, na=False)
    return df.loc[matches]

In [2]:
# Downloading unigram data
!wget https://norvig.com/ngrams/count_1w.txt
with open("./count_1w.txt", "r") as unigramFile:
    unigramContent = unigramFile.read().splitlines()
unigrams_df = processNGramData(unigramContent)
totalUnigrams = len(unigrams_df)
display(unigrams_df.head(100))

--2023-11-05 14:59:00--  https://norvig.com/ngrams/count_1w.txt
Resolving norvig.com (norvig.com)... 158.106.138.13
Connecting to norvig.com (norvig.com)|158.106.138.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4956241 (4.7M) [text/plain]
Saving to: ‘count_1w.txt.2’


2023-11-05 14:59:02 (3.75 MB/s) - ‘count_1w.txt.2’ saved [4956241/4956241]



Unnamed: 0,Word,Count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698
5,in,8469404971
6,for,5933321709
7,is,4705743816
8,on,3750423199
9,that,3400031103


## B. Spelling correction function that prints out possible candidates to replace the misspelt word

In [3]:
# Add a P(Word) column: each word's count as a fraction of the summed counts of all unigrams.
totalUnigramCount = unigrams_df['Count'].sum()
print(f'Total unigram count: {totalUnigramCount}')
unigrams_df['P(Word)'] = unigrams_df['Count'].div(totalUnigramCount)
display(unigrams_df.head(100))

Total unigram count: 588124220187


Unnamed: 0,Word,Count,P(Word)
0,the,23135851162,0.039338
1,of,13151942776,0.022363
2,and,12997637966,0.0221
3,to,12136980858,0.020637
4,a,9081174698,0.015441
5,in,8469404971,0.014401
6,for,5933321709,0.010089
7,is,4705743816,0.008001
8,on,3750423199,0.006377
9,that,3400031103,0.005781


In [56]:
def nonWordSpellingCorrection(nonWord, maxDistance=1):
    """Return vocabulary words close to ``nonWord``, most probable first.

    Every word in the module-level ``unigrams_df`` is compared with
    ``nonWord`` using ``jellyfish.damerau_levenshtein_distance``; words whose
    distance is at most ``maxDistance`` are kept.

    Parameters
    ----------
    nonWord : str
        The misspelt string to find replacements for.
    maxDistance : int, default 1
        Largest edit distance a candidate may have. The default keeps the
        original behaviour (distance strictly below 2).

    Returns
    -------
    pandas.DataFrame
        The ``'Word'``, ``'Count'`` and ``'P(Word)'`` columns of the matching
        rows, sorted by ``'P(Word)'`` in descending order. The original row
        index of ``unigrams_df`` is preserved.

    Raises
    ------
    KeyError
        If ``unigrams_df`` does not yet have a ``'P(Word)'`` column.
    """
    # Build the mask straight from the Word column instead of copying the
    # whole frame just to attach a temporary helper column.
    isCandidate = unigrams_df['Word'].apply(
        lambda word: jellyfish.damerau_levenshtein_distance(nonWord, str(word)) <= maxDistance
    ).astype(bool)
    # Select the output columns by name so the result does not depend on column order.
    candidates = unigrams_df.loc[isCandidate, ['Word', 'Count', 'P(Word)']]
    return candidates.sort_values(by='P(Word)', ascending=False)

In [57]:
# Demo: candidates for the misspelling 'acress'. Shows up to the first 100 rows
# returned by nonWordSpellingCorrection, which sorts by descending P(Word).
nonWord = 'acress'
candidates_df = nonWordSpellingCorrection(nonWord)
print(f'nonWord = {nonWord}')
display(candidates_df.head(100))

nonWord = acress


Unnamed: 0,Word,Count,P(Word)
323,access,217986984,0.0003706479
1056,across,76597151,0.0001302397
4753,acres,14208905,2.41597e-05
8007,actress,7010056,1.191935e-05
26545,adress,984657,1.674233e-06
35864,caress,590047,1.003269e-06
55409,cress,279364,4.750085e-07
66694,apress,202431,3.441977e-07
73185,acess,171785,2.920897e-07


In [54]:
# Demo: candidates for the misspelling 'teh'. Shows up to the first 100 rows
# returned by nonWordSpellingCorrection, which sorts by descending P(Word).
nonWord = 'teh'
candidates_df = nonWordSpellingCorrection(nonWord)
print(f'nonWord = {nonWord}')
display(candidates_df.head(100))

nonWord = teh


Unnamed: 0,Word,Count,P(Word)
0,the,23135851162,0.03933838
857,tech,93401669,0.0001588128
1340,tel,60827708,0.0001034266
1754,ten,46907473,7.975776e-05
1758,th,46857152,7.96722e-05
2861,tea,27406794,4.660035e-05
4544,te,15057406,2.560242e-05
5549,tee,11539905,1.962154e-05
6214,ted,9926083,1.687753e-05
7750,tex,7342192,1.248408e-05


In [53]:
# Demo: candidates for the misspelling 'htis'. Shows up to the first 100 rows
# returned by nonWordSpellingCorrection, which sorts by descending P(Word).
nonWord = 'htis'
candidates_df = nonWordSpellingCorrection(nonWord)
print(f'nonWord = {nonWord}')
display(candidates_df.head(100))

nonWord = htis


Unnamed: 0,Word,Count,P(Word)
11,this,3228469771,0.005489435
66,his,660177731,0.001122514
1755,hits,46901429,7.974749e-05
13293,tis,3210106,5.458211e-06
18636,otis,1809809,3.077256e-06
24156,hts,1157369,1.967899e-06
27118,stis,946232,1.608898e-06
43878,itis,415910,7.071805e-07
52798,huis,303310,5.157244e-07
54660,ntis,286327,4.868478e-07


# Task 2

In [6]:
from textblob import Word

def getSpellingSuggestions(word):
    """Return TextBlob's spelling suggestions for a single word.

    Wraps ``word`` in a ``textblob.Word`` and returns the result of its
    ``spellcheck()`` method unchanged.
    """
    blobWord = Word(word)
    suggestions = blobWord.spellcheck()
    return suggestions

# Demo: print the input string, then the suggestions returned for "acress".
print(f'String={"acress"}')
print(f'candidates = {getSpellingSuggestions("acress")}')

String=acress
candidates = [('across', 0.6851851851851852), ('access', 0.1728395061728395), ('acres', 0.1111111111111111), ('actress', 0.021604938271604937), ('caress', 0.009259259259259259)]


# Task 3

In [7]:
from textblob import TextBlob

def correctSentence(sentence):
    """Return the spelling-corrected form of ``sentence`` using TextBlob.

    Wraps ``sentence`` in a ``TextBlob`` and returns the result of its
    ``correct()`` method unchanged.
    """
    blob = TextBlob(sentence)
    corrected = blob.correct()
    return corrected

# Demo: print the original sentence and, on the next line, its corrected form.
print(f'sentence={"I have good speling"}\nCorrected sentence= {correctSentence("I have good speling")}')

sentence=I have good speling
Corrected sentence= I have good spelling
