In [1]:
# 提前工作：检查并修改当前的工作路径
import os
os.chdir('/Users/xujie/Documents/GitHub/Spell_errors_text_programming/data')
os.getcwd()

'/Users/xujie/Desktop/Python/nlp/data/spell_errors'

In [2]:
# 提取本地文件并生成单词词库
vocab = [line.rstrip().lower() for line in open('spell_errors_vocab.txt')]
print(len(vocab))

48227


In [3]:
# 生成拼写错误单词的所有候选集合
def generate_candidates(word, ex=False):
    """
    word: 用户输入的错误单词
    return: 返回所有与word编辑距离为1的候选单词
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    
    # 1、insert
    inserts = [L+c+R for L,R in splits for c in letters]
    # 2、delete
    deletes = [L+R[1:] for L,R in splits if R]
    # 3、replace
    replaces = [L+c+R[1:] for L,R in splits for c in letters if R]
    
    candidates = set(inserts+deletes+replaces)
    
    # 当控制参数ex为True时，生成与输入编辑距离为2的所有候选单词
    if ex:
        candidates = set([w2 for w1 in candidates for w2 in generate_candidates(w1)])
        
    return [word for word in candidates if word in vocab]

# print函数输出结果，以‘apple’为例
print(generate_candidates('apple'))

['ample', 'apple', 'apply', 'apples']


In [4]:
# 用户单词拼写错误的条件概率统计
channel_prob = {}

for line in open('spell_errors_probs.txt'):
    items = line.split(':')
    correct = items[0].strip()
    mistakes = [mis.strip() for mis in items[1].split(',')]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1.0/len(mistakes)
        
# print channel_prob，以‘apple’为例
print(channel_prob['apple'])

{'alipple': 0.047619047619047616, 'apoll': 0.047619047619047616, 'alpper': 0.047619047619047616, 'appy': 0.047619047619047616, 'alpple': 0.047619047619047616, 'ait': 0.047619047619047616, 'appel': 0.047619047619047616, 'appre': 0.047619047619047616, 'abuol': 0.047619047619047616, 'apelle': 0.047619047619047616, 'appple': 0.047619047619047616, 'alploo': 0.047619047619047616, 'alppe': 0.047619047619047616, 'apl': 0.047619047619047616, 'apll': 0.047619047619047616, 'apply': 0.047619047619047616, 'alppel': 0.047619047619047616, 'aplep': 0.047619047619047616, 'apoler': 0.047619047619047616, 'appe': 0.047619047619047616, 'aple': 0.047619047619047616}


In [5]:
# 导入nltk库提取路透社语料库数据
from nltk.corpus import reuters
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

In [6]:
# 训练2-Gram语言模型
import re

punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
term_count, bigram_count = {}, {}

for line in corpus:
    line = [re.sub(r'[{}]+'.format(punctuation), '', word) for word in line]
    for i in range(len(line)):
        term = line[i].lower()
        bigram = ''.join(line[i:i+2]).lower() if i < len(line)-1 else ''
        
        term_count[term] = term_count[term]+1 if term in term_count else 1
        if bigram:
            bigram_count[bigram] = bigram_count[bigram]+1 if bigram in bigram_count else 1

In [46]:
# 对与训练文本进行拼写纠错
import numpy as np

V = len(term_count)
file = open('spell_errors_test_data.txt', 'r')

for line in file:
    items = line.split('\t')
    line = items[2].strip().split()
    for idx in range(len(line)):
        word = re.sub(r'[{}]+'.format(punctuation), '', line[idx].lower())
        if word and word not in vocab:
            
            # 排除当word为纯数字的情况
            match = re.match(r'[0-9]+', word)
            if match: continue
            
            # 排除当word为复数形式（s, es结尾）和表示从属关系（‘s, s'结尾）的情况
            if len(word) >= 1 and word[-1] == 's' and word[:-1] in vocab: continue
            if len(word) >= 2 and word[-2:] in ['\'s', 's\'', 'es'] and word[:-2] in vocab: continue
            
            # 通过generate_candidates()函数生成与word编辑距离为1的候选集
            candidates = generate_candidates(word)
            if len(candidates) < 1: continue
            
            probs = []
            for candi in candidates:
                # 贝叶斯：P(c|w) = P(w|c)P(c) = P(w|c)P(c|pre)P(suc|c)
                prob = 0
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)
                
                pre = re.sub(r'[{}]+'.format(punctuation), '', line[idx-1].lower()) if idx > 0 else ''
                suc = re.sub(r'[{}]+'.format(punctuation), '', line[idx-1].lower()) if idx < len(line)-1 else ''
                
                # 注意计算时的add_one smooth平滑化处理
                if pre and pre+candi in bigram_count and pre in term_count:
                    prob += np.log((bigram_count[pre+candi]+1.0)/(term_count[pre]+V))
                else:
                    prob += np.log(1.0/V)
                
                if suc and candi+suc in bigram_count and candi in term_count:
                    prob += np.log((bigram_count[candi+suc]+1.0)/(term_count[candi]+V))
                else:
                    prob += np.log(1.0/V)
                
                probs.append(prob)
            
            max_idx = probs.index(max(probs))
            # 输出单词拼写纠错的改正结果
            print('{} ---> {}'.format(word, candidates[max_idx]))

protectionst ---> protectionist
retaiation ---> retaliation
tases ---> oases
busines ---> business
seriousnyss ---> seriousness
aganst ---> against
sewll ---> sell
importsi ---> imports
sheem ---> seem
koreva ---> korea
japn ---> jan
semicondctors ---> semiconductors
advantagne ---> advantage
lawrenc ---> lawrence
disadxantage ---> disadvantage
conceern ---> concern
cenntred ---> centred
trad ---> trap
liberala ---> liberals
inoclude ---> include
representetive ---> representative
ootput ---> output
methids ---> methods
producton ---> production
parep ---> pare
endergy ---> energy
japnese ---> japanese
agencay ---> agency
energyr ---> energy
pfovided ---> provided
imhorts ---> imports
incease ---> incense
quartee ---> quartet
expanted ---> expanded
exeport ---> export
clotcing ---> clothing
intehnational ---> international
markats ---> markets
movament ---> movement
affecced ---> affected
evfect ---> effect
beng ---> bend
rubbes ---> rubies
cautiousy ---> cautious
safly ---> sally
phys

pioposal ---> proposal
suorprise ---> surprise
mainetain ---> maintain
earliuer ---> earlier
stockhouders ---> stockholders
csc ---> ccc
nj ---> n
caltel ---> cartel
nj ---> n
nj ---> n
nj ---> n
fla ---> fela
sc ---> sd
fago ---> sago
saple ---> sample
presidenti ---> presidents
plce ---> plc
damge ---> dame
rainwfall ---> rainfall
soybaan ---> soybean
betwen ---> between
yiild ---> yield
belowo ---> below
harveesting ---> harvesting
fisgure ---> figure
hectres ---> hectares
stert ---> stern
comared ---> compared
seeson ---> season
aseas ---> seas
aretas ---> arenas
preparatin ---> preparation
fingure ---> figure
unemploymnt ---> unemployment
figurer ---> figured
imcrease ---> increase
vacamcies ---> vacancies
miyauzawa ---> miyazawa
canadiaen ---> canadian
issied ---> issued
bakem ---> bakes
gonia ---> goria
billiono ---> billions
profyt ---> profet
divdend ---> dividend
divdends ---> dividends
sebject ---> subject
commol ---> common
addad ---> added
quartr ---> quarts
sevred ---> se

earningc ---> earning
fitscal ---> fiscal
harmarx ---> hartmarx
manuacturer ---> manufacturer
meinerot ---> meinert
diveisions ---> diversions
acquisictions ---> acquisitions
atlana ---> atlanta
st ---> sd
washngton ---> washington
agreeent ---> agreement
productioin ---> production
developmenat ---> development
delivry ---> delivre
ornge ---> orange
calif ---> calf
inflatihon ---> inflation
largelyh ---> largely
modaest ---> modest
aggrekgates ---> aggregates
rathetr ---> rather
thedre ---> theare
produhcers ---> producers
futcre ---> future
inflanionary ---> inflationary
sectr ---> sect
returun ---> return
stagflatiion ---> stagflation
resalt ---> resale
disciplinaed ---> disciplined
conducl ---> conduct
spendimg ---> spending
rafther ---> rafter
tarxes ---> taxes
ceratainly ---> certainly
didea ---> idea
expoats ---> exports
trad ---> trade
counrty ---> county
properoties ---> properties
quartr ---> quarts
eneergy ---> energy
limoited ---> limited
partnersahip ---> partnership
total