In [2]:
from collections import defaultdict

In [3]:
occur_thres=20
ambiguity_thres=0.97
bos=['<bos-1>','<bos-2>']
eos=['<eos-1>','<eos-2>']
bop=['<bop-1>','<bop-2>']
eop=['<eop-1>','<eop-2>']

In [4]:
def get_vocab(train_data):
    word_pos_counter={}
    pos_vocab=set()
    with open(train_data) as fr:
        for line in fr:
            if not line.strip():
                continue
            word,pos_tag=line.strip().split('\t')
            if word not in word_pos_counter:
                word_pos_counter[word]={}
            if pos_tag not in word_pos_counter[word]:
                word_pos_counter[word][pos_tag]=0
            word_pos_counter[word][pos_tag]+=1
            
            if pos_tag not in pos_vocab:
                pos_vocab.add(pos_tag)
    
    single_pos_dict={}
    for word,tag_freqs in word_pos_counter.items():
        tag,mode=max(tag_freqs.items(),key=lambda item:item[1])
        n=sum(tag_freqs.values())
        if n>=occur_thres and (float(mode)/n) >= ambiguity_thres:
            single_pos_dict[word] = tag
            
    return single_pos_dict,pos_vocab

In [5]:
single_pos_dict,pos_vocab=get_vocab('./train.pos')

In [6]:
def read_data(path):
    sentence=[]
    pos_tag=[]
    line_sent=[]
    line_pos=[]
    for sent in open(path):
        if not sent.strip():
            sentence.append(line_sent)
            pos_tag.append(line_pos)
            line_sent=[]
            line_pos=[]
            continue
        
        w,p=sent.strip().split('\t')
        line_sent.append(w)
        line_pos.append(p)
        
    return sentence,pos_tag

In [7]:
train_sent,train_pos=read_data('./train.pos')
print('train size:',len(train_sent))
dev_sent,dev_pos=read_data('./dev.pos')
print('dev size:',len(dev_sent))
test_sent,test_pos=read_data('./test.pos')
print('test size:',len(test_sent))

train size: 39830
dev size: 1700
test size: 2416


In [8]:
def get_features(i, word, context, prev, prev2):
    '''Map tokens into a feature representation, implemented as a
    {hashable: float} dict. If the features change, a new model must be
    trained.
    '''
    def add(name, *args):
        features[' '.join((name,) + tuple(args))] += 1

    i += len(bos)
    features = defaultdict(int)
    # It's useful to have a constant feature, which acts sort of like a prior
    add('bias')
    add('i suffix', word[-3:])
    add('i pref1', word[0])
    add('i-1 tag', prev)
    add('i-2 tag', prev2)
    add('i tag+i-2 tag', prev, prev2)
    add('i word', context[i])
    add('i-1 tag+i word', prev, context[i])
    add('i-1 word', context[i-1])
    add('i-1 suffix', context[i-1][-3:])
    add('i-2 word', context[i-2])
    add('i+1 word', context[i+1])
    add('i+1 suffix', context[i+1][-3:])
    add('i+2 word', context[i+2])
    return features

In [9]:
def normalize(word):
    '''Normalization used in pre-processing.
    - All words are lower cased
    - Digits in the range 1800-2100 are represented as !YEAR;
    - Other digits are represented as !DIGITS
    :rtype: str
    '''
    if '-' in word and word[0] != '-':
        return '!HYPHEN'
    elif word.isdigit() and len(word) == 4:
        return '!YEAR'
    elif word[0].isdigit():
        return '!DIGITS'
    else:
        return word.lower()

In [10]:
def process_data(sent_data,pos_data):
    words=[]
    feats=[]
    trgs=[]
    assert len(sent_data)==len(pos_data)
    for sent_seq,pos_seq in zip(sent_data,pos_data):
        sent_seq=bos+[normalize(word) for word in sent_seq]+eos
        pos_seq=bop+pos_seq+eop
        for idx,(word,pos)in enumerate(zip(sent_seq[2:-2],pos_seq[2:-2])):
            words.append(word)
            trgs.append(pos)
            feats.append(get_features(idx,word,sent_seq,pos_seq[idx-1],pos_seq[idx-2]))
    return words,feats,trgs

In [11]:
train_words,train_feats,train_trgs=process_data(train_sent,train_pos)
dev_words,dev_feats,dev_trgs=process_data(dev_sent,dev_pos)
test_words,test_feats,test_trgs=process_data(test_sent,test_pos)

In [11]:
from perceptron import AveragedPerceptron
import copy
model=AveragedPerceptron()
model.classes=pos_vocab

#for epoch in range(1,21):
for epoch in range(1,2):
    num_update=0
    train_cor=0
    for idx,(feats,trg)in enumerate(zip(train_feats,train_trgs)):
        guess=model.predict(feats)
        if guess != trg:
            num_update+=1
            model.update(trg,guess,feats)
        else:
            train_cor+=1
    #model.average_weights()
    #model.save('./normal-{:03d}.pkl'.format(epoch))
    
    dev_cor=0
    for idx,(feats,trg)in enumerate(zip(dev_feats,dev_trgs)):
        guess=model.predict(feats)
        if guess == trg:
            dev_cor+=1
    print("Epoch {} | num updates {} ({:.4f}%) | train cor {:.4f}% |dev cor {:.4f}%".format(
        epoch,num_update,num_update/len(train_feats) *100,train_cor/len(train_feats) *100,dev_cor/len(dev_feats) *100))
model.average_weights() 
  

Epoch 1 | num updates 79682 (8.3875%) | train cor 91.6125% |dev cor 93.4566%


In [25]:
model.save('model/plus-weighted.pkl')

TypeError: write() argument must be str, not bytes

In [20]:
model.load('./model/normal-weighted.pkl')
test_cor=0
for idx,(feats,trg)in enumerate(zip(test_feats,test_trgs)):
    guess=model.predict(feats)
    if guess == trg:
        test_cor+=1
print('Normal Test ACC {}%'.format(test_cor/len(test_feat)))

model.load('./plus-weighted.pkl')
test_cor=0
for idx,(word,feats,trg)in enumerate(zip(test_words,test_feats,test_trgs)):
    if word in single_pos_dict:
        guess=single_pos_dict[word]
    else:
        guess=model.predict(feats)
    if guess == trg:
        test_cor+=1
print('Plus Test ACC {}%'.format(test_cor/len(test_feat)))


TypeError: a bytes-like object is required, not 'str'

In [2]:
import os

''

In [7]:
class A(object):
    def __init__(self,name):
        self.name=name
    def print_(self):
        print(os.path.dirname(__file__))


In [8]:
a=A("A")

In [10]:
a.print_()

NameError: name '__file__' is not defined