In [1]:
from preprocessing import *
import numpy as np
from sklearn import metrics

In [2]:
%%bash
# Stage the corpus files under short names in a local `dataset` directory.
# Abort on the first failing command: without this, a failed `cp` in the middle
# is masked whenever the final command succeeds, and the cell reports no error.
set -e

# Common path prefix of the source files and the destination directory.
dpath='NL2SparQL4NLU/dataset/NL2SparQL4NLU'
spath='dataset'
mkdir -p "$spath"

# Utterance text files.
cp "${dpath}.train.utterances.txt" "${spath}/trn.txt"
cp "${dpath}.test.utterances.txt" "${spath}/tst.txt"

# CoNLL files read by the IOB-tag cells.
cp "${dpath}.train.conll.txt" "${spath}/trn.conll"
cp "${dpath}.test.conll.txt" "${spath}/tst.conll"

# Feature CoNLL files read by the POS-tag cells.
cp "${dpath}.train.features.conll.txt" "${spath}/trn.feature.conll"
cp "${dpath}.test.features.conll.txt" "${spath}/tst.feature.conll"

## NLTK Concept Tagging

In [3]:
from nltk.corpus.reader.conll import ConllChunkCorpusReader
import nltk.tag.hmm as hmm

from nltk.tag import NgramTagger 

from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

In [4]:
def get_chunks(corpus_file, fs="\t"):
    """Collect the distinct last-field values of a corpus file, excluding 'O'.

    Args:
        corpus_file: Path passed through to ``read_corpus_conll``.
        fs: Field separator passed through to ``read_corpus_conll`` (tab by default).

    Returns:
        A set holding the last element of every token whose last element
        is not the string ``'O'``.
    """
    # NOTE(review): assumes read_corpus_conll yields sentences, each an iterable of
    # indexable tokens (one entry per column) -- confirm against preprocessing.
    labels = set()
    for sent in read_corpus_conll(corpus_file, fs=fs):
        for token in sent:
            label = token[-1]
            if label != 'O':
                labels.add(label)
    return labels

### IOB-Tags HMM

In [11]:
# Train an HMM tagger on the IOB training file and score it on the test file.
trn = 'dataset/trn.conll'

# Sorted list of the distinct non-'O' labels found in the training file.
concepts = list(get_chunks(trn))
concepts.sort()

trn_data = ConllChunkCorpusReader('dataset/', r'trn.conll', concepts)
tst_data = ConllChunkCorpusReader('dataset/', r'tst.conll', concepts)

# Fit the HMM on the tagged training sentences.
hmm_model = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_model.train(trn_data.tagged_sents())

# Reference labels: gold tag of every test token with its first two characters
# dropped (presumably the `B-`/`I-` prefix; a bare 'O' becomes '').
refs = [str(label[2:]) for sent in tst_data.tagged_sents() for _, label in sent]

# Hypothesis labels: the HMM's tag for every test token, trimmed the same way.
hyps = [str(label[2:]) for sent in tst_data.sents() for _, label in hmm_tagger.tag(sent)]

# Label set for the F1 computation: the concepts with the same two characters removed.
tags = [concept[2:] for concept in concepts]

# Token-level accuracy as reported by the tagger itself.
accuracy = hmm_tagger.evaluate(tst_data.tagged_sents())
print("Accuracy: {:6.3f}".format(100 * accuracy))

# Weighted F1 restricted to the concept labels above.
f1 = metrics.f1_score(
    refs,
    hyps,
    average="weighted",
    labels=np.unique(tags),
    zero_division=0,
)
print("F1 score: {:6.3f}".format(100 * f1))

Accuracy: 90.867
F1 score: 77.470


### IOB-Tags NgramTagger

In [12]:
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    """Train a chain of taggers in which each one backs off to the previous one.

    Args:
        tagged_sents: Training data handed unchanged to every tagger constructor.
        tagger_classes: Sequence of tagger classes, ordered from the innermost
            fallback to the outermost tagger. Each class is called as
            ``cls(tagged_sents, backoff=previous)``. The sequence is not modified.
        backoff: Optional already-built tagger used as the innermost fallback.
            When omitted, the first class in ``tagger_classes`` is instantiated
            as ``cls(tagged_sents)`` and plays that role.

    Returns:
        The outermost tagger (an instance of the last class used), or ``backoff``
        itself when ``tagger_classes`` is empty.

    Raises:
        IndexError: If ``tagger_classes`` is empty and no ``backoff`` is given.
    """
    # Work on a private copy so the caller's list is left untouched; deleting
    # from the argument itself would shorten a list the caller may reuse.
    remaining = list(tagger_classes)

    # Compare against None explicitly so a supplied tagger is never discarded
    # just because it happens to evaluate as falsy.
    if backoff is None:
        backoff = remaining.pop(0)(tagged_sents)

    for cls in remaining:
        backoff = cls(tagged_sents, backoff=backoff)

    return backoff

In [16]:
# Score stand-alone n-gram taggers (no backoff) for every n-gram order / cutoff
# pair, then a Unigram -> Bigram -> Trigram chain that falls back to a constant
# 'O' tagger.
ngram = [1,2,3]
cutoff = [0,1]

# The reference labels depend only on the test data, so build them once rather
# than once per trained tagger.
refs = [str(t[1][2:]) for s in tst_data.tagged_sents() for t in s]

for co in cutoff:
    for ngo in ngram:
        nm = NgramTagger(ngo, trn_data.tagged_sents(), cutoff=co)
        acc = nm.evaluate(tst_data.tagged_sents())

        # A predicted tag may be None here; keep it as the string 'None'
        # instead of slicing it, and trim the first two characters otherwise.
        hyps = [str(t[1][2:]) if t[1] is not None else str(t[1]) for s in tst_data.sents() for t in nm.tag(s)]

        f1 = metrics.f1_score(refs, hyps, average="weighted", labels=np.unique(tags), zero_division=0)

        print("Cutoff: {}\tNgram: {}\tAcc: {:6.2f}\tF1 : {:6.2f}".format(co,ngo,acc*100,f1*100))
    print("\n")

# Combined tagger: every level backs off to the previous one, ending in 'O'.
backoff = DefaultTagger('O')
tag = backoff_tagger(trn_data.tagged_sents(),[UnigramTagger, BigramTagger, TrigramTagger],backoff=backoff)
acc = tag.evaluate(tst_data.tagged_sents())
hyps = [str(t[1][2:]) if t[1] is not None else str(t[1]) for s in tst_data.sents() for t in tag.tag(s)]
f1 = metrics.f1_score(refs, hyps, average="weighted", labels=np.unique(tags), zero_division=0)

print("\t\tCombined:\tAcc: {:6.2f}\tF1 : {:6.2f}".format(acc*100,f1*100))

Cutoff: 0	Ngram: 1	Acc:  87.62	F1 :  74.03
Cutoff: 0	Ngram: 2	Acc:  77.98	F1 :  70.07
Cutoff: 0	Ngram: 3	Acc:  75.48	F1 :  68.06


Cutoff: 1	Ngram: 1	Acc:  85.72	F1 :  71.44
Cutoff: 1	Ngram: 2	Acc:  71.59	F1 :  62.91
Cutoff: 1	Ngram: 3	Acc:  68.84	F1 :  60.47


		Combined:	Acc:  90.97	F1 :  78.76


### POS-Tags HMM

In [20]:
# Train an HMM tagger on the feature training file and score it on the test file.
trn = 'dataset/trn.feature.conll'

# Sorted list of the distinct non-'O' last-column values of the training file.
# NOTE(review): this is later used as the F1 label set -- confirm that the last
# column of the feature file holds the same tags the corpus reader exposes.
concepts = list(get_chunks(trn))
concepts.sort()

trn_data = ConllChunkCorpusReader('dataset/', r'trn.feature.conll', concepts)
tst_data = ConllChunkCorpusReader('dataset/', r'tst.feature.conll', concepts)

# Fit the HMM on the tagged training sentences.
hmm_model = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_model.train(trn_data.tagged_sents())

# Reference labels: gold tag of every test token, as a string.
refs = [str(label) for sent in tst_data.tagged_sents() for _, label in sent]

# Hypothesis labels: the HMM's tag for every test token, as a string.
hyps = [str(label) for sent in tst_data.sents() for _, label in hmm_tagger.tag(sent)]

# Labels excluded from the F1 label set.
special = ["''", ':', None]
tags = [label for label in concepts if label not in special]

# Token-level accuracy as reported by the tagger itself.
accuracy = hmm_tagger.evaluate(tst_data.tagged_sents())
print("Accuracy: {:6.3f}".format(100 * accuracy))

# Weighted F1 restricted to the filtered label set.
f1 = metrics.f1_score(
    refs,
    hyps,
    average='weighted',
    labels=np.unique(tags),
    zero_division=0,
)
print("F1 score: {:6.3f}".format(100 * f1))

Accuracy: 84.881
F1 score: 87.379


### POS-Tags NgramTagger

In [21]:
# Score stand-alone n-gram taggers (no backoff) for every n-gram order / cutoff
# pair, then a Unigram -> Bigram -> Trigram chain that falls back to a constant
# 'NN' tagger.
ngram = [1,2,3]
cutoff = [0,1]

# The reference labels depend only on the test data, so build them once rather
# than once per trained tagger.
refs = [str(t[1]) for s in tst_data.tagged_sents() for t in s]

for co in cutoff:
    for ngo in ngram:
        nm = NgramTagger(ngo, trn_data.tagged_sents(), cutoff=co)
        acc = nm.evaluate(tst_data.tagged_sents())

        # str() already maps a None tag to 'None', so no separate None branch
        # is needed (both branches of the old conditional were identical).
        hyps = [str(t[1]) for s in tst_data.sents() for t in nm.tag(s)]

        f1 = metrics.f1_score(refs, hyps, average="weighted", labels=np.unique(tags), zero_division=0)

        print("Cutoff: {}\tNgram: {}\tAcc: {:6.2f}\tF1 : {:6.2f}".format(co,ngo,acc*100,f1*100))
    print("\n")

# Combined tagger: every level backs off to the previous one, ending in 'NN'.
backoff = DefaultTagger('NN')
tag = backoff_tagger(trn_data.tagged_sents(),[UnigramTagger, BigramTagger, TrigramTagger],backoff=backoff)
acc = tag.evaluate(tst_data.tagged_sents())
hyps = [str(t[1]) for s in tst_data.sents() for t in tag.tag(s)]
f1 = metrics.f1_score(refs, hyps, average="weighted", labels=np.unique(tags), zero_division=0)

print("\t\tCombined:\tAcc: {:6.2f}\tF1 : {:6.2f}".format(acc*100,f1*100))

Cutoff: 0	Ngram: 1	Acc:  89.45	F1 :  90.62
Cutoff: 0	Ngram: 2	Acc:  70.37	F1 :  80.19
Cutoff: 0	Ngram: 3	Acc:  58.07	F1 :  71.00


Cutoff: 1	Ngram: 1	Acc:  87.42	F1 :  89.53
Cutoff: 1	Ngram: 2	Acc:  58.35	F1 :  71.03
Cutoff: 1	Ngram: 3	Acc:  49.05	F1 :  62.67


		Combined:	Acc:  93.31	F1 :  93.08
