# coding: utf-8
from __future__ import print_function, unicode_literals
import codecs, subprocess, random
from collections import Counter
from itertools import islice
from nltk.tag import untag
from sklearn.model_selection import train_test_split
from hazm import *
from hazm.Chunker import tree2brackets
from hazm.PeykareReader import coarse_pos_e as peykare_coarse_pos_e
from hazm.DadeganReader import coarse_pos_e as dadegan_coarse_pos_e

def create_words_file(dic_file='resources/persian.dic', output='hazm/data/words.dat'):
    """ prepares the list of Persian words from the [Virastyar](https://sourceforge.net/projects/virastyar/) dic file. """
    dic_words = [line.strip().replace(', ', ',').split('\t') for line in codecs.open(dic_file, encoding='utf-8') if len(line.strip().split('\t')) == 3]
    dic_words = filter(lambda item: not item[2].startswith('V') and 'NEG' not in item[2], dic_words)
    dic_words = ['\t'.join(item) for item in sorted(dic_words, key=lambda item: item[0])]
    print(*dic_words, sep='\n', file=codecs.open(output, 'w', 'utf-8'))
    print(output, 'created')
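
# Compares Lemmatizer output with the gold lemmas of the Dadegan treebank and logs
# mismatches, then lists Peykare verbs that the lemmatizer leaves unchanged (missed).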
def evaluate_lemmatizer(conll_file='resources/train.conll', peykare_root='corpora/peykare'):
    lemmatizer = Lemmatizer()

    errors = []
    with codecs.open('resources/lemmatizer_errors.txt', 'w', 'utf8') as output:
        dadegan = DadeganReader(conll_file)
        for tree in dadegan.trees():
            for node in list(tree.nodes.values())[1:]:
                word, lemma, pos = node['word'], node['lemma'], node['mtag']
                if lemmatizer.lemmatize(word, pos) != lemma:
                    errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos)))
        print(len(errors), 'errors', file=output)
        counter = Counter(errors)
        for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
            print(count, *item, file=output)

    missed = []
    with codecs.open('resources/lemmatizer_missed.txt', 'w', 'utf8') as output:
        peykare = PeykareReader(peykare_root)
        for sentence in peykare.sents():
            for word in sentence:
                if word[1] == 'V':
                    if word[0] == lemmatizer.lemmatize(word[0]):
                        missed.append(word[0])
        print(len(missed), 'missed', file=output)
        counter = Counter(missed)
        for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
            print(count, item, file=output)
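
# Runs two Normalizer configurations (affix-spacing vs. token-based) over a roughly 1%
# random sample of TNews texts and writes the normalized sentences to separate files
# for manual comparison.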
def evaluate_normalizer(tnews_root='corpora/tnews'):
    tnews = TNewsReader(root=tnews_root)
    normalizer = Normalizer(persian_style=False, persian_numbers=False, remove_diacritics=False, token_based=False, affix_spacing=True)
    token_normalizer = Normalizer(persian_style=False, persian_numbers=False, remove_diacritics=False, token_based=True, affix_spacing=False)

    with codecs.open('resources/normalized.txt', 'w', 'utf8') as output1, codecs.open('resources/normalized_token_based.txt', 'w', 'utf8') as output2:
        random.seed(0)
        for text in tnews.texts():
            if random.randint(0, 100) != 0:
                continue
            for sentence in sent_tokenize(text):
                print(normalizer.normalize(sentence), '\n', file=output1)
                print(token_normalizer.normalize(sentence), '\n', file=output2)
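
# Prints each SentiPers comment sentence normalized with the standard Normalizer and then
# with the InformalNormalizer, so the two outputs can be inspected side by side.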
def evaluate_informal_normalizer(sentipers_root='corpora/sentipers'):
    sentipers = SentiPersReader(root=sentipers_root)
    normalizer = Normalizer()
    informal_normalizer = InformalNormalizer()

    output = codecs.open('resources/normalized.txt', 'w', 'utf8')
    for comments in sentipers.comments():
        for comment in comments:
            for sentence in comment:
                print(normalizer.normalize(sentence), file=output)
                sents = informal_normalizer.normalize(sentence)
                sents = [[word[0] for word in sent] for sent in sents]
                sents = [' '.join(sent) for sent in sents]
                text = '\n'.join(sents)
                text = normalizer.normalize(text)
                print(text, file=output)
                print(file=output)
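
# Evaluates the pretrained Chunker against the treebank's gold chunked trees and writes
# bracketed parses of every mismatching sentence to resources/chunker_errors.txt.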
def evaluate_chunker(treebank_root='corpora/treebank'):
    treebank = TreebankReader(treebank_root, join_clitics=True, join_verb_parts=True)
    chunker = Chunker()
    chunked_trees = list(treebank.chunked_trees())
    print(chunker.evaluate(chunked_trees))

    output = codecs.open('resources/chunker_errors.txt', 'w', 'utf8')
    for sentence, gold in zip(treebank.sents(), chunked_trees):
        chunked = chunker.parse(sentence)
        if chunked != gold:
            print(tree2brackets(chunked), file=output)
            print(tree2brackets(gold), file=output)
            print(file=output)
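
# Trains the CRF POS tagger on Peykare with wapiti-style feature templates (surrounding
# words, character prefixes/suffixes, punctuation and digit tests), then reports accuracy
# on a held-out split.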
def train_postagger(peykare_root='corpora/peykare', model_file='resources/postagger.model', test_size=.1, sents_limit=None, pos_map=peykare_coarse_pos_e):
    tagger = POSTagger(type='crf', algo='rprop', compact=True, patterns=[
        '*',
        'u:wll=%x[-2,0]',
        'u:wl=%x[-1,0]',
        'u:w=%x[0,0]',
        'u:wr=%x[1,0]',
        'u:wrr=%x[2,0]',
        # 'u:w2l=%x[-1,0]/%x[0,0]',
        # 'u:w2r=%x[0,0]/%x[1,0]',
        '*:p1=%m[0,0,"^.?"]',
        '*:p2=%m[0,0,"^.?.?"]',
        '*:p3=%m[0,0,"^.?.?.?"]',
        '*:s1=%m[0,0,".?$"]',
        '*:s2=%m[0,0,".?.?$"]',
        '*:s3=%m[0,0,".?.?.?$"]',
        r'*:p?l=%t[-1,0,"\p"]',
        r'*:p?=%t[0,0,"\p"]',
        r'*:p?r=%t[1,0,"\p"]',
        r'*:p?a=%t[0,0,"^\p*$"]',
        r'*:n?l=%t[-1,0,"\d"]',
        r'*:n?=%t[0,0,"\d"]',
        r'*:n?r=%t[1,0,"\d"]',
        r'*:n?a=%t[0,0,"^\d*$"]',
    ])

    peykare = PeykareReader(peykare_root, pos_map=pos_map)
    train_sents, test_sents = train_test_split(list(islice(peykare.sents(), sents_limit)), test_size=test_size, random_state=0)

    tagger.train(train_sents)
    tagger.save_model(model_file)
    print(tagger.evaluate(test_sents))
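
# Trains the CRF Chunker on Dadegan chunked trees whose leaves are re-tagged with the
# trained POS tagger (so training matches the tags seen at prediction time), then
# evaluates on the test split. dev_file is currently unused.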
def train_chunker(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='resources/chunker.model'):
    tagger = POSTagger(model='resources/postagger.model')
    chunker = Chunker(type='crf', algo='l-bfgs', compact=True, patterns=[
        '*',
        'u:wll=%x[-2,0]',
        'u:wl=%x[-1,0]',
        'u:w=%x[0,0]',
        'u:wr=%x[1,0]',
        'u:wrr=%x[2,0]',
        '*:tll=%x[-2,1]',
        '*:tl=%x[-1,1]',
        '*:t=%x[0,1]',
        '*:tr=%x[1,1]',
        '*:trr=%x[2,1]',
    ])

    def retag_trees(trees, sents):
        for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))):
            for (n, word) in zip(tree.treepositions('leaves'), sentence):
                tree[n] = word

    train, test = DadeganReader(train_file), DadeganReader(test_file)
    train_trees = list(train.chunked_trees())
    retag_trees(train_trees, train.sents())
    chunker.train(train_trees)
    chunker.save_model(model_file)

    test_trees = list(test.chunked_trees())
    retag_trees(test_trees, test.sents())
    print(chunker.evaluate(test_trees))
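
# Builds a MaltParser training file in CoNLL format from Dadegan (re-tagged with the
# trained POS tagger and lemmatized), trains a model with malt.jar, then parses the test
# sentences and scores them with MaltEval.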
def train_maltparser(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/malt-options.xml', features_file='resources/malt-features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'):
    lemmatizer, tagger = Lemmatizer(), POSTagger(model='resources/postagger.model')
    train, test = DadeganReader(train_file), DadeganReader(test_file)

    train_data = train_file + '.data'
    with codecs.open(train_data, 'w', 'utf8') as output:
        for tree, sentence in zip(train.trees(), tagger.tag_sents(map(untag, train.sents()))):
            for i, (node, word) in enumerate(zip(list(tree.nodes.values())[1:], sentence), start=1):
                node['mtag'] = word[1]
                node['lemma'] = lemmatizer.lemmatize(node['word'], node['mtag'])
                print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['mtag'], node['mtag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
            print(file=output)

    subprocess.Popen(['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']).wait()

    # evaluation
    parser = MaltParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
    parsed_trees = parser.parse_sents(map(untag, test.sents()))

    test_data, test_results = test_file + '.data', test_file + '.results'
    print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
    print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), file=codecs.open(test_results, 'w', 'utf8'))
    subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]).wait()
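
# Same pipeline as train_maltparser, but trains and evaluates a TurboParser model with
# the TurboParser binary, with MaltEval reporting LAS/UAS.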
def train_turboparser(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='resources/turboparser.model'):
    lemmatizer, tagger = Lemmatizer(), POSTagger(model='resources/postagger.model')
    train, test = DadeganReader(train_file), DadeganReader(test_file)

    train_data = train_file + '.data'
    with codecs.open(train_data, 'w', 'utf8') as output:
        for tree, sentence in zip(train.trees(), tagger.tag_sents(map(untag, train.sents()))):
            for i, (node, word) in enumerate(zip(list(tree.nodes.values())[1:], sentence), start=1):
                node['mtag'] = word[1]
                node['lemma'] = lemmatizer.lemmatize(node['word'], node['mtag'])
                print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['mtag'], node['mtag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output)
            print(file=output)

    subprocess.Popen(['./resources/TurboParser', '--train', '--file_train='+train_data, '--file_model='+model_file, '--logtostderr']).wait()

    # evaluation
    parser = TurboParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
    parsed_trees = parser.parse_sents(map(untag, test.sents()))

    test_data, test_results = test_file + '.data', test_file + '.results'
    print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8'))
    print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), file=codecs.open(test_results, 'w', 'utf8'))
    subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results, '--pattern', '0.####', '--Metric', 'LAS;UAS']).wait()
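
# Writes Peykare training sentences in word/tag format, trains a Stanford MaxentTagger
# via stanford-postagger.jar, and evaluates the resulting model on the held-out split.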
def train_stanford_postagger(peykare_root='corpora/peykare', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/stanford-postagger.props', memory_min='-Xms1g', memory_max='-Xmx6g', test_size=.1, pos_map=peykare_coarse_pos_e):
    peykare = PeykareReader(peykare_root, pos_map=pos_map)
    train_file = 'resources/tagger_train_data.txt'
    train, test = train_test_split(list(peykare.sents()), test_size=test_size, random_state=0)

    with codecs.open(train_file, 'w', 'utf8') as output:
        for sentence in train:
            print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output)

    subprocess.Popen(['java', memory_min, memory_max, '-classpath', path_to_jar, 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file, '-model', path_to_model, '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']).wait()

    tagger = StanfordPOSTagger(path_to_jar=path_to_jar, path_to_model=path_to_model)
    print(tagger.evaluate(test))