From 67a9ed4bda82a47db60e123289adffa3b522ce07 Mon Sep 17 00:00:00 2001
From: Anindya Roy
Date: Mon, 27 Jan 2014 04:11:45 +0100
Subject: [PATCH] Added CRF approach.

---
 README                        |  3 ++-
 scripts/input1.tkn.wseg1      |  2 +-
 scripts/runExperiments.py     |  5 ++++-
 scripts/vn_tokens_evaluate.py | 10 +++++-----
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/README b/README
index beb6376..0581c3b 100644
--- a/README
+++ b/README
@@ -1,4 +1,5 @@
 Project: Vietnamese tokenization.
+Phase I: Maximum Matching algorithm.
 
 Anindya Roy, 22-1-14.
 
@@ -116,7 +117,7 @@ Note that for the cross-validation experiments, for each run, the train file of
 
 There are three main scripts in this repository in the /scripts/ folder.
 
-* vn_tokenizer.py - I wrote this script to implement the enhanced Maximum Matching algorithm (MM+) to tokenize raw Vietnamese text. In the output, the tokens are indicated by being delimited by square brackets []. Please run this script from within the /scripts/ folder. Usage directions may be obtained by trying to run the script without arguments. In brief, the syntax is:
+* vn_tokenizer.py - I wrote this script to implement the enhanced Maximum Matching algorithm (MM+) to tokenize raw Vietnamese text. In the output, the tokens are indicated by being delimited by square brackets []. Please run this script from within the /scripts/ folder. Usage directions may be obtained by running the script without arguments. In brief, the syntax is:
 
 ./vn_tokenizer.py
 
diff --git a/scripts/input1.tkn.wseg1 b/scripts/input1.tkn.wseg1
index 08c80d6..8692b0c 100644
--- a/scripts/input1.tkn.wseg1
+++ b/scripts/input1.tkn.wseg1
@@ -7,4 +7,4 @@
 [VeriSign] [phải] [khởi động] [dự án] [mang] [tên] [Titan] [để] [mở rộng] [dung lượng] [của] [hệ thống] , [đáp ứng] [được] [4] [nghìn] [tỷ] [thắc mắc/ngày] [vào] [năm] [2010] .
 [Tăng] [dung lượng] [máy chủ] [cũng] [là] [một] [biện pháp] [đối phó] [với] [nguy cơ] [tấn công] [từ chối] [dịch vụ] .
 [Nếu] [để] [xảy ra] [tình trạng] [này] , [toàn bộ] [các] [trang] [mà] [VeriSign] [quản lý] [sẽ] [``] [chết đứng] [''] .
-[T. H.] ( [theo] [AP] ) [T. H.] [Nhân]
+[T.] [H.] ( [theo] [AP] ) [T.] [H.] [Nhân]
diff --git a/scripts/runExperiments.py b/scripts/runExperiments.py
index 5ab8df0..3a1f816 100755
--- a/scripts/runExperiments.py
+++ b/scripts/runExperiments.py
@@ -28,7 +28,7 @@
 USE_MM = 0 # Use Maximum Matching (MM) algorithm.
 USE_ENHANCED_MM = 1 # Use Enhanced MM algorithm
 
-DIR0 = '../' # .../Roy_VnTokenizer/.
+DIR0 = '../' # Top level directory: Roy_VnTokenizer/.
 
 # Values over cross-validation runs.
 P_ = 0 # Precision.
@@ -136,6 +136,7 @@
         line = re.sub('@@', '', line)
         if line in ['Proverb', 'Idiom']:
             del words_lex[-1] # Remove last word added.
+    f.close()
 
 # Reading locations file.
 if USE_LOCATIONS_FILE:
@@ -144,6 +145,7 @@
     for line in f:
         word = ' '.join(line.split())
         words_lex.append(word)
+    f.close()
 
 # Reading person names file.
 if USE_PERSONNAMES_FILE:
@@ -152,6 +154,7 @@
     for line in f:
         word = ' '.join(line.split())
         words_lex.append(word)
+    f.close()
 
 if 1:
     words_lex = list(set(words_lex))
diff --git a/scripts/vn_tokens_evaluate.py b/scripts/vn_tokens_evaluate.py
index e2fbf30..5fb34a1 100755
--- a/scripts/vn_tokens_evaluate.py
+++ b/scripts/vn_tokens_evaluate.py
@@ -18,13 +18,13 @@
     print '\n'
     print "Vietnamese tokenizer evaluation. Version 0.1."
     print "============================================="
-    print "Usage: ./vn_tokens_evaluate.py <hyp file> <ref file>"
-    print "<hyp file> and <ref file> are mandatory inputs."
+    print "Usage: ./vn_tokens_evaluate.py <ref file> <hyp file>"
+    print "<ref file> and <hyp file> are mandatory inputs."
print "The hyp file must contain hypothesized tokenization." print "The ref file must contain reference tokenization." print "All tokens must be surrounded by square brackets []." + '\n' exit() - if n_args == 3: + if n_args >= 3: input_file_name = sys.argv[1] output_file_name = sys.argv[2] @@ -36,11 +36,11 @@ import os if not os.path.isfile(input_file_name): - print 'hyp file "' + input_file_name + '" does not exist. Retry with a valid file name.' + print 'ref file "' + input_file_name + '" does not exist. Retry with a valid file name.' exit(1) if not os.path.isfile(output_file_name): - print 'ref file "' + output_file_name + '" does not exist. Retry with a valid file name.' + print 'hyp file "' + output_file_name + '" does not exist. Retry with a valid file name.' exit(1)