Added CRF approach.

roy-a · Jan 27, 2014 · 67a9ed4
1 parent cb408ab
commit 67a9ed4
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 8 deletions.
diff --git a/README b/README
@@ -1,4 +1,5 @@
 Project: Vietnamese tokenization.
+Phase I: Maximum Matching algorithm.
 Anindya Roy,
 22-1-14.
 
@@ -116,7 +117,7 @@ Note that for the cross-validation experiments, for each run, the train file of
 
 There are three main scripts in this repository in the /scripts/ folder.
 
-* vn_tokenizer.py - I wrote this script to implement the enhanced Maximum Matching algorithm (MM+) to tokenize raw Vietnamese text. In the output, the tokens are indicated by being delimited by square brackets []. Please run this script from within the /scripts/ folder. Usage directions may be obtained by trying to run the script without arguments. In brief, the syntax is:
+* vn_tokenizer.py - I wrote this script to implement the enhanced Maximum Matching algorithm (MM+) to tokenize raw Vietnamese text. In the output, the tokens are indicated by being delimited by square brackets []. Please run this script from within the /scripts/ folder. Usage directions may be obtained by running the script without arguments. In brief, the syntax is:
 
 ./vn_tokenizer.py <input file name> <output file name> <model file name>
 

diff --git a/scripts/input1.tkn.wseg1 b/scripts/input1.tkn.wseg1
@@ -7,4 +7,4 @@
 [VeriSign] [phải] [khởi động] [dự án] [mang] [tên] [Titan] [để] [mở rộng] [dung lượng] [của] [hệ thống] , [đáp ứng] [được] [4] [nghìn] [tỷ] [thắc mắc/ngày] [vào] [năm] [2010] .
 [Tăng] [dung lượng] [máy chủ] [cũng] [là] [một] [biện pháp] [đối phó] [với] [nguy cơ] [tấn công] [từ chối] [dịch vụ] .
 [Nếu] [để] [xảy ra] [tình trạng] [này] , [toàn bộ] [các] [trang] [mà] [VeriSign] [quản lý] [sẽ] [``] [chết đứng] [''] .
-[T. H.] ( [theo] [AP] ) [T. H.] [Nhân]
+[T.] [H.] ( [theo] [AP] ) [T.] [H.] [Nhân]
diff --git a/scripts/runExperiments.py b/scripts/runExperiments.py
@@ -28,7 +28,7 @@
 USE_MM = 0 # Use Maximum Matching (MM) algorithm.
 USE_ENHANCED_MM = 1 # Use Enhanced MM algorithm
 
-DIR0 = '../' # .../Roy_VnTokenizer/.
+DIR0 = '../' # Top level directory: Roy_VnTokenizer/.
 
 # Values over cross-validation runs.
 P_ = 0 # Precision.
@@ -136,6 +136,7 @@
 			line = re.sub('@@', '', line)
 			if line in ['Proverb', 'Idiom']:
 				del words_lex[-1] # Remove last word added.
+	f.close()
 
     # Reading locations file.
     if USE_LOCATIONS_FILE:
@@ -144,6 +145,7 @@
 	for line in f:
 		word = ' '.join(line.split())
 		words_lex.append(word)
+	f.close()
 
     # Reading person names file.
     if USE_PERSONNAMES_FILE:
@@ -152,6 +154,7 @@
 	for line in f:
 		word = ' '.join(line.split())
 		words_lex.append(word)
+	f.close()
 
     if 1:
 	words_lex = list(set(words_lex))

diff --git a/scripts/vn_tokens_evaluate.py b/scripts/vn_tokens_evaluate.py
@@ -18,13 +18,13 @@
 		print '\n'
 		print "Vietnamese tokenizer evaluation. Version 0.1."
 		print "============================================="
-		print "Usage: ./vn_tokens_evaluate.py <hyp file name> <ref file name>"
-		print "<hyp file name> and <ref file name> are mandatory inputs."
+		print "Usage: ./vn_tokens_evaluate.py <ref file name> <hyp file name>"
+		print "<ref file name> and <hyp file name> are mandatory inputs."
 		print "The hyp file must contain hypothesized tokenization."
 		print "The ref file must contain reference tokenization."
 		print "All tokens must be surrounded by square brackets []." + '\n'
 		exit()
-	if n_args == 3:
+	if n_args >= 3:
 		input_file_name = sys.argv[1]
 		output_file_name = sys.argv[2]
 
@@ -36,11 +36,11 @@
 
 import os
 if not os.path.isfile(input_file_name):
-	print 'hyp file "' + input_file_name + '" does not exist. Retry with a valid file name.'
+	print 'ref file "' + input_file_name + '" does not exist. Retry with a valid file name.'
 	exit(1)
 
 if not os.path.isfile(output_file_name):
-	print 'ref file "' + output_file_name + '" does not exist. Retry with a valid file name.'
+	print 'hyp file "' + output_file_name + '" does not exist. Retry with a valid file name.'
 	exit(1)