In [1]:
import json
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
bert_tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
t5_tokenizer = AutoTokenizer.from_pretrained('megagonlabs/t5-base-japanese-web')

In [3]:
def print_scores(df_name:str,df):
	value_count = df['result'].value_counts()
	tp = value_count['tp']
	fp = value_count['fp']
	fn = value_count['fn']
	
	precision = tp/(tp + fp)
	recall = tp/(tp + fn + fp)
	
	f1 = 2*recall*precision/(recall + precision)
	print(f'DataFrame = {df_name}\nprecision : {precision} ({tp}/{tp + fp})\nrecall : {recall} ({tp}/{len(df)})\nf1 : {f1}\n')


In [15]:
def make_dfdict(decoded_path:str) ->dict:
	f = open(decoded_path,mode='r')
	df_psa = pd.read_json(f,orient='records',lines=True)
	df_psa['result'] = 'tn'
	for index,row in df_psa.iterrows():
		if(row['output'] == ''):
			if(row['arg_type'] == 'null'):
				continue
			else:
				df_psa.at[index,'result'] = 'fn'
		else:
			output_token = row['output'].translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) #半角文字を全角文字に変換
			gold_arguments = [gold.translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) for gold in row['gold_arguments']]

			# subword_gold_argument = [bert_tokenizer.tokenize(argument)[-1] for argument in gold_arguments]
			in_flag = False
			# if(bert_tokenizer.tokenize(output_token)[-1] in subword_gold_argument):
			# if(output_token in gold_arguments):

			# subword_gold_argument = [bert_tokenizer.tokenize(argument)[0].replace('#','').translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) for argument in gold_arguments]
			# output_sub_char = bert_tokenizer.tokenize(output_token)[0].replace('#','').translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
			# if (output_sub_char in subword_gold_argument):
			# 	in_flag = True

			for gold_argument in gold_arguments:
				if(output_token in gold_argument):
					in_flag = True
			for gold_argument in gold_arguments:
				if(gold_argument in output_token):
					in_flag = True

			if(in_flag):
				df_psa.at[index,'result'] = 'tp'
			else:
				df_psa.at[index,'result'] = 'fp'
	
	df_dict = {
		'df_dep' : df_psa.query('arg_type == "dep"'),
		'df_intra' : df_psa.query('arg_type == "intra"'),
		'df_inter' : df_psa.query('arg_type == "inter"'),
		'df_dep_passive' : df_psa.query('arg_type == "dep" and alt_type == "passive"'),
		'df_intra_passive' : df_psa.query('arg_type == "intra" and alt_type == "passive"'),
		'df_inter_passive' : df_psa.query('arg_type == "inter" and alt_type == "passive"'),
		'df_exo' : df_psa.query('arg_type == "exog" or arg_type == "exo1" or arg_type == "exo2"'),
		# 'df_zero': df_psa.query('case_type != "null" and case_type != "exog" and case_type != "exo1" and case_type != "exo2" and case_type != "dep"'),
		'df_zero': df_psa.query('arg_type != "null" and arg_type != "dep"'),
	}
	
	return df_dict

In [None]:
juman_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_juman3.jsonl')

In [16]:
# mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_closest.jsonl')
mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/suffix.jsonl')

In [23]:
mecab_dfdict['df_exo'].query('result == "fp"')[0:50]

Unnamed: 0,output,gold_arguments,case_name,arg_type,predicate,alt_type,output_sentence,result
12,<extra_id_99>,[<extra_id_99>],ga,exog,決定さ,passive,<pad> <extra_id_99> が方針を決定さ</s> <pad> <pad>,fp
18,<extra_id_99>,[<extra_id_99>],ga,exog,行わ,passive,<pad> <extra_id_99> が転換を行わ</s> <pad> <pad> <pad>,fp
33,<extra_id_98>,[<extra_id_98>],ga,exo1,歓迎し,active,<pad> <extra_id_98> が方針を歓迎し</s> <pad>,fp
48,<extra_id_98>,[<extra_id_99>],ga,exog,いえる,active,<pad> <extra_id_98> がいえる</s> <pad> <pad>,fp
90,圏,[<extra_id_99>],ga,exog,膨れ上がった,active,<pad> 圏が膨れ上がった</s>,fp
114,<extra_id_99>,[<extra_id_99>],ga,exog,取って,active,<pad> <extra_id_99> が年を取って</s> <pad> <pad> <pad>,fp
117,<extra_id_99>,[<extra_id_99>],ga,exog,取り仕切る,active,<pad> <extra_id_99> が葬祭を取り仕切る</s>,fp
123,<extra_id_99>,[<extra_id_99>],ga,exog,離れる,active,<pad> <extra_id_99> が会社を離れる</s> <pad> <pad> <pad>,fp
126,<extra_id_99>,[<extra_id_99>],ga,exog,扱わ,passive,<pad> <extra_id_99> が<extra_id_99> を扱わ</s> <pad>,fp
127,<extra_id_99>,[<extra_id_99>],o,exog,扱わ,passive,<pad> <extra_id_99> が<extra_id_99> を扱わ</s> <pad>,fp


In [None]:
df_juman_inter = juman_dfdict['df_inter']
df_juman_inter = df_juman_inter.rename(columns={"result":"juman_result","output_token":"juman_output_token"})
df_juman_inter = df_juman_inter.loc[:,['juman_output_token','output_sentence','juman_result']]


In [None]:
df_mecab_inter = mecab_dfdict['df_inter']

In [None]:
df_merge = df_mecab_inter.join(df_juman_inter)

In [20]:
for df_name,df in mecab_dfdict.items():
	print_scores(df_name,df)

DataFrame = df_dep
precision : 0.9005078204346942 (22166/24615)
recall : 0.8651496819015652 (22166/25621)
f1 : 0.882474719324787

DataFrame = df_intra
precision : 0.6767730496453901 (3817/5640)
recall : 0.6302840158520475 (3817/6056)
f1 : 0.6527017783857729

DataFrame = df_inter
precision : 0.4704028809363043 (2090/4443)
recall : 0.4393525331091024 (2090/4757)
f1 : 0.45434782608695645

DataFrame = df_dep_passive
precision : 0.9153754469606674 (1536/1678)
recall : 0.8668171557562077 (1536/1772)
f1 : 0.8904347826086956

DataFrame = df_intra_passive
precision : 0.5829145728643216 (232/398)
recall : 0.5345622119815668 (232/434)
f1 : 0.5576923076923077

DataFrame = df_inter_passive
precision : 0.315035799522673 (132/419)
recall : 0.2907488986784141 (132/454)
f1 : 0.3024054982817869



KeyError: 'tp'