In [2]:
import json
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Tokenizer for the 'cl-tohoku/bert-base-japanese' checkpoint; make_dfdict uses it
# to compare predicted and gold arguments by their final subword.
bert_tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

In [4]:
def print_scores(df_name:str,df):
	"""Print precision, recall and F1 computed from the ``result`` column of ``df``.

	Args:
		df_name: Label printed in the report header.
		df: DataFrame with a ``result`` column whose values include 'tp', 'fp'
			and/or 'fn'. Any other label (e.g. 'tn') is ignored.

	Notes:
		Recall is computed as tp / (tp + fp + fn), so a wrong prediction (fp)
		also counts against recall.
		NOTE(review): this differs from the textbook tp / (tp + fn); presumably
		every scored row is one gold instance -- confirm this is intended.

		A score whose denominator is zero is reported as 0.0.
	"""
	counts = df['result'].value_counts()
	# A label that never occurs is missing from value_counts(); default it to 0
	# instead of raising KeyError (e.g. a subset without any 'fn' row).
	tp = int(counts.get('tp', 0))
	fp = int(counts.get('fp', 0))
	fn = int(counts.get('fn', 0))

	predicted_total = tp + fp
	scored_total = tp + fp + fn

	precision = tp/predicted_total if predicted_total else 0.0
	recall = tp/scored_total if scored_total else 0.0

	score_sum = recall + precision
	f1 = 2*recall*precision/score_sum if score_sum else 0.0
	# Show the denominator that was actually used for recall so the printed
	# fraction always matches the printed value, even when 'tn' rows are present.
	print(f'DataFrame = {df_name}\nprecision : {precision} ({tp}/{predicted_total})\nrecall : {recall} ({tp}/{scored_total})\nf1 : {f1}\n')


In [5]:
# Translation table mapping half-width ASCII '!'..'~' (U+0021-U+007E) onto the
# full-width forms U+FF01-U+FF5E. Built once rather than once per row.
_HALF_TO_FULL_WIDTH = str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})


def _last_subword(text:str):
	"""Return the last BERT subword of ``text`` after full-width conversion, or None if it tokenizes to nothing."""
	subwords = bert_tokenizer.tokenize(text.translate(_HALF_TO_FULL_WIDTH))
	return subwords[-1] if subwords else None


def _classify_prediction(output_token:str,gold_arguments,case_type:str) -> str:
	"""Label one decoded row as 'tp', 'fp', 'fn' or 'tn'."""
	if output_token == '':
		# Nothing was predicted: correct only when the gold case type is 'null'.
		return 'tn' if case_type == 'null' else 'fn'

	predicted = _last_subword(output_token)
	if predicted is None:
		# Something was output but it yields no subword (e.g. whitespace only),
		# so it cannot match any gold argument.
		return 'fp'

	# A prediction is correct when its final subword equals the final subword
	# of any gold argument.
	gold_subwords = {_last_subword(gold) for gold in gold_arguments}
	return 'tp' if predicted in gold_subwords else 'fp'


def make_dfdict(decoded_path:str) ->dict:
	"""Load decoded predictions, judge every row, and split them into subsets.

	Args:
		decoded_path: Path to a JSON Lines file. Each record must provide at
			least ``output_token``, ``gold_arguments``, ``case_type`` and
			``alt_type``.

	Returns:
		dict mapping a subset name to a DataFrame that carries an added
		``result`` column ('tp', 'fp', 'fn' or 'tn'):
		``df_dep`` / ``df_intra`` / ``df_inter`` select one ``case_type`` each,
		the ``*_passive`` variants additionally require ``alt_type == "passive"``,
		``df_exo`` holds case types 'exog', 'exo1' and 'exo2', and ``df_zero``
		holds every row whose case type is neither 'null' nor 'dep'.
	"""
	# Close the file deterministically and do not depend on the locale encoding.
	with open(decoded_path,mode='r',encoding='utf-8') as f:
		df_psa = pd.read_json(f,orient='records',lines=True)

	df_psa['result'] = [
		_classify_prediction(output_token,gold_arguments,case_type)
		for output_token,gold_arguments,case_type
		in zip(df_psa['output_token'],df_psa['gold_arguments'],df_psa['case_type'])
	]

	df_dict = {
		'df_dep' : df_psa.query('case_type == "dep"'),
		'df_intra' : df_psa.query('case_type == "intra"'),
		'df_inter' : df_psa.query('case_type == "inter"'),
		'df_dep_passive' : df_psa.query('case_type == "dep" and alt_type == "passive"'),
		'df_intra_passive' : df_psa.query('case_type == "intra" and alt_type == "passive"'),
		'df_inter_passive' : df_psa.query('case_type == "inter" and alt_type == "passive"'),
		'df_exo' : df_psa.query('case_type == "exog" or case_type == "exo1" or case_type == "exo2"'),
		# The exo* case types are deliberately kept in df_zero.
		'df_zero': df_psa.query('case_type != "null" and case_type != "dep"'),
	}

	return df_dict

In [None]:
# Judge the decoded output stored in decoded_psa_juman3.jsonl and split it into
# per-case-type subsets (presumably the JUMAN-segmented run, judging by the file name).
# NOTE(review): this cell has no execution count, yet the df_juman_inter cell reads juman_dfdict.
juman_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_juman3.jsonl')

In [6]:
# Judge the decoded output and split it into per-case-type subsets.
# Alternative input file, currently disabled:
# mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_closest.jsonl')
mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_singleArg.jsonl')

In [9]:
# First 50 rows of the case_type == "dep" subset that were judged 'fp', for manual inspection.
mecab_dfdict['df_dep'].loc[lambda frame: frame['result'] == 'fp'].head(50)

Unnamed: 0,output_token,gold_arguments,case_name,case_type,predicate,alt_type,output_sentence,input_tokens,result
3,<extra_id_98>,[鳥],ga,dep,目立つ,active,<pad> <extra_id_98> が目立つ</s> <pad> <pad> <pad>,が格:東京・上野の不忍池で、無残な姿の鳥が<extra_id_0>目立つ<extra_id_...,fp
10,糸,[釣り糸],o,dep,引っ掛けて,active,<pad> 糸を引っ掛けて</s>,を格:東京・上野の不忍池で、無残な姿の鳥が目立つ。片足が切れたユリカモメ釣り糸を<extra...,fp
78,支援,[企業],ga,dep,目指す,active,<pad> 支援が目指す</s>,が格:村山連立政権の最重要政策課題になっている特殊法人の整理・合理化で、通産省は日本貿易振興...,fp
189,両党,[会],ga,dep,検討する,active,<pad> 両党が検討する</s>,が格:村山富市首相とさきがけ代表の武村正義蔵相による社会・さきがけ党首会談が、外遊中の武村蔵...,fp
270,省,[航空],ga,dep,申請さ,passive,<pad> 省が申請さ</s>,が格:一九五二年以来の不平等が続いている「日米航空協定」の平等化を実現するため、政府が米側に...,fp
398,もえ,[三つどもえ],ni,dep,なり,active,<pad> つどもえになり</s>,に格:社会党を離党した伊東秀子衆院議員は十三日、札幌市で記者会見し、今春の北海道知事選への立...,fp
402,お年玉,[平均],ga,dep,円,active,<pad> お年玉が円</s>,が格:今年の正月、東京都内の小学４—６年生がもらったお年玉は平均２万７１７１<extra_i...,fp
432,行,"[銀行, 同行]",ga,dep,歓迎して,active,<pad> 同行が歓迎して</s> <pad>,が格:今年の正月、東京都内の小学４—６年生がもらったお年玉は平均２万７１７１円で、前年比アッ...,fp
471,用品,[品],ga,dep,登場,active,<pad> 用品が登場</s> <pad> <pad>,が格:今年の正月、東京都内の小学４—６年生がもらったお年玉は平均２万７１７１円で、前年比アッ...,fp
477,額,[円],ga,dep,する,active,<pad> 額がする</s> <pad> <pad>,が格:今年の正月、東京都内の小学４—６年生がもらったお年玉は平均２万７１７１円で、前年比アッ...,fp


In [None]:
# Keep only the prediction, decoded sentence and verdict of the 'df_inter' subset,
# prefixing the prediction and verdict columns with 'juman_'.
df_juman_inter = (
	juman_dfdict['df_inter']
	.rename(columns={"result":"juman_result","output_token":"juman_output_token"})
	.loc[:,['juman_output_token','output_sentence','juman_result']]
)


In [None]:
# The 'df_inter' subset (case_type == "inter") of the mecab_dfdict results, all columns kept.
df_mecab_inter = mecab_dfdict['df_inter']

In [None]:
# Both frames carry an 'output_sentence' column, and DataFrame.join raises
# "columns overlap but no suffix specified" unless one side is renamed first.
# NOTE(review): join aligns rows on the index -- confirm that both decoded files
# list the same instances in the same order.
df_merge = df_mecab_inter.join(
	df_juman_inter.rename(columns={'output_sentence':'juman_output_sentence'})
)

In [7]:
# Print precision / recall / F1 for every subset returned by make_dfdict.
for df_name,df in mecab_dfdict.items():
	print_scores(df_name,df)

DataFrame = df_dep
precision : 0.9366436781609195 (20372/21750)
recall : 0.7862904782122042 (20372/25909)
f1 : 0.8549067332508026

DataFrame = df_intra
precision : 0.7300638780135998 (3543/4853)
recall : 0.5746958637469587 (3543/6165)
f1 : 0.6431294245779634

DataFrame = df_inter
precision : 0.44265981368454865 (1378/3113)
recall : 0.33667236745663326 (1378/4093)
f1 : 0.38245906189286705

DataFrame = df_dep_passive
precision : 0.9090291921249152 (1339/1473)
recall : 0.7704257767548907 (1339/1738)
f1 : 0.8340080971659919

DataFrame = df_intra_passive
precision : 0.6176470588235294 (252/408)
recall : 0.4931506849315068 (252/511)
f1 : 0.5484221980413493

DataFrame = df_inter_passive
precision : 0.27899686520376177 (89/319)
recall : 0.21654501216545013 (89/411)
f1 : 0.2438356164383562

DataFrame = df_exo
precision : 0.6684474123539232 (2002/2995)
recall : 0.5232618923157345 (2002/3826)
f1 : 0.5870107022430728

DataFrame = df_zero
precision : 0.6316029559346775 (6923/10961)
recall : 0.49155