In [1]:
import json
import pandas as pd
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer used by make_dfdict to split predicted tokens into pieces.
# NOTE(review): from_pretrained presumably fetches the model files on first use — confirm.
bert_tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

In [3]:
def print_scores(df_name:str,df):
	"""Print precision, recall and F1 computed from the labels in ``df['result']``.

	Only the labels ``'tp'``, ``'fp'`` and ``'fn'`` are counted; any other
	label is ignored. A label that never occurs counts as zero, and a ratio
	whose denominator is zero is reported as ``0.0`` instead of raising.

	Args:
		df_name: Name printed in the report header.
		df: DataFrame with a ``result`` column.

	Returns:
		None. The report is written to standard output.
	"""
	value_count = df['result'].value_counts()
	# .get() avoids a KeyError when a label is absent from this subset.
	tp = int(value_count.get('tp', 0))
	fp = int(value_count.get('fp', 0))
	fn = int(value_count.get('fn', 0))

	predicted_total = tp + fp
	# NOTE(review): recall counts 'fp' rows in its denominator, i.e. every row
	# labelled tp/fp/fn is treated as having a gold argument — confirm this is
	# the intended definition.
	gold_total = tp + fn + fp

	precision = tp/predicted_total if predicted_total else 0.0
	recall = tp/gold_total if gold_total else 0.0

	score_sum = recall + precision
	f1 = 2*recall*precision/score_sum if score_sum else 0.0
	# The fraction printed next to recall is the denominator actually used above.
	print(f'DataFrame = {df_name}\nprecision : {precision} ({tp}/{predicted_total})\nrecall : {recall} ({tp}/{gold_total})\nf1 : {f1}\n')


In [7]:
def make_dfdict(decoded_path:str) ->dict:
	"""Label every decoded prediction and split the rows by case type.

	Reads the JSON Lines file at ``decoded_path`` into a DataFrame, adds a
	``result`` column, and returns several filtered views of that frame.

	Labelling rules, applied per row:
	  * empty ``output_token`` and ``case_type == 'null'`` -> ``'tn'``
	  * empty ``output_token`` with any other case type    -> ``'fn'``
	  * non-empty ``output_token`` whose last tokenizer piece is a substring
	    of at least one gold argument                      -> ``'tp'``
	  * any other non-empty ``output_token``               -> ``'fp'``

	Args:
		decoded_path: Path to a JSON Lines file whose records provide the
			``output_token``, ``gold_arguments``, ``case_type`` and
			``alt_type`` fields.

	Returns:
		dict mapping subset names (``'df_dep'``, ``'df_intra'``,
		``'df_inter'``, their ``'_passive'`` variants, ``'df_exo'`` and
		``'df_zero'``) to the matching rows of the labelled DataFrame.
	"""
	# Maps half-width characters U+0021..U+007E to their full-width forms
	# U+FF01..U+FF5E; built once instead of several times per row.
	to_fullwidth = str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})

	# The context manager closes the handle; the encoding is explicit so that
	# reading does not depend on the platform's locale default.
	with open(decoded_path,mode='r',encoding='utf-8') as f:
		df_psa = pd.read_json(f,orient='records',lines=True)

	results = []
	for output_token,gold_arguments,case_type in zip(df_psa['output_token'],df_psa['gold_arguments'],df_psa['case_type']):
		if output_token == '':
			# Nothing predicted: correct only when there was nothing to predict.
			results.append('tn' if case_type == 'null' else 'fn')
			continue

		output_fullwidth = output_token.translate(to_fullwidth)
		gold_fullwidth = [gold.translate(to_fullwidth) for gold in gold_arguments]
		# Only the last tokenizer piece of the prediction is compared. It is
		# converted to full-width again after tokenization.
		# NOTE(review): presumably the tokenizer can normalise characters back
		# to half-width — confirm.
		output_sub_char = bert_tokenizer.tokenize(output_fullwidth)[-1].replace('#','').translate(to_fullwidth)
		# Substring test against the whole gold argument strings.
		matched = any(output_sub_char in gold for gold in gold_fullwidth)
		results.append('tp' if matched else 'fp')

	df_psa['result'] = results

	df_dict = {
		'df_dep' : df_psa.query('case_type == "dep"'),
		'df_intra' : df_psa.query('case_type == "intra"'),
		'df_inter' : df_psa.query('case_type == "inter"'),
		'df_dep_passive' : df_psa.query('case_type == "dep" and alt_type == "passive"'),
		'df_intra_passive' : df_psa.query('case_type == "intra" and alt_type == "passive"'),
		'df_inter_passive' : df_psa.query('case_type == "inter" and alt_type == "passive"'),
		'df_exo' : df_psa.query('case_type == "exog" or case_type == "exo1" or case_type == "exo2"'),
		# Every row that is neither "null" nor "dep", so the exo* rows are included.
		'df_zero': df_psa.query('case_type != "null" and case_type != "dep"'),
	}

	return df_dict

In [None]:
# Label and split the predictions stored in decoded_psa_juman3.jsonl.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
juman_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_juman3.jsonl')

In [11]:
# Label and split the predictions stored in decoded_psa_closest_parse.jsonl.
# The commented-out line below is an alternative input file kept for reference.
# NOTE(review): absolute, machine-specific paths — consider a configurable data directory.
# mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_closest.jsonl')
mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_closest_parse.jsonl')

In [66]:
# Inspect the rows of the 'df_zero' subset that were labelled 'fp'.
mecab_dfdict['df_zero'].query('result == "fp"')

Unnamed: 0,output_token,gold_arguments,case_name,case_type,predicate,alt_type,output_sentence,input_tokens,result
9,鳥,[ユリカモメ],ga,inter,引っ掛けて,active,<pad> 鳥が糸を引っ掛けて</s> <pad> <pad>,東京・上野の不忍池で、無残な姿の鳥が目立つ。片足が切れたユリカモメ釣り糸を<extra_id...,fp
12,鳥,[釣り糸],ga,intra,取れ,active,<pad> 鳥が糸を取れ</s> <pad> <pad> <pad> <pad>,東京・上野の不忍池で、無残な姿の鳥が目立つ。片足が切れたユリカモメ釣り糸を引っ掛けて<ext...,fp
21,竹ぐし,[しが],ga,intra,突き刺さった,active,<pad> 竹ぐしが首を突き刺さった</s>,東京・上野の不忍池で、無残な姿の鳥が目立つ。片足が切れたユリカモメ釣り糸を引っ掛けて取れなく...,fp
36,くし,[ゴム],ga,intra,入って,active,<pad> くしが首に入って</s> <pad> <pad> <pad> <pad>,東京・上野の不忍池で、無残な姿の鳥が目立つ。片足が切れたユリカモメ釣り糸を引っ掛けて取れなく...,fp
105,法人,[省],ga,intra,多い,active,<pad> 法人が多い</s> <pad> <pad> <pad> <pad> <pad>,村山連立政権の最重要政策課題になっている特殊法人の整理・合理化で、通産省は日本貿易振興会とア...,fp
...,...,...,...,...,...,...,...,...,...
79038,さ,[官僚],ga,intra,して,active,<pad> さが国民をして</s> <pad> <pad>,法学部の一人の学生が書いた投稿が、元日付東京大学新聞に載る。投稿掲載は珍しくもないが、内容が...,fp
79044,社会,"[日本, 国, 国民, 人]",ga,inter,経て,active,<pad> 社会が年を経て</s> <pad> <pad>,法学部の一人の学生が書いた投稿が、元日付東京大学新聞に載る。投稿掲載は珍しくもないが、内容が...,fp
79083,コスト,[社会],ga,intra,よい,active,<pad> コストがよい</s> <pad> <pad> <pad>,法学部の一人の学生が書いた投稿が、元日付東京大学新聞に載る。投稿掲載は珍しくもないが、内容が...,fp
79086,義務,[<extra_id_99>],ga,exog,し,active,<pad> 義務が責任をし</s> <pad>,法学部の一人の学生が書いた投稿が、元日付東京大学新聞に載る。投稿掲載は珍しくもないが、内容が...,fp


In [None]:
# Take the 'df_inter' subset of the Juman run, prefix its result and
# output-token columns with 'juman_', and keep only the three columns listed.
df_juman_inter = (
	juman_dfdict['df_inter']
	.rename(columns={"result":"juman_result","output_token":"juman_output_token"})
	.loc[:,['juman_output_token','output_sentence','juman_result']]
)


In [None]:
# The 'df_inter' subset of the MeCab run, kept with all of its columns.
df_mecab_inter = mecab_dfdict['df_inter']

In [None]:
# Both frames carry an 'output_sentence' column, and DataFrame.join raises
# ValueError on overlapping column names unless a suffix is supplied. The
# Juman copy is renamed first, following the 'juman_' prefix already used for
# its other columns. Rows are aligned on the index.
df_merge = df_mecab_inter.join(
	df_juman_inter.rename(columns={'output_sentence':'juman_output_sentence'})
)

In [16]:
# Print precision / recall / F1 for every subset of the MeCab run.
for df_name,df in mecab_dfdict.items():
	print_scores(df_name,df)

DataFrame = df_dep
precision : 0.9464862923500539 (23718/25059)
recall : 0.9154347909992666 (23718/25909)
f1 : 0.9307016167006749

DataFrame = df_intra
precision : 0.7322658402203857 (4253/5808)
recall : 0.6898621248986212 (4253/6165)
f1 : 0.7104318048943455

DataFrame = df_inter
precision : 0.4992183428869203 (1916/3838)
recall : 0.46811629611531885 (1916/4093)
f1 : 0.48316731811877445

DataFrame = df_dep_passive
precision : 0.9224190592547342 (1510/1637)
recall : 0.8688147295742232 (1510/1738)
f1 : 0.8948148148148148

DataFrame = df_intra_passive
precision : 0.5914893617021276 (278/470)
recall : 0.5440313111545988 (278/511)
f1 : 0.5667686034658511

DataFrame = df_inter_passive
precision : 0.29222520107238603 (109/373)
recall : 0.26520681265206814 (109/411)
f1 : 0.2780612244897959

DataFrame = df_exo
precision : 0.7031208322219259 (2636/3749)
recall : 0.6889702038682697 (2636/3826)
f1 : 0.6959735973597361

DataFrame = df_zero
precision : 0.6573348264277715 (8805/13395)
recall : 0.6251