In [26]:
import json
import pandas as pd
from transformers import AutoTokenizer

In [27]:
bert_tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')

In [28]:
def print_scores(df_name:str,df):
	value_count = df['result'].value_counts()
	tp = value_count['tp']
	fp = value_count['fp']
	fn = value_count['fn']
	
	precision = tp/(tp + fp)
	recall = tp/(tp + fn + fp)
	
	f1 = 2*recall*precision/(recall + precision)
	print(f'DataFrame = {df_name}\nprecision : {precision} ({tp}/{tp + fp})\nrecall : {recall} ({tp}/{len(df)})\nf1 : {f1}\n')


In [76]:
def make_dfdict(decoded_path:str) ->dict:
	f = open(decoded_path,mode='r')
	df_psa = pd.read_json(f,orient='records',lines=True)
	df_psa['result'] = 'tn'
	for index,row in df_psa.iterrows():
		if(row['output_token'] == ''):
			if(row['case_type'] == 'null'):
				continue
			else:
				df_psa.at[index,'result'] = 'fn'
		else:
			output_token = row['output_token'].translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) #半角文字を全角文字に変換
			gold_arguments = [gold.translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})) for gold in row['gold_arguments']]
			# subword_gold_argument = [bert_tokenizer.tokenize(argument)[-1] for argument in gold_arguments]
			# if(bert_tokenizer.tokenize(output_token)[-1] in subword_gold_argument):
			# if(output_token in gold_arguments):
			# subword_gold_argument = [bert_tokenizer.tokenize(argument)[-1].replace('#','') for argument in gold_arguments]
			output_sub_char = bert_tokenizer.tokenize(output_token)[-1].replace('#','').translate(str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
			in_flag = False
			for arg_sub_char in gold_arguments:
				if (output_sub_char in arg_sub_char):
					in_flag = True
			if(in_flag):
				df_psa.at[index,'result'] = 'tp'
			else:
				df_psa.at[index,'result'] = 'fp'
	
	df_dict = {
		'df_dep' : df_psa.query('case_type == "dep"'),
		'df_intra' : df_psa.query('case_type == "intra"'),
		'df_inter' : df_psa.query('case_type == "inter"'),
		'df_dep_passive' : df_psa.query('case_type == "dep" and alt_type == "passive"'),
		'df_intra_passive' : df_psa.query('case_type == "intra" and alt_type == "passive"'),
		'df_exo' : df_psa.query('case_type == "exog" or case_type == "exo1" or case_type == "exo2"'),
		'df_all': df_psa
	}
	
	return df_dict

In [None]:
juman_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_juman3.jsonl')

In [77]:
mecab_dfdict = make_dfdict('/home/sibava/PAS-T5/decoded/decoded_psa_closest.jsonl')

In [78]:
mecab_dfdict['df_dep'].query('result =="fp"')[50:100]

Unnamed: 0,output_token,gold_arguments,case_name,case_type,predicate,alt_type,input_tokens,result
3078,者,[発言],ga,dep,認めた,active,トウ小平氏の三女蕭榕さんが父親の健康状態の悪化を認めたことで、十三日の香港株式市場は株価が急...,fp
3087,氏,[状態],ga,dep,かかわる,active,トウ小平氏の三女蕭榕さんが父親の健康状態の悪化を認めたことで、十三日の香港株式市場は株価が急...,fp
3131,撮影,[集],ni,dep,登場して,active,十三日付の北京日報は、江沢民国家主席が「万衆一心」と書いた書を一面トップで扱った。また中国主...,fp
3142,書,[集],o,dep,売ら,passive,十三日付の北京日報は、江沢民国家主席が「万衆一心」と書いた書を一面トップで扱った。また中国主...,fp
3267,代表,[人材],ga,dep,退か,active,韓国野党・民主党の李基沢代表は十三日、済州島で韓国記者団と懇談し、「どんなにすばらしい人材で...,fp
3305,側近,[ら],ni,dep,漏らした,active,韓国野党・民主党の李基沢代表は十三日、済州島で韓国記者団と懇談し、「どんなにすばらしい人材で...,fp
3334,聴取,[リスナー],o,dep,奪わ,passive,米連邦通信委員会は十二日、衛星を利用して全米どこでも聞くことのできるラジオ放送にゴーサインを...,fp
3345,会,[ラジオ],ga,dep,提供,active,米連邦通信委員会は十二日、衛星を利用して全米どこでも聞くことのできるラジオ放送にゴーサインを...,fp
3389,地区,[部],ni,dep,ある,active,チェチェン共和国の首都グロズヌイでは十三日も朝からロシア軍とドゥダエフ政権部隊の間で激戦が行...,fp
3420,党,[議会],ga,dep,強めた,active,保守色を<extra_id_0>強めた<extra_id_1>共和党主導の米新議会は四日開幕...,fp


In [None]:
df_juman_inter = juman_dfdict['df_inter']
df_juman_inter = df_juman_inter.rename(columns={"result":"juman_result","output_token":"juman_output_token"})
df_juman_inter = df_juman_inter.loc[:,['juman_output_token','output_sentence','juman_result']]


NameError: name 'juman_dfdict' is not defined

In [None]:
df_mecab_inter = mecab_dfdict['df_inter']

In [None]:
df_merge = df_mecab_inter.join(df_juman_inter)

In [79]:
for df_name,df in mecab_dfdict.items():
	print_scores(df_name,df)

DataFrame = df_dep
precision : 0.9415592144415791 (23732/25205)
recall : 0.9159751437724343 (23732/25909)
f1 : 0.9285909926830223

DataFrame = df_intra
precision : 0.6853218884120171 (3992/5825)
recall : 0.6475263584752636 (3992/6165)
f1 : 0.6658882402001669

DataFrame = df_inter
precision : 0.40433873497124934 (1547/3826)
recall : 0.37796237478622036 (1547/4093)
f1 : 0.39070589720924354

DataFrame = df_dep_passive
precision : 0.905952380952381 (1522/1680)
recall : 0.8757192174913694 (1522/1738)
f1 : 0.890579286132241

DataFrame = df_intra_passive
precision : 0.5443037974683544 (258/474)
recall : 0.5048923679060665 (258/511)
f1 : 0.5238578680203045

DataFrame = df_exo
precision : 0.7528781793842034 (2812/3735)
recall : 0.7349712493465761 (2812/3826)
f1 : 0.743816955429176

DataFrame = df_all
precision : 0.7882413640607341 (32083/40702)
recall : 0.7619941098232947 (32083/79101)
f1 : 0.7748955389706061

