In [1]:
# Standard library
import os
from functools import reduce
from glob import glob

# Third-party
import pandas as pd

# Project-local
from reports import get_average_classification_report

# Show floats with two decimals in every DataFrame rendered by this notebook.
# (This option was previously assigned twice; once is enough.)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Root folder that holds one `result_*` directory per experiment.
# The default is the original location; set the COMBERT_ROOT_DIR environment
# variable to run the notebook against a copy stored elsewhere.
root_dir = os.environ.get('COMBERT_ROOT_DIR', '/media/dmlab/My Passport/DATA/ComBERT')
# Every training classification report found under any result directory.
filepaths = sorted(glob(os.path.join(root_dir, 'result_*', 'classification_report_train_*.csv')))
# Unique directories containing at least one such report, in sorted order.
dirnames = sorted({os.path.dirname(filepath) for filepath in filepaths})
dirnames

['/media/dmlab/My Passport/DATA/ComBERT/result_pt_bert-base-uncased_with_company_masking_first=True',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_bert-base-uncased_wo_company_masking_first=None',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_finbert-pretrain_with_company_masking_first=True',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_finbert-pretrain_wo_company_masking_first=None',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_finbert_with_company_masking_first=True',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_finbert_wo_company_masking_first=None',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_sec-bert-base_with_company_masking_first=True',
 '/media/dmlab/My Passport/DATA/ComBERT/result_pt_sec-bert-base_wo_company_masking_first=None']

# Result
* `yiyanghkust/finbert-pretrain`이 가장 좋은 성능을 보임
> finbert-pretrain의 사전 학습 데이터: Corporate Reports 10-K & 10-Q: 2.5B tokens, Earnings Call Transcripts: 1.3B tokens, Analyst Reports: 1.1B tokens

* `Company name masking`을 적용할 때 성능이 향상됨

In [5]:
def get_df(filepaths, colname):
    """Collect the accuracy (in percent) of each report, keyed by training size.

    For every path, the text after the last ``_`` of the file name (with
    ``.csv`` removed) is converted to ``int`` and stored as ``trained_with``.
    The accuracy is read from the ``f1-score`` column of the row labelled
    ``accuracy`` and multiplied by 100.

    Args:
        filepaths: iterable of classification-report CSV paths.
        colname: name of the accuracy column in the returned frame.

    Returns:
        DataFrame with columns ``['trained_with', colname]``, sorted
        ascending by ``trained_with`` (the original row labels are kept).
    """
    def _read_one(path):
        # Trailing token of the file name, e.g. "..._train_200.csv" -> "200".
        size_token = os.path.basename(path).rsplit('_', 1)[-1].replace('.csv', '')
        report = pd.read_csv(path).set_index('Unnamed: 0')
        accuracy_row = report.filter(items=['accuracy'], axis=0)
        accuracy_pct = 100 * accuracy_row['f1-score'].values[0]
        return int(size_token), accuracy_pct

    rows = [_read_one(path) for path in filepaths]
    table = pd.DataFrame(rows, columns=['trained_with', colname])
    return table.sort_values(by=['trained_with'])

# Smallest training-set size (parsed from the report file name) to include.
MIN_TRAIN_SIZE = 200


def _train_size(report_path):
    """Return the integer after the last `_` of the file name, `.csv` removed."""
    return int(os.path.basename(report_path).split('_')[-1].replace('.csv', ''))


# Build one accuracy table per experiment directory; the accuracy column is
# named after the directory with the 'result_' prefix stripped.
dfs = []
for dirname in dirnames:
    # NOTE(review): this rebinds the `filepaths` name defined in the earlier cell.
    filepaths = glob(os.path.join(dirname, 'classification_report_*.csv'))
    filepaths = [item for item in filepaths if _train_size(item) >= MIN_TRAIN_SIZE]
    name = os.path.basename(dirname.replace('result_', ''))
    df = get_df(filepaths, name)
    dfs.append(df)

# Without any result directory (e.g. the data drive is not mounted) reduce()
# would fail with an opaque "empty iterable" TypeError; say what is wrong.
if not dfs:
    raise FileNotFoundError(
        'No result_* directories with classification reports found under {}'.format(root_dir)
    )

# Inner join on the training size: a size that is missing from any experiment
# is dropped from the combined table.
result = reduce(lambda df1, df2: pd.merge(df1, df2, on='trained_with'), dfs)
# Non-inplace so that dfs[0] is never mutated when only one experiment exists.
result = result.set_index('trained_with')
# Column-wise mean over all training sizes, appended as an extra row.
result.loc['Average'] = result.mean()

filepath = os.path.join(root_dir, 'classification_report.csv')
result.to_csv(filepath)
print('Created {}'.format(filepath))

result

Created /media/dmlab/My Passport/DATA/ComBERT/classification_report.csv


Unnamed: 0_level_0,pt_bert-base-uncased_with_company_masking_first=True,pt_bert-base-uncased_wo_company_masking_first=None,pt_finbert-pretrain_with_company_masking_first=True,pt_finbert-pretrain_wo_company_masking_first=None,pt_finbert_with_company_masking_first=True,pt_finbert_wo_company_masking_first=None,pt_sec-bert-base_with_company_masking_first=True,pt_sec-bert-base_wo_company_masking_first=None
trained_with,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
200,59.57,49.0,64.06,52.94,59.25,53.83,64.52,59.6
300,56.21,49.51,63.59,53.54,56.74,59.11,59.75,59.15
400,67.98,50.74,72.7,58.82,60.21,60.96,67.41,62.61
500,72.54,71.24,77.39,73.45,75.84,70.23,73.06,69.5
600,76.39,62.08,76.13,72.67,79.77,61.99,76.24,69.02
700,75.79,70.64,78.35,74.44,79.63,74.94,73.75,71.96
800,70.76,74.69,79.02,76.29,80.42,68.07,76.87,73.04
900,78.54,79.83,80.29,79.17,80.43,77.94,78.57,73.14
1000,77.32,80.26,80.63,78.74,80.47,79.77,80.37,74.29
2000,82.35,80.93,82.46,80.7,81.84,80.69,80.17,81.04


* `Company name masking`의 성능 향상 효과는 finetuning 데이터가 적을 때 특히 두드러짐

In [6]:
# Keep only the rows for training sizes 200, 300, ..., 900.
low_sizes = list(range(200, 1000, 100))
low = result.filter(items=low_sizes, axis=0)
# Append an 'Average' row computed over the selected sizes only.
low.loc['Average'] = low.mean()

# Save the low-data table next to the full report.
filepath = os.path.join(root_dir, 'classification_report_Low.csv')
low.to_csv(filepath)
print('Created {}'.format(filepath))

low

Created /media/dmlab/My Passport/DATA/ComBERT/classification_report_Low.csv


Unnamed: 0_level_0,pt_bert-base-uncased_with_company_masking_first=True,pt_bert-base-uncased_wo_company_masking_first=None,pt_finbert-pretrain_with_company_masking_first=True,pt_finbert-pretrain_wo_company_masking_first=None,pt_finbert_with_company_masking_first=True,pt_finbert_wo_company_masking_first=None,pt_sec-bert-base_with_company_masking_first=True,pt_sec-bert-base_wo_company_masking_first=None
trained_with,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
200,59.57,49.0,64.06,52.94,59.25,53.83,64.52,59.6
300,56.21,49.51,63.59,53.54,56.74,59.11,59.75,59.15
400,67.98,50.74,72.7,58.82,60.21,60.96,67.41,62.61
500,72.54,71.24,77.39,73.45,75.84,70.23,73.06,69.5
600,76.39,62.08,76.13,72.67,79.77,61.99,76.24,69.02
700,75.79,70.64,78.35,74.44,79.63,74.94,73.75,71.96
800,70.76,74.69,79.02,76.29,80.42,68.07,76.87,73.04
900,78.54,79.83,80.29,79.17,80.43,77.94,78.57,73.14
Average,69.72,63.47,73.94,67.67,71.54,65.88,71.27,67.25
