In [9]:
import pandas as pd
import os
from glob import glob
from reports import get_average_classification_report
pd.options.display.float_format = '{:.2f}'.format
from functools import reduce
pd.options.display.float_format = '{:.2f}'.format

In [10]:
# Root folder that holds one 'result_*' directory per evaluated model.
root_dir = '/media/dmlab/My Passport/DATA/CABERT'

# All training-set classification reports found one level below root_dir.
filepaths = sorted(
    glob(os.path.join(root_dir, 'result_*', 'classification_report_train_*.csv'))
)

# Unique directories containing at least one such report, in sorted order.
dirnames = sorted(set(map(os.path.dirname, filepaths)))
dirnames

['/media/dmlab/My Passport/DATA/CABERT/result_ProsusAI_finbert',
 '/media/dmlab/My Passport/DATA/CABERT/result_bert-base-uncased',
 '/media/dmlab/My Passport/DATA/CABERT/result_ernie-2.0-en',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_ernie-2.0-en_with_company_masking_first=False_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_ernie-2.0-en_with_company_masking_first=True_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_ernie-2.0-en_wo_company_masking_first=None_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_finbert_with_company_masking_first=False_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_finbert_with_company_masking_first=True_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_item1_business_pt_finbert_wo_company_masking_first=None_item1_business',
 '/media/dmlab/My Passport/DATA/CABERT/result_with_company_masking_fi

### Knowledge-enhanced model은 데이터가 적을 때에도 비교적 좋은 성능을 냄
* BERT: general domain에서 습득한 언어지식을 이용하여 spam 분류
* ERNIE: entity-specific 지식을 추가습득하여 spam 분류

### Pilot 결과 해석
* post-train 데이터셋 사이즈 작음
    - (T5) 적은 데이터를 반복해서 pre-train하면 overfitting되어 성능이 나빠짐 => dup=1로 설정
* ERNIE는 매우 많은 데이터로 post-train한 모델이고, CABERT는 소량의 데이터로 post-train했는데도 비등한 결과가 나옴.
* trained_with (400, 1000)에서 BERT는 일단 이겼음

### TO-DO: ERNIE + post-training해서 결과 어떻게 나오는지 확인해보자

In [16]:
def get_df(filepaths, colname):
    """Collect the accuracy from each report CSV into one sorted table.

    For every path in ``filepaths`` the CSV is loaded, its ``'Unnamed: 0'``
    column becomes the index, and the ``'f1-score'`` value of the
    ``'accuracy'`` row is scaled by 100. The integer between the last
    underscore of the file name and the ``.csv`` suffix is stored next to it
    (presumably the number of training examples -- confirm against the code
    that writes these reports).

    Args:
        filepaths: Iterable of report CSV paths whose file names end in
            ``_<integer>.csv``.
        colname: Name given to the accuracy column of the returned frame.

    Returns:
        A DataFrame with columns ``['trained_with', colname]`` sorted by
        ``'trained_with'``; the row index still reflects the input order.
    """

    def _read_report(report_filepath):
        # The trailing token of the file name carries the integer tag.
        filename = os.path.basename(report_filepath)
        size_token = filename.split('_')[-1].replace('.csv', '')

        report = pd.read_csv(report_filepath).set_index('Unnamed: 0')
        accuracy_row = report.filter(items=['accuracy'], axis=0)
        accuracy_pct = 100 * accuracy_row['f1-score'].values[0]
        return int(size_token), accuracy_pct

    rows = [_read_report(path) for path in filepaths]
    table = pd.DataFrame(rows, columns=['trained_with', colname])
    return table.sort_values(by=['trained_with'])

# Build one accuracy table per result directory, then join them side by side.
dfs = []
for dirname in dirnames:
    # Keep only reports whose numeric file-name suffix is at least 300.
    # NOTE: an upper bound (< 40000) on the same suffix was tried earlier and
    # is currently disabled.
    filepaths = [
        item
        for item in glob(os.path.join(dirname, 'classification_report_*.csv'))
        if int(item.split('_')[-1].replace('.csv', '')) >= 300
    ]
    # Column label: the directory name with 'result_' removed.
    name = os.path.basename(dirname.replace('result_', ''))
    df = get_df(filepaths, name)
    dfs.append(df)

# Merge on 'trained_with' with the default join, so only values present in
# every table are kept.
result = reduce(lambda left, right: left.merge(right, on='trained_with'), dfs)
result.set_index('trained_with', inplace=True)

# Append a summary row holding each column's mean.
result.loc['Average'] = result.mean()

# Persist the combined table next to the result directories.
filepath = os.path.join(root_dir, 'classification_report.csv')
result.to_csv(filepath)
print('Created {}'.format(filepath))

result

Created /media/dmlab/My Passport/DATA/CABERT/classification_report.csv


Unnamed: 0_level_0,ProsusAI_finbert,bert-base-uncased,ernie-2.0-en,item1_business_pt_ernie-2.0-en_with_company_masking_first=False_item1_business,item1_business_pt_ernie-2.0-en_with_company_masking_first=True_item1_business,item1_business_pt_ernie-2.0-en_wo_company_masking_first=None_item1_business,item1_business_pt_finbert_with_company_masking_first=False_item1_business,item1_business_pt_finbert_with_company_masking_first=True_item1_business,item1_business_pt_finbert_wo_company_masking_first=None_item1_business,with_company_masking_first=False_item1_business,with_company_masking_first=True_item1_business,wo_company_masking_first=None_item1_business
trained_with,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
300,33.12,56.48,54.44,54.59,55.04,49.47,50.02,62.93,62.19,53.24,52.51,49.97
400,41.72,65.37,60.28,52.1,58.09,51.26,50.0,62.89,62.89,58.69,62.38,51.86
500,49.92,66.11,70.97,66.37,69.58,70.93,50.32,67.48,65.06,69.88,73.35,76.35
600,52.43,63.57,74.23,61.77,68.91,69.35,53.12,73.35,68.25,60.73,71.57,69.62
700,63.86,67.59,73.26,65.94,71.74,73.84,62.87,75.89,63.21,73.02,75.41,72.67
800,63.76,71.81,75.75,66.05,74.56,75.76,69.64,76.74,71.88,69.59,73.89,72.25
900,70.45,70.44,79.26,67.08,76.38,78.5,67.66,79.12,66.65,74.89,73.5,77.84
1000,72.84,75.21,79.41,73.75,77.78,76.67,74.69,80.19,75.53,77.8,72.81,72.11
2000,77.95,80.68,80.81,80.26,81.22,81.49,80.71,81.46,80.22,80.37,80.16,80.45
10000,84.91,85.09,85.61,85.15,85.08,85.46,84.1,84.74,84.36,84.78,84.63,84.96


In [15]:
# Low-resource view: keep only the rows labelled 300, 400, ..., 800.
low = result.filter(items=list(range(300, 900, 100)), axis=0)

# Summary row: column-wise mean over the selected rows only.
low.loc['Average'] = low.mean()

# Save the subset alongside the full report.
filepath = os.path.join(root_dir, 'classification_report_Low.csv')
low.to_csv(filepath)
print('Created {}'.format(filepath))

low

Created /media/dmlab/My Passport/DATA/CABERT/classification_report_Low.csv


Unnamed: 0_level_0,ProsusAI_finbert,bert-base-uncased,ernie-2.0-en,item1_business_pt_ernie-2.0-en_with_company_masking_first=False_item1_business,item1_business_pt_ernie-2.0-en_with_company_masking_first=True_item1_business,item1_business_pt_ernie-2.0-en_wo_company_masking_first=None_item1_business,item1_business_pt_finbert_with_company_masking_first=False_item1_business,item1_business_pt_finbert_with_company_masking_first=True_item1_business,item1_business_pt_finbert_wo_company_masking_first=None_item1_business,with_company_masking_first=False_item1_business,with_company_masking_first=True_item1_business,wo_company_masking_first=None_item1_business
trained_with,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
300,33.12,56.48,54.44,54.59,55.04,49.47,50.02,62.93,62.19,53.24,52.51,49.97
400,41.72,65.37,60.28,52.1,58.09,51.26,50.0,62.89,62.89,58.69,62.38,51.86
500,49.92,66.11,70.97,66.37,69.58,70.93,50.32,67.48,65.06,69.88,73.35,76.35
600,52.43,63.57,74.23,61.77,68.91,69.35,53.12,73.35,68.25,60.73,71.57,69.62
700,63.86,67.59,73.26,65.94,71.74,73.84,62.87,75.89,63.21,73.02,75.41,72.67
800,63.76,71.81,75.75,66.05,74.56,75.76,69.64,76.74,71.88,69.59,73.89,72.25
Average,50.8,65.15,68.15,61.14,66.32,65.1,56.0,69.88,65.58,64.19,68.19,65.45
