In [15]:
import pandas as pd
import os
from glob import glob
from reports import get_average_classification_report
# Render every float shown in DataFrame output with exactly two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

In [18]:
# Root directory holding the cross-domain fine-tuning outputs. Defaults to the
# original external-drive location; set the CROSS_DOMAIN_DATA_ROOT environment
# variable to point the notebook at a copy of the data somewhere else.
DATA_ROOT = os.environ.get(
    'CROSS_DOMAIN_DATA_ROOT', '/media/dmlab/My Passport/DATA/cross-domain'
).rstrip('/')

# Glob pattern, relative to a results root, of the per-run report files.
# The doubled braces survive `.format(post=...)` as a literal `{}` placeholder,
# which is filled with the fold index when the pattern is expanded per fold.
_REPORT_GLOB = 'finetune_{{}}/source=*_post={post}_target=*/classification_report.csv'

# One pattern per post-training setting (the value of `post=` in the run name).
no_pt_filepaths_format = DATA_ROOT + '/' + _REPORT_GLOB.format(post='None')
pt_st_filepaths_format = DATA_ROOT + '/' + _REPORT_GLOB.format(post='ST')
pt_t_filepaths_format = DATA_ROOT + '/' + _REPORT_GLOB.format(post='T')
# NOTE(review): unlike the three patterns above, the SimST runs are read from
# the `prev_LSimST` subdirectory -- confirm this is the intended set of runs.
pt_simst_filepaths_format = DATA_ROOT + '/prev_LSimST/' + _REPORT_GLOB.format(post='SimST')

# Number of folds; fold indices 0 .. kfold_num - 1 are substituted into `finetune_{}`.
kfold_num = 5

In [3]:
def show_result(filepaths_format, kfold_num=1):
    """Average the accuracy of every source->target pair over k folds.

    Parameters
    ----------
    filepaths_format : str
        Glob pattern with a single ``{}`` placeholder that is replaced by the
        fold index (``0 .. kfold_num - 1``). Each matching file is read as a
        CSV whose first column is the row label and which has an ``accuracy``
        row and an ``f1-score`` column. The file's parent directory must be
        named ``source=<src>_post=<post>_target=<tgt>``.
    kfold_num : int, default 1
        Number of folds to read.

    Returns
    -------
    The object returned by ``get_average_classification_report`` for the list
    of per-fold frames (presumably a DataFrame indexed by pair -- confirm
    against that helper), extended with an ``'Average'`` row holding the
    column-wise mean.

    Raises
    ------
    FileNotFoundError
        If a fold has no report file matching the pattern.
    ValueError
        If a run directory name does not split into exactly three
        ``_``-separated parts.
    """
    kfold_reports = []
    for fold in range(kfold_num):
        pattern = filepaths_format.format(fold)
        # glob() yields matches in arbitrary order; sorting keeps the pairs in
        # the same, reproducible order in every fold.
        filepaths = sorted(glob(pattern))
        if not filepaths:
            # Fail loudly instead of silently averaging over an empty fold
            # (e.g. when the data directory is missing or not mounted).
            raise FileNotFoundError(
                'No classification reports match {!r}'.format(pattern)
            )

        records = []
        for filepath in filepaths:
            # The run directory name encodes the experiment:
            # source=<src>_post=<post>_target=<tgt>
            run_name = os.path.basename(os.path.dirname(filepath))
            source, _post, target = run_name.split('_')
            source = source.replace('source=', '')
            target = target.replace('target=', '')

            report = pd.read_csv(filepath, index_col=0)
            # The value in the 'f1-score' column of the 'accuracy' row is used
            # as the accuracy, scaled to percent.
            acc = 100 * report.loc['accuracy', 'f1-score']

            records.append(('{}->{}'.format(source, target), float(acc)))

        # NOTE(review): the '{}' in this column label is never formatted, so
        # every fold uses the literal name '{}_accuracy'. Kept unchanged
        # because get_average_classification_report may rely on it.
        fold_df = pd.DataFrame(records, columns=['source->target', '{}_accuracy'])
        kfold_reports.append(fold_df)

    result_df = get_average_classification_report(kfold_reports)
    result_df.loc['Average'] = result_df.mean()
    return result_df

* Post-training baselines
    1. No post-training
    2. Source+Target MLM 
        - (# of source texts = 4,000 & # of target texts = 4,000)
    3. Target MLM 
        - (# of target texts = 4,000)
* Proposed 
    * (using labeled target) SimSource+Target MLM 
        - (# of source texts = 600~800 & # of target texts = 4,000)
        ```
        Source=books, Target=dvd, Number of similar texts=4816
        Source=books, Target=electronics, Number of similar texts=4684
        Source=books, Target=kitchen, Number of similar texts=4654
        Source=dvd, Target=books, Number of similar texts=4884
        Source=dvd, Target=electronics, Number of similar texts=4745
        Source=dvd, Target=kitchen, Number of similar texts=4716
        Source=electronics, Target=books, Number of similar texts=4744
        Source=electronics, Target=dvd, Number of similar texts=4769
        Source=electronics, Target=kitchen, Number of similar texts=4895
        Source=kitchen, Target=books, Number of similar texts=4742
        Source=kitchen, Target=dvd, Number of similar texts=4741
        Source=kitchen, Target=electronics, Number of similar texts=4873
        ```
        

* Fine-tuning: raw source (train), raw target (test)
* 5-fold

In [20]:
# Column label -> glob pattern, in the left-to-right order of the final table.
labelled_patterns = [
    ('No post-training', no_pt_filepaths_format),
    ('Source+Target MLM', pt_st_filepaths_format),
    ('Target MLM', pt_t_filepaths_format),
    ('SimSource+Target MLM', pt_simst_filepaths_format),
]

# Build one single-column summary per setting and name the column after it.
summaries = []
for label, pattern in labelled_patterns:
    summary = show_result(pattern, kfold_num=kfold_num)
    summary.columns = [label]
    summaries.append(summary)

# Keep the individual per-setting frames available under their original names.
no_pt, pt_st, pt_t, pt_simst = summaries

# Place the settings side by side, aligned on the shared row index.
result = pd.concat(summaries, axis=1)
result

Unnamed: 0,No post-training,Source+Target MLM,Target MLM,SimSource+Target MLM
books->dvd,88.5,86.35,90.25,86.2
books->electronics,82.8,86.22,87.1,88.62
books->kitchen,86.65,90.15,88.85,90.9
dvd->books,85.05,90.85,89.9,90.85
dvd->electronics,87.85,90.85,90.45,90.7
dvd->kitchen,87.7,90.5,91.5,91.45
electronics->books,86.25,89.55,89.95,90.1
electronics->dvd,86.1,89.45,88.9,89.3
electronics->kitchen,90.2,92.7,92.8,92.85
kitchen->books,84.65,87.2,90.2,88.95
