In [60]:
import glob, os
import pandas as pd
pd.options.display.float_format = '{:.4f}'.format
from functools import reduce
from reports import create_confusion_matrix, create_classification_report, average_classification_report, sum_confusion_matrix

In [61]:
# Root folder that holds one sub-folder per evaluated dataset.
DATA_DIR = './results'

# Shared directory part of the glob templates below: any sub-folder of a
# matching FinancialPhrase* dataset folder. The first '{}' takes the dataset
# tag, the second '{}' (in the file names) takes the model name.
_FPB_RUN_DIRS = os.path.join(DATA_DIR, 'FinancialPhrase*_DS{}*', '*')

FPB_REPORT_FILEPATHS = os.path.join(_FPB_RUN_DIRS, 'classification_report_{}.csv')
FPB_CONF_FILEPATHS = os.path.join(_FPB_RUN_DIRS, 'confusion_matrix_{}.csv')
FPB_PREDICTIONS_FILEPATHS = os.path.join(_FPB_RUN_DIRS, 'predictions_{}.csv')

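Aggregate the per-fold results of each k-fold run: average the classification reports, sum the confusion matrices, and concatenate the predictions.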

In [64]:
def concat_dfs(df_filepaths, out_filepath):
    """Concatenate several CSV files row-wise and save them as a single CSV.

    Parameters
    ----------
    df_filepaths : iterable of str
        Paths of the CSV files to read. The first row of each file is used
        as its header.
    out_filepath : str
        Path of the combined CSV, written without the index column.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame, with rows in the order of ``df_filepaths``.
        The per-file row indices are kept, so index labels may repeat.

    Raises
    ------
    ValueError
        If ``df_filepaths`` is empty.
    """
    # Materialise once so generators can be checked for emptiness and then read.
    df_filepaths = list(df_filepaths)
    if not df_filepaths:
        # pd.concat would raise a ValueError as well, but without saying which
        # output could not be built.
        raise ValueError('No input files to concatenate for {}'.format(out_filepath))

    concat_df = pd.concat([pd.read_csv(df_filepath, header=0) for df_filepath in df_filepaths])
    concat_df.to_csv(out_filepath, index=False)
    print('Created {}'.format(out_filepath))
    return concat_df
    
# Dataset tags and model names; every (ds, name) pair is substituted into the
# FPB_* glob templates to locate the files to aggregate.
ds_list = ['50', '66', '75', 'All']
names = ['word2vec-google-news-300', 'Sen140', 'SO-CAL', 'MPQA', 'AFINN', 'SentiStrength', 'bert-base-uncased', 'roberta-base']

for ds in ds_list:
    for name in names:
        # glob returns matches in arbitrary order; sort so the row order of the
        # concatenated predictions and the choice of out_dir are reproducible.
        report_pattern = FPB_REPORT_FILEPATHS.format(ds, name)
        report_filepaths = sorted(glob.glob(report_pattern))
        if not report_filepaths:
            # Name the offending pattern instead of failing with a bare
            # IndexError on report_filepaths[0] below.
            raise FileNotFoundError('No classification reports match {}'.format(report_pattern))

        # Matched files sit two levels below the dataset folder's parent
        # (<dataset folder>/<sub-folder>/<file>); outputs go to <dataset folder>.
        out_dir = os.path.dirname(os.path.dirname(report_filepaths[0]))
        average_classification_report(report_filepaths, os.path.join(out_dir, 'classification_report_{}.csv'.format(name)))

        conf_filepaths = sorted(glob.glob(FPB_CONF_FILEPATHS.format(ds, name)))
        sum_confusion_matrix(conf_filepaths, os.path.join(out_dir, 'confusion_matrix_{}.csv'.format(name)))

        df_filepaths = sorted(glob.glob(FPB_PREDICTIONS_FILEPATHS.format(ds, name)))
        concat_dfs(df_filepaths, os.path.join(out_dir, 'predictions_{}.csv'.format(name)))

Created ./results/FinancialPhraseBank_DS50_k-folds/classification_report_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/confusion_matrix_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/predictions_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/classification_report_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/confusion_matrix_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/predictions_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/classification_report_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/confusion_matrix_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/predictions_word2vec-google-news-300.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/classification_report_word2vec-google-news-300.csv
Created ./results/Financ

# Results

In [85]:
# Build one table with the weighted-average precision / recall / F1-score of
# every model on every dataset folder directly under DATA_DIR:
# one row per "<dataset>_<measure>", one column per model.
report_prefix = 'classification_report_'
report_suffix = '.csv'
all_report_paths = glob.glob(os.path.join(DATA_DIR, '*', report_prefix + '*' + report_suffix))

# Slice the known prefix/suffix off the file name (so model names containing
# '_' are kept whole) and sort, because set iteration order is not stable
# between runs and would otherwise shuffle the columns of the saved CSV.
model_names = sorted({os.path.basename(path)[len(report_prefix):-len(report_suffix)] for path in all_report_paths})

dfs = []
for model_name in model_names:
    filepaths = sorted(glob.glob(os.path.join(DATA_DIR, '*', 'classification_report_{}.csv'.format(model_name))))

    records = []
    for filepath in filepaths:
        # Second-to-last '_'-separated token of the folder name,
        # e.g. 'FinancialPhraseBank_DS50_k-folds' -> 'DS50'.
        dataset_mode = os.path.basename(os.path.dirname(filepath)).split('_')[-2]

        result = pd.read_csv(filepath).set_index('Unnamed: 0')
        for measure, column in (('Precision', 'precision'), ('Recall', 'recall'), ('F1-score', 'f1-score')):
            # Single .loc[row, column] lookup instead of chained indexing.
            records.append(('_'.join([dataset_mode, measure]), result.loc['weighted avg', column]))
    dfs.append(pd.DataFrame(records, columns=['Dataset_Measure', model_name]))

if not dfs:
    # reduce() on an empty list would fail with an unrelated TypeError.
    raise FileNotFoundError('No classification_report_*.csv files found under {}'.format(os.path.join(DATA_DIR, '*')))

# pd.merge defaults to an inner join: a Dataset_Measure row missing for any
# one model is dropped from the final table.
final_result = reduce(lambda left, right: pd.merge(left, right, on='Dataset_Measure'), dfs)
final_result.to_csv(os.path.join(DATA_DIR, 'FPB_SemEval_results.csv'), index=False)
final_result

Unnamed: 0,Dataset_Measure,swn,roberta-base,bert-base-uncased,senti-dd,word2vec-google-news-300,textblob,SO-CAL,SentiStrength,MPQA,Sen140,vader,lm,AFINN
0,DS50_Precision,0.4778,0.8578,0.8487,0.709,0.7392,0.5155,0.5161,0.5641,0.5143,0.5345,0.6028,0.6147,0.6332
1,DS50_Recall,0.3969,0.8523,0.8474,0.7055,0.7466,0.4852,0.4546,0.5713,0.4567,0.2196,0.5396,0.6232,0.5897
2,DS50_F1-score,0.4107,0.8523,0.8457,0.7001,0.7328,0.4953,0.4575,0.5644,0.463,0.154,0.5452,0.5914,0.596
3,DS66_Precision,0.4851,0.8948,0.8827,0.7389,0.765,0.5275,0.5248,0.5794,0.5215,0.5514,0.6194,0.6337,0.6504
4,DS66_Recall,0.4044,0.8826,0.875,0.7315,0.7741,0.4968,0.4588,0.5849,0.4631,0.2112,0.5534,0.6363,0.6054
5,DS66_F1-score,0.4194,0.8844,0.8757,0.7271,0.7622,0.507,0.4633,0.5795,0.4708,0.1462,0.5599,0.6023,0.613
6,DS75_Precision,0.4916,0.9366,0.9222,0.7796,0.8017,0.5426,0.5444,0.6063,0.5284,0.55,0.6409,0.6507,0.6713
7,DS75_Recall,0.4009,0.9341,0.9188,0.7702,0.8103,0.5039,0.4616,0.6069,0.4601,0.1984,0.559,0.6556,0.6159
8,DS75_F1-score,0.4199,0.9344,0.9185,0.7673,0.7986,0.5169,0.4725,0.6049,0.4726,0.1344,0.5702,0.6174,0.6265
9,DSAll_Precision,0.4624,0.9642,0.8733,0.8238,0.8125,0.5476,0.5431,0.6137,0.5186,0.5581,0.6405,0.6377,0.6868


### DS100 (`DSAll` result folders): per-class precision, recall and F1-score

In [86]:
# Fallback row labels for reports whose class rows are not named
# 'negative' / 'neutral' / 'positive' (presumably numeric class ids - verify
# the id order against the code that produced those reports).
relabel_dict = {'negative': '0', 'neutral': '1', 'positive': '2'}

# Per-class precision / recall / F1-score of every model on the DSAll folders:
# one row per "<class>_<measure>", one column per model.
report_prefix = 'classification_report_'
report_suffix = '.csv'
dsall_report_paths = glob.glob(os.path.join(DATA_DIR, '*DSAll*', report_prefix + '*' + report_suffix))

# Take the model names from the DSAll folders only: a model without a DSAll
# report would contribute an empty frame and the inner merge below would then
# drop every row. Names are sliced from the known prefix/suffix (so '_' in a
# model name is kept) and sorted for a reproducible column order.
model_names = sorted({os.path.basename(path)[len(report_prefix):-len(report_suffix)] for path in dsall_report_paths})

dfs = []
for model_name in model_names:
    # NOTE(review): if more than one folder matches '*DSAll*', the measure names
    # repeat and the merge below multiplies rows - confirm only one exists.
    filepaths = sorted(glob.glob(os.path.join(DATA_DIR, '*DSAll*', 'classification_report_{}.csv'.format(model_name))))

    records = []
    for filepath in filepaths:
        result = pd.read_csv(filepath).set_index('Unnamed: 0')
        for class_mode in ['positive', 'negative', 'neutral']:
            try:
                item = result.loc[class_mode]
            except KeyError:
                # Only a missing label triggers the fallback; any other error
                # should surface instead of being swallowed.
                item = result.loc[relabel_dict[class_mode]]
            for measure, column in (('Precision', 'precision'), ('Recall', 'recall'), ('F1-score', 'f1-score')):
                records.append(('_'.join([class_mode, measure]), item[column]))
    dfs.append(pd.DataFrame(records, columns=['Dataset_Measure', model_name]))

if not dfs:
    # reduce() on an empty list would fail with an unrelated TypeError.
    raise FileNotFoundError('No classification_report_*.csv files found under {}'.format(os.path.join(DATA_DIR, '*DSAll*')))

# pd.merge defaults to an inner join on the shared 'Dataset_Measure' column.
final_result = reduce(lambda left, right: pd.merge(left, right, on='Dataset_Measure'), dfs)
final_result.to_csv(os.path.join(DATA_DIR, 'FPB_DS100_results.csv'), index=False)
final_result

Unnamed: 0,Dataset_Measure,swn,roberta-base,bert-base-uncased,senti-dd,word2vec-google-news-300,textblob,SO-CAL,SentiStrength,MPQA,Sen140,vader,lm,AFINN
0,positive_Precision,0.2284,0.9422,0.7025,0.8528,0.6886,0.3439,0.3158,0.4465,0.3026,0.1797,0.3889,0.6371,0.4574
1,positive_Recall,0.3132,0.9506,0.8243,0.6101,0.6959,0.3931,0.6121,0.3922,0.5208,0.1864,0.7384,0.1632,0.7298
2,positive_F1-score,0.2637,0.9458,0.7562,0.7106,0.6918,0.3668,0.4163,0.4172,0.3827,0.1827,0.5085,0.2596,0.5615
3,negative_Precision,0.2562,0.8928,0.9029,0.6264,0.7565,0.3689,0.2282,0.3321,0.3711,0.1782,0.3479,0.4184,0.4196
4,negative_Recall,0.469,0.9607,0.421,0.834,0.3767,0.4768,0.1475,0.3652,0.3317,0.9466,0.2255,0.4004,0.3369
5,negative_F1-score,0.331,0.9234,0.525,0.7145,0.5008,0.4143,0.1782,0.3471,0.3497,0.2997,0.272,0.4087,0.3694
6,neutral_Precision,0.6017,0.9877,0.9372,0.8544,0.8739,0.6686,0.7041,0.7435,0.6385,0.7959,0.8058,0.6852,0.8392
7,neutral_Recall,0.3988,0.9663,0.9705,0.8918,0.9653,0.585,0.4857,0.765,0.4629,0.0332,0.5743,0.9005,0.6686
8,neutral_F1-score,0.479,0.9766,0.9532,0.8725,0.9173,0.6238,0.5748,0.7539,0.5366,0.0633,0.6705,0.7782,0.7439
