In [1]:
import glob, os
import pandas as pd
from reports import create_confusion_matrix, create_classification_report, average_classification_report, sum_confusion_matrix
from lexicons import senti_dd_polarity, lm_polarity, vader_polarity, swn_polarity, textblob_polarity
from senti_dd_construction import construct_senti_dd

In [2]:
# Root directory that holds every dataset split and all generated artefacts.
# Set the SENTI_DD_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
DATA_DIR = os.environ.get('SENTI_DD_DATA_DIR', '/media/dmlab/My Passport/DATA/research-notes/lexicon')

# Train/test splits. glob.glob returns matches in arbitrary order, so the
# results are sorted to make every run visit the files in the same order.
FPB_TRAIN_FILEPATHS = sorted(glob.glob(os.path.join(DATA_DIR, 'FinancialPhrase*', '*', 'train.csv')))
SEMEVAL_TRAIN_FILEPATH = os.path.join(DATA_DIR, 'SemEval-test (FPB-train)', 'train.csv')
FIQA_TRAIN_FILEPATH = os.path.join(DATA_DIR, 'FiQA-test (FPB-train)', 'train.csv')
FPB_TEST_FILEPATHS = sorted(glob.glob(os.path.join(DATA_DIR, 'FinancialPhrase*', '*', 'test.csv')))
SEMEVAL_TEST_FILEPATH = os.path.join(DATA_DIR, 'SemEval-test (FPB-train)', 'test.csv')
FIQA_TEST_FILEPATH = os.path.join(DATA_DIR, 'FiQA-test (FPB-train)', 'test.csv')

# Glob *templates* (not yet expanded). Each has two '{}' placeholders that are
# filled with str.format(ds, lexicon_name): the first selects the dataset
# variant (e.g. '50', 'All'), the second the lexicon (e.g. 'lm', 'vader').
FPB_REPORT_FILEPATHS = os.path.join(DATA_DIR, 'FinancialPhrase*_DS{}*', '*', 'classification_report_{}.csv')
FPB_CONF_FILEPATHS = os.path.join(DATA_DIR, 'FinancialPhrase*_DS{}*', '*', 'confusion_matrix_{}.csv')
FPB_PREDICTIONS_FILEPATHS = os.path.join(DATA_DIR, 'FinancialPhrase*_DS{}*', '*', 'predictions_{}.csv')

Construct Senti-DD

In [None]:
# Build a new list instead of extending FPB_TRAIN_FILEPATHS in place: the
# in-place extend mutated the config constant, so re-running this cell appended
# the SemEval/FiQA paths again and processed them more than once.
filepaths = FPB_TRAIN_FILEPATHS + [SEMEVAL_TRAIN_FILEPATH, FIQA_TRAIN_FILEPATH]
for train_filepath in filepaths:
    print('Processing {}..'.format(train_filepath))
    # Both output files are written next to the training split they derive from.
    out_dir = os.path.dirname(train_filepath)
    dd_filepath = os.path.join(out_dir, 'direction_dependent_entities.csv')
    senti_dd_filepath = os.path.join(out_dir, 'Senti-DD.csv')
    construct_senti_dd(train_filepath, dd_filepath, senti_dd_filepath)

Predict polarity based on lexicons

In [None]:
# Build a new list instead of extending FPB_TEST_FILEPATHS in place, so that
# re-running this cell does not keep appending the SemEval/FiQA paths to the
# config constant.
filepaths = FPB_TEST_FILEPATHS + [SEMEVAL_TEST_FILEPATH, FIQA_TEST_FILEPATH]
lexicons = [(senti_dd_polarity, 'senti-dd'), (lm_polarity, 'lm'), (vader_polarity, 'vader'), (swn_polarity, 'swn'), (textblob_polarity, 'textblob')]
for test_filepath in filepaths:
    # All inputs/outputs for a split live in the directory of its test.csv.
    out_dir = os.path.dirname(test_filepath)
    for lexicon_func, lexicon_name in lexicons:
        print('Processing {} with {}..'.format(test_filepath, lexicon_name))
        # Format only the file name: calling str.format on the full path would
        # fail (or substitute wrongly) if a directory name contained '{' or '}'.
        df_filepath = os.path.join(out_dir, 'predictions_{}.csv'.format(lexicon_name))
        conf_filepath = os.path.join(out_dir, 'confusion_matrix_{}.csv'.format(lexicon_name))
        report_filepath = os.path.join(out_dir, 'classification_report_{}.csv'.format(lexicon_name))

        # Re-read per lexicon so each predictions file starts from the
        # unmodified test split (the frame gains columns below).
        df = pd.read_csv(test_filepath)

        if lexicon_name == 'senti-dd':
            # Senti-DD is the only lexicon that takes a second argument: the
            # lexicon table stored next to the test split.
            senti_dd_filepath = os.path.join(out_dir, 'Senti-DD.csv')
            senti_dd = pd.read_csv(senti_dd_filepath)
            df['prediction'] = df['headline'].apply(lambda x: lexicon_func(x, senti_dd))
        else:
            df['prediction'] = df['headline'].apply(lexicon_func)

        # Vectorized comparison; equivalent to the row-wise apply it replaces.
        df['correct'] = df['label'] == df['prediction']
        df.to_csv(df_filepath, index=False)

        labels, preds = df.label, df.prediction
        create_confusion_matrix(labels, preds, conf_filepath)
        accuracy = int(df['correct'].sum()) / len(df)
        create_classification_report(labels, preds, accuracy, report_filepath)

Average results for k-folds

In [5]:
def concat_dfs(df_filepaths, out_filepath):
    """Concatenate several CSV files row-wise and save the result as one CSV.

    Args:
        df_filepaths: Iterable of paths to CSV files; the first row of each
            file is read as its header.
        out_filepath: Path of the combined CSV to write (without the index).

    Returns:
        The combined DataFrame, re-indexed 0..n-1 so that row labels are
        unique across the input files.

    Raises:
        ValueError: If ``df_filepaths`` is empty.
    """
    df_filepaths = list(df_filepaths)
    if not df_filepaths:
        # pd.concat raises ValueError on an empty list as well; raise it here
        # with a message that says which output could not be produced.
        raise ValueError('No objects to concatenate: no input files for {}'.format(out_filepath))
    dfs = [pd.read_csv(df_filepath, header=0) for df_filepath in df_filepaths]
    concat_df = pd.concat(dfs, ignore_index=True)
    concat_df.to_csv(out_filepath, index=False)
    print('Created {}'.format(out_filepath))
    return concat_df
    
ds_list = ['50', '66', '75', 'All']
lexicon_names = ['senti-dd', 'lm', 'vader', 'swn', 'textblob']

# For every dataset variant and lexicon, combine the per-fold result files into
# one file of each kind, written one directory level above the fold folders.
for ds in ds_list:
    for lexicon_name in lexicon_names:
        # Sorted because glob.glob returns matches in arbitrary order; this
        # keeps the choice of out_dir and the row order of the concatenated
        # predictions stable between runs.
        report_filepaths = sorted(glob.glob(FPB_REPORT_FILEPATHS.format(ds, lexicon_name)))
        if not report_filepaths:
            # Indexing [0] below would raise IndexError with no hint of what is
            # missing; report it and carry on with the remaining combinations.
            print('No per-fold results found for DS{} with {}; skipping'.format(ds, lexicon_name))
            continue
        # Parent of the fold directory that holds the first matched report.
        out_dir = os.path.dirname(os.path.dirname(report_filepaths[0]))
        average_classification_report(report_filepaths, os.path.join(out_dir, 'classification_report_{}.csv'.format(lexicon_name)))

        conf_filepaths = sorted(glob.glob(FPB_CONF_FILEPATHS.format(ds, lexicon_name)))
        sum_confusion_matrix(conf_filepaths, os.path.join(out_dir, 'confusion_matrix_{}.csv'.format(lexicon_name)))

        df_filepaths = sorted(glob.glob(FPB_PREDICTIONS_FILEPATHS.format(ds, lexicon_name)))
        concat_dfs(df_filepaths, os.path.join(out_dir, 'predictions_{}.csv'.format(lexicon_name)))

Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/classification_report_senti-dd.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/confusion_matrix_senti-dd.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/predictions_senti-dd.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/classification_report_lm.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/confusion_matrix_lm.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/predictions_lm.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/classification_report_vader.csv
Created /media/dmlab/My Passport/DATA/research-notes/lexicon/FinancialPhraseBank_DS50 (k-folds)/confusion_matrix_vader.csv
Created /me