Appendix
* Target sentences: `results/FinancialPhraseBank_DSAll_k-folds/predictions_senti-dd.csv`
* Senti-DD Lexicon: `data/DS50_Entire/Senti-DD.csv`

In [1]:
import os
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import nltk
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from itertools import product

data_filepath ='results/FinancialPhraseBank_DSAll_k-folds/predictions_senti-dd.csv'
senti_dd_lexicon_filepath = 'data/DS50_Entire/Senti-DD.csv'


data_dir = r'C:\Users\Jihye Park\OneDrive\Ph.D\연구\02.Financial Sentiment Lexicon\Data'
lm_filepath = os.path.join(data_dir, 'LM_Word_List', 'LM_Word_List.csv')

save_filepath = 'results/senti_dd_interpretability_evidence_for_appendix.csv'

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package omw-1.4 to C:\Users\Jihye
[nltk_data]     Park\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv(data_filepath)[['headline', 'label']]
lm_df = pd.read_csv(lm_filepath)
senti_dd_lexicon_df = pd.read_csv(senti_dd_lexicon_filepath)

print('sentences\n', df.head(), '\n')
print('Loughran-McDonald\n', lm_df.head(), '\n')
print('Senti-DD\n', senti_dd_lexicon_df.head(), '\n')

sentences
                                             headline     label
0  finnish developer and manufacturer of mobile p...  negative
1  the transaction is expected to be finalized by...   neutral
2  okmetic board of directors has also decided on...   neutral
3  indigo and somoncom serve 377 000 subscribers ...   neutral
4  to see a slide show of all the newest product ...   neutral 

Loughran-McDonald
            word     label
0       abandon  negative
1     abandoned  negative
2    abandoning  negative
3   abandonment  negative
4  abandonments  negative 

Senti-DD
   sentiment entity directional_word
0  positive    eur          acceler
1  positive    eur           advanc
2  positive    eur            award
3  positive    eur           better
4  positive    eur            climb 



In [3]:
def get_intersection(list_one, list_two):
    return list(set(list_one) & set(list_two))

lm_positive_words = lm_df[lm_df['label']=='positive']['word'].values
def detect_lm_positive_words(text):
    tokens = word_tokenize(text)
    return get_intersection(tokens, lm_positive_words)

lm_negative_words = lm_df[lm_df['label']=='negative']['word'].values
def detect_lm_negative_words(text):
    tokens = word_tokenize(text)
    return get_intersection(tokens, lm_negative_words)

lemmatizer=WordNetLemmatizer()
entity_list = list(senti_dd_lexicon_df['entity'].unique())
def detect_entities(text):
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return get_intersection(lemmatized_tokens, entity_list)

stemmer = PorterStemmer()
directional_word_list = list(senti_dd_lexicon_df['directional_word'].unique())
def detect_directional_words(text):
    stemmed_tokens = [stemmer.stem(token) for token in word_tokenize(text)]
    return get_intersection(stemmed_tokens, directional_word_list)

senti_dd_positive_pair_list = ['_'.join((item1, item2)) for item1, item2 in list(senti_dd_lexicon_df[(senti_dd_lexicon_df['sentiment']=='positive')][['entity', 'directional_word']].values)]
def get_positive_pairs(entities, directional_words):
    this_pair_list = ['_'.join((item1, item2)) for item1, item2 in list(product(entities, directional_words))]
    return get_intersection(this_pair_list, senti_dd_positive_pair_list)

senti_dd_negative_pair_list = ['_'.join((item1, item2)) for item1, item2 in list(senti_dd_lexicon_df[(senti_dd_lexicon_df['sentiment']=='negative')][['entity', 'directional_word']].values)]
def get_negative_pairs(entities, directional_words):
    this_pair_list = ['_'.join((item1, item2)) for item1, item2 in list(product(entities, directional_words))]
    return get_intersection(this_pair_list, senti_dd_negative_pair_list)

def score_to_label(score):
    if score > 0: return 'positive'
    elif score < 0: return 'negative'
    else: return 'neutral'

df['lm_positive_words'] = df['headline'].progress_apply(lambda x: detect_lm_positive_words(x))
df['lm_negative_words'] = df['headline'].progress_apply(lambda x: detect_lm_negative_words(x))
df['detected_entities'] = df['headline'].progress_apply(lambda x: detect_entities(x))
df['detected_directional_words'] = df['headline'].progress_apply(lambda x: detect_directional_words(x))
df['senti_dd_pos_pairs'] = df.progress_apply(lambda x: get_positive_pairs(x['detected_entities'], x['detected_directional_words']), axis=1)
df['senti_dd_neg_pairs'] = df.progress_apply(lambda x: get_negative_pairs(x['detected_entities'], x['detected_directional_words']), axis=1)

df['senti_dd_score'] = df.progress_apply(lambda x: len(x['lm_positive_words'])+len(x['senti_dd_pos_pairs']) - len(x['lm_negative_words'])-len(x['senti_dd_neg_pairs']), axis=1)
df['senti_dd_prediction'] = df['senti_dd_score'].progress_apply(lambda x: score_to_label(x))
df['correct'] = df.apply(lambda x: x['label']==x['senti_dd_prediction'], axis=1)
df.head()

100%|████████████████████████████████████████████████████████████████████████████| 2259/2259 [00:01<00:00, 1833.17it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2259/2259 [00:01<00:00, 1162.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2259/2259 [00:06<00:00, 330.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2259/2259 [00:04<00:00, 511.88it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2259/2259 [00:00<00:00, 5189.00it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2259/2259 [00:00<00:00, 10061.31it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2259/2259 [00:00<00:00, 16715.80it/s]
100%|██████████████████████████████████████████████████████████████████████████| 2259/2259 [00:00<00:00, 184423.32it/s]


Unnamed: 0,headline,label,lm_positive_words,lm_negative_words,detected_entities,detected_directional_words,senti_dd_pos_pairs,senti_dd_neg_pairs,senti_dd_score,senti_dd_prediction,correct
0,finnish developer and manufacturer of mobile p...,negative,[],[],"[net, sale, manufacturer, year]",[],[],[],0,neutral,False
1,the transaction is expected to be finalized by...,neutral,[],[],[],[],[],[],0,neutral,True
2,okmetic board of directors has also decided on...,neutral,[],[],"[company, share]",[],[],[],0,neutral,True
3,indigo and somoncom serve 377 000 subscribers ...,neutral,[],[],[share],[],[],[],0,neutral,True
4,to see a slide show of all the newest product ...,neutral,[],[],[product],[],[],[],0,neutral,True


Save

In [4]:
df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

Created results/senti_dd_interpretability_evidence_for_appendix.csv


In [5]:
appendix_df = df[(df['detected_entities'].apply(lambda x: len(x)!=0)) & \
                 (df['detected_directional_words'].apply(lambda x: len(x)!=0)) & \
                 (df['lm_positive_words'].apply(lambda x: len(x)==0))& \
                 (df['lm_negative_words'].apply(lambda x: len(x)==0))]

result = pd.concat([appendix_df[appendix_df['correct']==True].sample(10), \
                   appendix_df[appendix_df['correct']==False].sample(10)])[['headline','label', 'senti_dd_prediction', 'senti_dd_pos_pairs', 'senti_dd_neg_pairs']]

result

Unnamed: 0,headline,label,senti_dd_prediction,senti_dd_pos_pairs,senti_dd_neg_pairs
26,m real s sales are expected to have increased ...,positive,positive,"[sale_increas, quarter_increas, year_increas]",[]
169,finnish metal industry solutions supplier outo...,positive,positive,"[solution_rose, mln_rose, profit_rose, oyj_ros...",[]
2074,adp news feb 12 2009 finnish construction comp...,negative,negative,"[construction_decreas, company_decreas]","[profit_decreas, net_decreas, eur_decreas, oyj..."
1818,in the fourth quarter of 2008 net sales increa...,positive,positive,"[quarter_increas, mn_increas, net_increas, sal...",[]
960,excluding non recurring items pre tax profit s...,positive,positive,"[profit_surg, item_surg]",[]
1992,net sales of finnish food industry company l+ñ...,positive,positive,"[sale_increas, eur_increas, mn_increas, net_in...",[company_increas]
674,eps from continuing operations came in at 0 30...,positive,positive,[eur_up],[]
693,eps for the quarter came in at 0 36 eur up fro...,positive,positive,"[eur_up, year_up, quarter_up]",[]
98,the chain posted sales of 298 million euros fo...,positive,positive,"[sale_rise, year_rise, percent_rise]",[]
52,international sales rose by 59 8 to eur 1 244 ...,positive,positive,"[eur_rose, sale_rose, mn_rose]",[]


In [6]:
save_result_filepath = save_filepath.replace('.csv', '_{}.csv'.format(len(result)))
result.to_csv(save_result_filepath, index=False)
print('Created {}'.format(save_result_filepath))

Created results/senti_dd_interpretability_evidence_for_appendix_20.csv
