In [1]:
import os
import glob
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
from reports import create_confusion_matrix, create_classification_report, average_classification_report, sum_confusion_matrix
from lexicons import afinn_polarity, sentistrength_polarity, mpqa_polarity, socal_polarity, sentiment140_polarity

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/dmlab/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Folder that holds the dataset directories, relative to the notebook's working directory.
root_dir = './data'

# Collect every file matching <root_dir>/FinancialPhrase*/*/train.csv and keep the list in
# sorted order so the processing order is stable between runs.
train_filepaths = glob.glob(os.path.join(root_dir, 'FinancialPhrase*', '*', 'train.csv'))
train_filepaths.sort()

Predict polarity based on lexicons

In [3]:
# Lexicon baselines to evaluate: (scoring function, short name) pairs. The short name is
# embedded in the name of every file written for that lexicon.
lexicons = [
    (sentiment140_polarity, 'Sen140'),
    (socal_polarity, 'SO-CAL'),
    (mpqa_polarity, 'MPQA'),
    (afinn_polarity, 'AFINN'),
    (sentistrength_polarity, 'SentiStrength'),
]

# Outputs go to a 'results' folder that sits next to `root_dir` and mirrors its layout
# (./data/<dataset>/<fold> -> ./results/<dataset>/<fold>). Building the path from the
# directory structure, instead of replacing the substring 'data' in the whole path, keeps
# dataset or fold names that happen to contain 'data' intact.
results_root = os.path.join(os.path.dirname(root_dir.rstrip(os.sep)), 'results')

for train_filepath in train_filepaths:
    # The test split lives in the same folder as the train split.
    fold_dir = os.path.dirname(train_filepath)
    test_filepath = os.path.join(fold_dir, 'test.csv')

    # Create the output folder up front so the notebook also runs on a fresh checkout.
    results_dir = os.path.join(results_root, os.path.relpath(fold_dir, root_dir))
    os.makedirs(results_dir, exist_ok=True)

    # The test split does not depend on the lexicon: read it once per fold.
    test_df = pd.read_csv(test_filepath)

    for lexicon_func, lexicon_name in lexicons:
        df_filepath = os.path.join(results_dir, 'predictions_{}.csv'.format(lexicon_name))
        conf_filepath = os.path.join(results_dir, 'confusion_matrix_{}.csv'.format(lexicon_name))
        report_filepath = os.path.join(results_dir, 'classification_report_{}.csv'.format(lexicon_name))

        # Work on a copy so each lexicon starts from the unmodified test split.
        df = test_df.copy()

        # Score every headline with the current lexicon (progress bar via tqdm.pandas()).
        # NOTE(review): assumes the lexicon functions return values directly comparable
        # to the 'label' column -- confirm against the `lexicons` module.
        df['prediction'] = df['headline'].progress_apply(lexicon_func)

        # Element-wise comparison; equivalent to the row-wise apply but vectorized.
        df['correct'] = df['label'] == df['prediction']
        df.to_csv(df_filepath, index=False)

        labels, preds = df['label'], df['prediction']
        # The report helpers come from the project-local `reports` module; they receive
        # the target path and presumably write the CSV there.
        create_confusion_matrix(labels, preds, conf_filepath)
        # Fraction of rows whose prediction matches the label.
        accuracy = int(df['correct'].sum()) / len(df)
        create_classification_report(labels, preds, accuracy, report_filepath)

100%|██████████| 968/968 [00:00<00:00, 5568.31it/s]


Created ./results/FinancialPhraseBank_DS50_k-folds/fold=0/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/fold=0/classification_report_Sen140.csv


100%|██████████| 968/968 [00:00<00:00, 6176.01it/s]


Created ./results/FinancialPhraseBank_DS50_k-folds/fold=1/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/fold=1/classification_report_Sen140.csv


100%|██████████| 967/967 [00:00<00:00, 6592.45it/s]


Created ./results/FinancialPhraseBank_DS50_k-folds/fold=2/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/fold=2/classification_report_Sen140.csv


100%|██████████| 967/967 [00:00<00:00, 6613.02it/s]


Created ./results/FinancialPhraseBank_DS50_k-folds/fold=3/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/fold=3/classification_report_Sen140.csv


100%|██████████| 965/965 [00:00<00:00, 6354.75it/s]


Created ./results/FinancialPhraseBank_DS50_k-folds/fold=4/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS50_k-folds/fold=4/classification_report_Sen140.csv


100%|██████████| 843/843 [00:00<00:00, 6649.35it/s]


Created ./results/FinancialPhraseBank_DS66_k-folds/fold=0/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/fold=0/classification_report_Sen140.csv


100%|██████████| 843/843 [00:00<00:00, 6708.93it/s]


Created ./results/FinancialPhraseBank_DS66_k-folds/fold=1/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/fold=1/classification_report_Sen140.csv


100%|██████████| 842/842 [00:00<00:00, 6733.68it/s]


Created ./results/FinancialPhraseBank_DS66_k-folds/fold=2/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/fold=2/classification_report_Sen140.csv


100%|██████████| 841/841 [00:00<00:00, 6722.69it/s]


Created ./results/FinancialPhraseBank_DS66_k-folds/fold=3/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/fold=3/classification_report_Sen140.csv


100%|██████████| 840/840 [00:00<00:00, 6666.58it/s]


Created ./results/FinancialPhraseBank_DS66_k-folds/fold=4/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS66_k-folds/fold=4/classification_report_Sen140.csv


100%|██████████| 690/690 [00:00<00:00, 6692.25it/s]


Created ./results/FinancialPhraseBank_DS75_k-folds/fold=0/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/fold=0/classification_report_Sen140.csv


100%|██████████| 690/690 [00:00<00:00, 6760.93it/s]


Created ./results/FinancialPhraseBank_DS75_k-folds/fold=1/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/fold=1/classification_report_Sen140.csv


100%|██████████| 689/689 [00:00<00:00, 6691.39it/s]


Created ./results/FinancialPhraseBank_DS75_k-folds/fold=2/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/fold=2/classification_report_Sen140.csv


100%|██████████| 689/689 [00:00<00:00, 6704.10it/s]


Created ./results/FinancialPhraseBank_DS75_k-folds/fold=3/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/fold=3/classification_report_Sen140.csv


100%|██████████| 689/689 [00:00<00:00, 6684.54it/s]


Created ./results/FinancialPhraseBank_DS75_k-folds/fold=4/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DS75_k-folds/fold=4/classification_report_Sen140.csv


100%|██████████| 453/453 [00:00<00:00, 6764.65it/s]


Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=0/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=0/classification_report_Sen140.csv


100%|██████████| 452/452 [00:00<00:00, 6712.83it/s]


Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=1/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=1/classification_report_Sen140.csv


100%|██████████| 452/452 [00:00<00:00, 6713.33it/s]


Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=2/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=2/classification_report_Sen140.csv


100%|██████████| 451/451 [00:00<00:00, 7010.87it/s]


Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=3/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=3/classification_report_Sen140.csv


100%|██████████| 451/451 [00:00<00:00, 6947.14it/s]


Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=4/confusion_matrix_Sen140.csv
Created ./results/FinancialPhraseBank_DSAll_k-folds/fold=4/classification_report_Sen140.csv
