### Data Pre-processing

In [60]:
import os
import pandas as pd
import re

# Loading the Loughran-McDonald Master Dictionary (LMD) - Loughran & McDonald (2011)
lmd_path = '/content/Loughran-McDonald_MasterDictionary_1993-2023.csv'

lmd = pd.read_csv(lmd_path)

lmd.shape

(86553, 17)

In [61]:
lmd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86553 entries, 0 to 86552
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Word                86552 non-null  object 
 1   Seq_num             86553 non-null  int64  
 2   Word Count          86553 non-null  int64  
 3   Word Proportion     86553 non-null  float64
 4   Average Proportion  86553 non-null  float64
 5   Std Dev             86553 non-null  float64
 6   Doc Count           86553 non-null  int64  
 7   Negative            86553 non-null  int64  
 8   Positive            86553 non-null  int64  
 9   Uncertainty         86553 non-null  int64  
 10  Litigious           86553 non-null  int64  
 11  Strong_Modal        86553 non-null  int64  
 12  Weak_Modal          86553 non-null  int64  
 13  Constraining        86553 non-null  int64  
 14  Complexity          86553 non-null  int64  
 15  Syllables           86553 non-null  int64  
 16  Sour

In [62]:
# Extracting Positive and Negative word lists
condition = (lmd['Positive'] >= 1) | (lmd['Negative'] >= 1)

lmd = lmd[condition]

lmd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2692 entries, 9 to 86116
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Word                2692 non-null   object 
 1   Seq_num             2692 non-null   int64  
 2   Word Count          2692 non-null   int64  
 3   Word Proportion     2692 non-null   float64
 4   Average Proportion  2692 non-null   float64
 5   Std Dev             2692 non-null   float64
 6   Doc Count           2692 non-null   int64  
 7   Negative            2692 non-null   int64  
 8   Positive            2692 non-null   int64  
 9   Uncertainty         2692 non-null   int64  
 10  Litigious           2692 non-null   int64  
 11  Strong_Modal        2692 non-null   int64  
 12  Weak_Modal          2692 non-null   int64  
 13  Constraining        2692 non-null   int64  
 14  Complexity          2692 non-null   int64  
 15  Syllables           2692 non-null   int64  
 16  Source    

In [63]:
print(lmd['Negative'].value_counts())
print(lmd['Positive'].value_counts())

Negative
2009    2305
0        347
2014      26
2011      13
2012       1
Name: count, dtype: int64
Positive
0       2345
2009     345
2012       1
2011       1
Name: count, dtype: int64


In [64]:
word_set = set(lmd['Word'].str.lower().tolist())
print(word_set)



In [65]:
# Loading sp500-edgar-10k (Jlohing, 2022)
path = '/content/'
parquet_files = [file for file in os.listdir(path) if file.endswith('.parquet')]

dfs = {}

for file in parquet_files:
    year = int(file.split('.')[0])
    df = pd.read_parquet(os.path.join(path, file))
    dfs[year] = df

for year, df in dfs.items():
    print(year, df.shape)

2014 (480, 37)
2017 (483, 37)
2018 (488, 37)
2010 (476, 37)
2016 (486, 37)
2019 (492, 37)
2013 (474, 37)
2012 (472, 37)
2015 (478, 37)
2011 (476, 37)


In [66]:
clean_data = pd.DataFrame()

for year, df in dfs.items():
    print("Processing", year)
    df.drop(columns=['cik', 'sic', 'company', 'date',
                     'item_1', 'item_1B', 'item_2',
                     'item_4', 'item_5', 'item_6',
                     'item_8', 'item_9', 'item_9A',
                     'item_9B', 'item_10', 'item_11',
                     'item_12', 'item_13', 'item_14',
                     'item_15', 'ret','mkt_cap'], inplace=True)
    df = df.reset_index(drop=True)
    df['Sentiment'] = None
    df['Year'] = None
    avg = 0
    avg_raw = 0
    # Sentence extraction using LMD
    for index, row in df.iterrows():
        sentiment_list = []
        content = str(row['item_1A']) + str(row['item_3']) + str(row['item_7']) + str(row['item_7A'])
        sentences = re.split(r'[.!?]+', content)
        for sentence in sentences:
            for i, word in enumerate(sentence.split()):
                if word in word_set:
                    sentiment_list.append(sentence)
                    break
        # Remove unnecessary punctuation
        text = str(sentiment_list)
        text = text.replace("',", ".")
        text = text.replace("'", "")
        text = text.replace("\\n", "")
        text = text.replace('\n', '.')
        text = text.replace('[', '')
        text = text.replace(']', '')
        text = text.replace('•', ' ')
        text = re.sub(r'\s+', ' ', text).strip()
        df.at[index, 'Sentiment'] = text
        df.at[index, 'Year'] = year
        ratio = len(sentiment_list) * 100 / len(sentences)
        # Retain only instances with at least 10% of sentences extracted
        if ratio < 10:
            print("Removed instance at index",index,"- not enough info retained. Ratio =",ratio)
            df.drop(index, inplace = True)
        else:
            avg_raw += len(sentences)
            avg += len(sentiment_list)

    df = df.reset_index(drop=True)
    avg /= len(df)
    avg_raw /= len(df)
    print("Retained an average of",avg,"sentences out of an average total of", avg_raw)
    print("Finished", year)
    df.drop(columns=['item_1A','item_3','item_7', 'item_7A'], inplace=True)
    clean_data = clean_data._append(df, ignore_index=True)

Processing 2014
Removed instance at index 73 - not enough info retained. Ratio = 0.0
Removed instance at index 83 - not enough info retained. Ratio = 0.0
Removed instance at index 96 - not enough info retained. Ratio = 0.0
Removed instance at index 182 - not enough info retained. Ratio = 0.0
Removed instance at index 200 - not enough info retained. Ratio = 0.0
Removed instance at index 244 - not enough info retained. Ratio = 0.0
Removed instance at index 269 - not enough info retained. Ratio = 0.0
Removed instance at index 280 - not enough info retained. Ratio = 0.0
Removed instance at index 302 - not enough info retained. Ratio = 0.0
Removed instance at index 303 - not enough info retained. Ratio = 6.25
Retained an average of 297.668085106383 sentences out of an average total of 991.9404255319149
Finished 2014
Processing 2017
Removed instance at index 49 - not enough info retained. Ratio = 0.0
Removed instance at index 57 - not enough info retained. Ratio = 0.0
Removed instance at ind

In [67]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689 entries, 0 to 4688
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   1_day_return    4689 non-null   float64
 1   3_day_return    4689 non-null   float64
 2   5_day_return    4689 non-null   float64
 3   10_day_return   4689 non-null   float64
 4   20_day_return   4689 non-null   float64
 5   40_day_return   4689 non-null   float64
 6   60_day_return   4689 non-null   float64
 7   80_day_return   4689 non-null   float64
 8   100_day_return  4689 non-null   float64
 9   150_day_return  4689 non-null   float64
 10  252_day_return  4689 non-null   float64
 11  Sentiment       4689 non-null   object 
 12  Year            4689 non-null   object 
dtypes: float64(11), object(2)
memory usage: 476.4+ KB


In [68]:
clean_data.to_parquet('10K_LMD_sent_BERT.parquet')

### Sentiment Analysis

In [69]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import torch
import scipy

df = pd.read_parquet('/content/10K_LMD_sent_BERT.parquet')

df.shape

(4689, 13)

In [70]:
nltk.download('punkt')
df['Sentences'] = df['Sentiment'].apply(sent_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [71]:
# BERT - Devlin et al., 2018
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# FinBERT - Araci, 2019
model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

In [72]:
device = torch.device('cuda')
# device = torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [73]:
pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

In [74]:
def get_sentiment_score(sentence):
    # Sentence-wise sentiment analysis
    encoded_input = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True).to(device)
    with torch.no_grad():
        logits = model(**encoded_input).logits
    scores = {k: v for k, v in zip(model.config.id2label.values(), scipy.special.softmax(logits.cpu().numpy().squeeze()))}
    return scores

def pool_sentiment_scores(sentiment_scores_list):
    # Logit score pooling
    num_sentences = len(sentiment_scores_list)
    if num_sentences == 0:
        return 0
    positive_sum = 0
    negative_sum = 0
    for sentiment_scores in sentiment_scores_list:
        # For BERT:
        # positive_sum += sentiment_scores['LABEL_0']
        # negative_sum += sentiment_scores['LABEL_1']
        # For FinBERT:
        positive_sum += sentiment_scores['positive']
        negative_sum += sentiment_scores['neutral']
    pooled_score = (positive_sum - negative_sum) / num_sentences
    return pooled_score

In [75]:
df['Sentiment_scores'] = df['Sentences'].apply(lambda x: [get_sentiment_score(sentence) for sentence in x])

In [76]:
df['Document_sentiment_score'] = df['Sentiment_scores'].apply(lambda x: pool_sentiment_scores(x))

In [77]:
df.drop(columns=['Sentiment', 'Sentences', 'Sentiment_scores'], inplace=True)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689 entries, 0 to 4688
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   1_day_return              4689 non-null   float64
 1   3_day_return              4689 non-null   float64
 2   5_day_return              4689 non-null   float64
 3   10_day_return             4689 non-null   float64
 4   20_day_return             4689 non-null   float64
 5   40_day_return             4689 non-null   float64
 6   60_day_return             4689 non-null   float64
 7   80_day_return             4689 non-null   float64
 8   100_day_return            4689 non-null   float64
 9   150_day_return            4689 non-null   float64
 10  252_day_return            4689 non-null   float64
 11  Year                      4689 non-null   int64  
 12  Document_sentiment_score  4689 non-null   float64
dtypes: float64(12), int64(1)
memory usage: 476.4 KB


In [79]:
# df.to_parquet('processed_BERT.parquet')
df.to_parquet('processed_FinBERT.parquet')

### Correlation and Logistic Regression

In [33]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# df_proc = pd.read_parquet('processed_BERT.parquet')
df_proc = pd.read_parquet('processed_FinBERT.parquet')

In [34]:
return_periods = ['1_day_return', '3_day_return', '5_day_return', '10_day_return', '20_day_return',
                  '40_day_return', '60_day_return', '80_day_return', '100_day_return',
                  '150_day_return', '252_day_return']

# Calculate correlation coefficient and p-value for each return period
for period in return_periods:
    correlation_coefficient, p_value = pearsonr(df_proc['Document_sentiment_score'], df_proc[period])
    print(f'  Correlation coefficient for {period}: {correlation_coefficient}')
    print(f'  P-value for {period}: {p_value}')

  Correlation coefficient for 1_day_return: 0.08556919292659919
  P-value for 1_day_return: 0.06466021004452369
  Correlation coefficient for 3_day_return: 0.11572423557104795
  P-value for 3_day_return: 0.012330820330012298
  Correlation coefficient for 5_day_return: 0.10171773466597445
  P-value for 5_day_return: 0.027952565264951688
  Correlation coefficient for 10_day_return: 0.05115043172204848
  P-value for 10_day_return: 0.26997068952577624
  Correlation coefficient for 20_day_return: -0.05665301717228535
  P-value for 20_day_return: 0.22171419576945614
  Correlation coefficient for 40_day_return: -0.053171077704890386
  P-value for 40_day_return: 0.25147723956642276
  Correlation coefficient for 60_day_return: -0.029201699039710677
  P-value for 60_day_return: 0.5290236605622016
  Correlation coefficient for 80_day_return: -0.03340738649446961
  P-value for 80_day_return: 0.47139816672187884
  Correlation coefficient for 100_day_return: -0.04839607579442828
  P-value for 100_da

In [35]:
df_proc.drop(columns=['10_day_return', '20_day_return',
                      '40_day_return', '60_day_return', '80_day_return', '100_day_return',
                      '150_day_return', '252_day_return'], inplace=True)

df_proc['label_3'] = None
df_proc['label_5'] = None
for index, row in df_proc.iterrows():
    if row['3_day_return'] > 1.015:
        df_proc.at[index, 'label_3'] = 'up'
    elif row['3_day_return'] < 0.985:
        df_proc.at[index, 'label_3'] = 'down'
    else:
        df_proc.at[index, 'label_3'] = 'no'
    if row['5_day_return'] > 1.015:
        df_proc.at[index, 'label_5'] = 'up'
    elif row['5_day_return'] < 0.985:
        df_proc.at[index, 'label_5'] = 'down'
    else:
        df_proc.at[index, 'label_5'] = 'no'

In [36]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X = df_proc[['Document_sentiment_score']]
y = df_proc['label_3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Original class distribution:\n", y_train.value_counts())
print("Resampled class distribution:\n", y_train_res.value_counts())

model = LogisticRegression()
model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)

print(f'Classification Report:\n{report}')

Original class distribution:
 label_3
no      222
up       92
down     59
Name: count, dtype: int64
Resampled class distribution:
 label_3
no      222
up      222
down    222
Name: count, dtype: int64
Classification Report:
              precision    recall  f1-score   support

        down       0.20      0.64      0.31        14
          no       1.00      0.03      0.06        62
          up       0.26      0.67      0.37        18

    accuracy                           0.24        94
   macro avg       0.49      0.45      0.25        94
weighted avg       0.74      0.24      0.16        94




### References:

*   Araci, Dogu. ‘FinBERT: Financial Sentiment Analysis with Pre-Trained Language Models’, 2019. https://doi.org/10.48550/arXiv.1908.10063.
*   Devlin, Jacob, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. ‘BERT: Pre-Training of Deep Bidirectional Transformers for Language Understanding’, 2018. https://doi.org/10.48550/arXiv.1810.04805.
*   ‘Jlohding/Sp500-Edgar-10k · Datasets at Hugging Face’. Accessed 17 June 2024. https://huggingface.co/datasets/jlohding/sp500-edgar-10k.
*   Loughran, Tim, and Bill Mcdonald. ‘When Is a Liability Not a Liability? Textual Analysis, Dictionaries, and 10‐Ks’. The Journal of Finance 66, no. 1 (February 2011): 35–65. https://doi.org/10.1111/j.1540-6261.2010.01625.x.

