### Data pre-processing

In [None]:
import os
import pandas as pd

# Loading sp500-edgar-10k (Jlohing, 2022)
path = '/content/'
parquet_files = [file for file in os.listdir(path) if file.endswith('.parquet')]

dfs = {}

for file in parquet_files:
    year = int(file.split('.')[0])
    df = pd.read_parquet(os.path.join(path, file))
    dfs[year] = df

for year, df in dfs.items():
    print(df.shape)

(478, 37)
(476, 37)
(472, 37)
(474, 37)
(483, 37)
(486, 37)
(476, 37)
(492, 37)
(488, 37)
(480, 37)


In [None]:
import re

# Forward-looking statements (FLS) extraction rules (Glodd & Hristova, 2023)
rule_2a = ['next','subsequent','following','upcoming','incoming','coming','succeeding','carryforward']
rule_2b = ['month','quarter','year','fiscal','taxable','period']
rule_3 = ["aim", "anticipate", "assume","commit", "estimate", "expect", "forecast", "foresee","hope", "intend", "plan", "predict", "project", "seek","target" ]

clean_data = pd.DataFrame()

for year, df in dfs.items():
    df.drop(columns=['cik', 'sic', 'company', 'date', 'item_1B', 'item_2', 'item_4', 'item_5', 'item_6', 'item_8', 'item_9', 'item_9A', 'item_9B', 'item_10', 'item_11', 'item_12', 'item_13', 'item_14', 'item_15', 'ret','mkt_cap', '1_day_return', '3_day_return', '10_day_return', '20_day_return', '40_day_return', '60_day_return', '80_day_return', '100_day_return', '150_day_return', '252_day_return'], inplace=True)
    df['FLS'] = None
    rule_1 = [str(i) for i in sorted(list(dfs.keys())) if i > year and i <= 2023]
    for index, row in df.iterrows():
        fls = []
        content = str(row['item_1']) + str(row['item_1A']) + str(row['item_3']) + str(row['item_7']) + str(row['item_7A'])
        sentences = re.split(r'[.!?]+', content)
        for sentence in sentences:
          for i, word in enumerate(sentence.split()):
            if word in rule_2a:
              for j in range(i + 1, len(sentence.split())):
                if sentence.split()[j] in rule_2b:
                    fls.append(sentence)
                    break
            if word in rule_1 or word in rule_3:
              fls.append(sentence)
              break
        # Retain only instances that contain at least 20 sentences
        if len(fls) > 20:
            df.at[index, 'FLS'] = fls
        # Turning stock returns into classification gold labels
        if row['5_day_return'] > 1.05:
            label = 'high'
        elif row['5_day_return'] < 0.95:
            label = 'low'
        else:
            label = 'neg'
        df.at[index, 'Label'] = label
    df = df.dropna()
    distribution = df['Label'].value_counts()
    print(distribution)
    print("Finished", year)
    clean_data = clean_data._append(df, ignore_index=True)


Label
neg     391
high     22
low      14
Name: count, dtype: int64
Finished 2015
Label
neg     348
high     34
low      24
Name: count, dtype: int64
Finished 2011
Label
neg     371
low      24
high     20
Name: count, dtype: int64
Finished 2012
Label
neg     369
high     39
low      17
Name: count, dtype: int64
Finished 2013
Label
neg     397
high     21
low      15
Name: count, dtype: int64
Finished 2017
Label
neg     330
high     93
low      15
Name: count, dtype: int64
Finished 2016
Label
neg     321
high     68
low      16
Name: count, dtype: int64
Finished 2010
Label
neg     390
high     22
low      19
Name: count, dtype: int64
Finished 2019
Label
neg     367
high     39
low      30
Name: count, dtype: int64
Finished 2018
Label
neg     397
high     29
low       5
Name: count, dtype: int64
Finished 2014


In [None]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4247 entries, 0 to 4246
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   item_1        4247 non-null   object 
 1   item_1A       4247 non-null   object 
 2   item_3        4247 non-null   object 
 3   item_7        4247 non-null   object 
 4   item_7A       4247 non-null   object 
 5   5_day_return  4247 non-null   float64
 6   FLS           4247 non-null   object 
 7   Label         4247 non-null   object 
dtypes: float64(1), object(7)
memory usage: 265.6+ KB


In [None]:
clean_data = clean_data[['FLS','Label']]
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4247 entries, 0 to 4246
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   FLS     4247 non-null   object
 1   Label   4247 non-null   object
dtypes: object(2)
memory usage: 66.5+ KB


In [None]:
distribution = clean_data['Label'].value_counts()
print(distribution)

Label
neg     3681
high     387
low      179
Name: count, dtype: int64


In [None]:
import re

# Remove unneccesary punctuation
for ind, fls in enumerate(clean_data["FLS"]):
    text = str(data['FLS'][ind])
    text = text.replace("'", "")
    text = text.replace("\\n", "")
    text = text.replace('\n', '.')
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = re.sub(r'\s+', ' ', text).strip()
    clean_data['FLS'][ind] = text

In [None]:
clean_data.to_parquet('10K_longformer.parquet')

### Undersampling Majority Class

In [None]:
data = pd.read_parquet("/content/10K_longformer.parquet")

In [None]:
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['Label'])

# Undersample majority class for better class balance
low_df = data[data['labels'] == 1]
high_df = data[data['labels'] == 0]
neg_df = data[data['labels'] == 2]

selected_high_df = high_df.sample(n=180, random_state=42)
selected_neg_df = neg_df.sample(n=480, random_state=42)

cleaned_data_adjusted = pd.concat([selected_high_df, selected_neg_df, low_df], ignore_index=True)

cleaned_data_adjusted = cleaned_data_adjusted.sample(frac=1, random_state=42).reset_index(drop=True)

cleaned_data_adjusted.to_parquet('10K_longformer_balanced.parquet')

### Longformer for Sequence Classification
*As described by Beltagy, Peters, & Cohan (2020)*

In [None]:
# !pip install accelerate -U
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-

In [None]:
import os
import torch
import pandas as pd
from collections import Counter

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from transformers import AutoTokenizer, LongformerForSequenceClassification, LongformerConfig

import torch.optim as optim
from tqdm.notebook import tqdm

In [None]:
# data = pd.read_parquet("/content/10K_longformer.parquet")
data = pd.read_parquet("/content/10K_longformer_balanced.parquet")

In [None]:
from datasets import DatasetDict, Dataset

full_data = Dataset.from_pandas(data)

train_test_split = full_data.train_test_split(test_size=0.2)

train_val_split = train_test_split['train'].train_test_split(test_size=0.25)

balanced_test = DatasetDict(
    train=train_val_split['train'],
    val=train_val_split['test'],
    test=train_test_split['test']
)

In [None]:
label_counts = Counter(balanced_test['train']['labels'])

print(label_counts)

label_counts = Counter(balanced_test['val']['labels'])

print(label_counts)

label_counts = Counter(balanced_test['test']['labels'])

print(label_counts)

Counter({2: 285, 1: 115, 0: 103})
Counter({2: 103, 0: 37, 1: 28})
Counter({2: 92, 0: 40, 1: 36})


In [None]:
balanced_test

DatasetDict({
    train: Dataset({
        features: ['FLS', 'labels'],
        num_rows: 503
    })
    val: Dataset({
        features: ['FLS', 'labels'],
        num_rows: 168
    })
    test: Dataset({
        features: ['FLS', 'labels'],
        num_rows: 168
    })
})

In [None]:
print(torch.cuda.is_available())

True


In [None]:
output_dir = '/content/output/'
model_save_dir = os.path.join('/content/saved-models/longformer-v.1')
device = torch.device('cuda')
# device = torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                            num_labels=3,
)
model = model.to(device)
# model.config.attention_window = 256

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
balanced_test = balanced_test.map(
    lambda example: tokenizer(example['FLS'], padding=True, truncation=True, max_length=4096),
    batched=True,
    batch_size=2
)

balanced_test = balanced_test.remove_columns(['FLS'])
balanced_test.set_format('torch')

Map:   0%|          | 0/503 [00:00<?, ? examples/s]

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

In [None]:
balanced_test['train'][0:2]

{'labels': tensor([2, 1]),
 'input_ids': tensor([[    0, 29116, 41220,  ...,     1,     1,     1],
         [    0,  1121,    42,  ...,  4484,  1674,     2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]])}

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(balanced_test['train'], batch_size=2)
eval_dataloader = DataLoader(balanced_test['val'], batch_size=2)
test_dataloader = DataLoader(balanced_test['test'], batch_size=2)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Longformer model requires at least 22GB GPU RAM

checkpoint_dir = 'checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_val_loss = float('inf')
progress_bar = tqdm(range(num_training_steps))
criterion = torch.nn.CrossEntropyLoss()

# Training step
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        output = model(**batch)
        logits = output.logits
        labels = batch['labels']
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progress_bar.update(1)

# Validation step
    model.eval()
    for batch_i, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            batch = {key: value.to(device) for key, value in batch.items()}
            output = model(**batch)
        loss += output.loss

    avg_val_loss = loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint")
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
            },
            os.path.join(checkpoint_dir, f'epoch_{epoch}.pt')
                  )

  0%|          | 0/756 [00:00<?, ?it/s]

Initializing global attention on CLS token...
Input ids are automatically padded from 2384 to 2560 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1221 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2351 to 2560 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1229 to 1536 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2235 to 2560 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2988 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2362 to 2560 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2508 to 2560 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 1645 to 2048 to be a multiple of `config.attention_window`: 512
Input ids are auto

Validation loss: 1.0309672355651855
Saving checkpoint
Validation loss: 1.0142749547958374
Saving checkpoint
Validation loss: 1.012397289276123
Saving checkpoint


In [None]:
# Testing step
model.eval()
test_labels = []
test_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        output = model(**batch)
        logits = output.logits
        preds = torch.argmax(logits, dim=-1)
        test_labels.extend(batch['labels'].cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(test_labels, test_preds)
precision = precision_score(test_labels, test_preds, average='weighted')
recall = recall_score(test_labels, test_preds, average='weighted')
f1 = f1_score(test_labels, test_preds, average='weighted')

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")

Input ids are automatically padded from 2860 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2987 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2642 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 3312 to 3584 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 3934 to 4096 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2846 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 3531 to 3584 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 2854 to 3072 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 3765 to 4096 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 3932 to 4096 to be a mul

Test Accuracy: 0.5476190476190477
Test Precision: 0.2998866213151928
Test Recall: 0.5476190476190477
Test F1 Score: 0.3875457875457876


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
report = classification_report(test_labels, test_preds)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        36
           2       0.55      1.00      0.71        92

    accuracy                           0.55       168
   macro avg       0.18      0.33      0.24       168
weighted avg       0.30      0.55      0.39       168



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(test_preds)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]



### References:

*   Beltagy, Iz, Matthew E. Peters, and Arman Cohan. ‘Longformer: The Long-Document Transformer’. arXiv, 2 December 2020. https://doi.org/10.48550/arXiv.2004.05150.
*   Glodd, Alexander, and Diana Hristova. Extraction of Forward-Looking Financial Information for Stock Price Prediction from Annual Reports Using NLP Techniques, 2023. https://doi.org/10.24251/HICSS.2023.679.
*   ‘Jlohding/Sp500-Edgar-10k · Datasets at Hugging Face’. Accessed 17 June 2024. https://huggingface.co/datasets/jlohding/sp500-edgar-10k.

