In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate


## BERT Financial Sentiment

In [10]:
from datasets import load_dataset
from datasets import load_from_disk
train_dataset = load_from_disk('./twitter-financial-news-sentiment/train')

validation_dataset = load_from_disk('./twitter-financial-news-sentiment/validation')

print(train_dataset[0])
print(validation_dataset[0])


{'text': '$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT', 'label': 0}
{'text': '$ALLY - Ally Financial pulls outlook https://t.co/G9Zdi1boy5', 'label': 0}


In [11]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
model_path = './saved_model'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

tokenizer = BertTokenizer.from_pretrained(model_path)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
validation_dataset = validation_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
validation_dataset.set_format("torch")


Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [4]:
# save_directory = "./saved_model"

# # Save the model
# model.save_pretrained(save_directory)

# # Save the tokenizer
# tokenizer.save_pretrained(save_directory)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,  
)

trainer.train()
# sinteractive -p gpu2 --gres=gpu:3 --mem=50G --time=12:00:00 --account=mpcs53113

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,No log,0.496802
2,0.585000,0.413007




TrainOutput(global_step=796, training_loss=0.48482224929272827, metrics={'train_runtime': 1592.8937, 'train_samples_per_second': 11.982, 'train_steps_per_second': 0.5, 'total_flos': 5021782690756608.0, 'train_loss': 0.48482224929272827, 'epoch': 2.0})

In [14]:

results = trainer.evaluate()

print(results)

{'eval_loss': 0.41300681233406067, 'eval_runtime': 78.5922, 'eval_samples_per_second': 30.385, 'eval_steps_per_second': 1.272, 'epoch': 2.0}


In [22]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


text = "The FOMC meeting showed concerns about inflation."

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

outputs = model(**inputs)

predictions = torch.argmax(outputs.logits, dim=-1)

sentiments = {0: "Bearish", 1: "Bullish", 2: "Neutral"}
print("Predicted sentiment:", sentiments[predictions.item()])


Predicted sentiment: Neutral


In [17]:

orig_model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

orig_tokenizer = BertTokenizer.from_pretrained(model_path)


In [20]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

orig_model.to(device)


text = "The FOMC meeting showed concerns about inflation."

inputs = orig_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

outputs = orig_model(**inputs)

predictions = torch.argmax(outputs.logits, dim=-1)

sentiments = {0: "Bearish", 1: "Bullish", 2: "Neutral"}
print("Original Predicted sentiment:", sentiments[predictions.item()])


Original Predicted sentiment: Bearish


In [25]:
trainer = Trainer(
    model=orig_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)
results = trainer.evaluate()

print(results)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 1.0871833562850952, 'eval_runtime': 78.4621, 'eval_samples_per_second': 30.435, 'eval_steps_per_second': 1.275}


## Bert on FOMC to predict SPY

In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
     -------------------------------------- 147.9/147.9 kB 1.1 MB/s eta 0:00:00
Collecting soupsieve>1.2
  Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.3 bs4-0.0.2 soupsieve-2.6



[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: C:\Users\10029\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Download the minutes files
base_url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"


output_dir = "fomc_minutes_html"
os.makedirs(output_dir, exist_ok=True)


def download_file(url, output_path):
    response = requests.get(url)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(response.text)


def scrape_fomc_minutes_html(base_url, output_dir):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')


    for link in soup.find_all('a', href=True):
        href = link['href']
        if "minutes" in href and href.endswith(".htm"):
            html_url = urljoin(base_url, href)
            file_name = os.path.join(output_dir, os.path.basename(html_url))
            print(f"Downloading {html_url}...")
            download_file(html_url, file_name)
            print(f"Saved to {file_name}")


if __name__ == "__main__":
    scrape_fomc_minutes_html(base_url, output_dir)
    print("Download complete.")

Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20240131.htm...
Saved to fomc_minutes_html\fomcminutes20240131.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20240320.htm...
Saved to fomc_minutes_html\fomcminutes20240320.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20240501.htm...
Saved to fomc_minutes_html\fomcminutes20240501.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20240612.htm...
Saved to fomc_minutes_html\fomcminutes20240612.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20230201.htm...
Saved to fomc_minutes_html\fomcminutes20230201.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20230322.htm...
Saved to fomc_minutes_html\fomcminutes20230322.htm
Downloading https://www.federalreserve.gov/monetarypolicy/fomcminutes20230503.htm...
Saved to fomc_minutes_html\fomcminutes20230503.htm
Downloading https://www.federalreserve.gov/monet

In [3]:
import os
from bs4 import BeautifulSoup

# Text extraction from html
def extract_text_from_html(directory):
    for html_file in os.listdir(directory):
        if html_file.endswith('.htm'):
            html_path = os.path.join(directory, html_file)
            with open(html_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')

                # Example: Extract all paragraph text
                text = ''
                for p in soup.find_all('p'):
                    text += p.get_text() + '\n'

                # Save the extracted text
                text_file = html_file.replace('.htm', '.txt')
                with open(os.path.join(directory, text_file), 'w', encoding='utf-8') as text_output:
                    text_output.write(text)

                print(f"Extracted text from {html_file} to {text_file}")


if __name__ == "__main__":
    extract_text_from_html('fomc_minutes_html')

Extracted text from fomcminutes20190130.htm to fomcminutes20190130.txt
Extracted text from fomcminutes20190320.htm to fomcminutes20190320.txt
Extracted text from fomcminutes20190501.htm to fomcminutes20190501.txt
Extracted text from fomcminutes20190619.htm to fomcminutes20190619.txt
Extracted text from fomcminutes20190731.htm to fomcminutes20190731.txt
Extracted text from fomcminutes20190918.htm to fomcminutes20190918.txt
Extracted text from fomcminutes20191030.htm to fomcminutes20191030.txt
Extracted text from fomcminutes20191211.htm to fomcminutes20191211.txt
Extracted text from fomcminutes20200129.htm to fomcminutes20200129.txt
Extracted text from fomcminutes20200315.htm to fomcminutes20200315.txt
Extracted text from fomcminutes20200429.htm to fomcminutes20200429.txt
Extracted text from fomcminutes20200610.htm to fomcminutes20200610.txt
Extracted text from fomcminutes20200729.htm to fomcminutes20200729.txt
Extracted text from fomcminutes20200916.htm to fomcminutes20200916.txt
Extrac

In [4]:
!pip install yfinance


Collecting yfinance
  Downloading yfinance-0.2.41-py2.py3-none-any.whl (73 kB)
     -------------------------------------- 73.5/73.5 kB 800.8 kB/s eta 0:00:00
Collecting multitasking>=0.0.7
  Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Collecting platformdirs>=2.0.0
  Downloading platformdirs-4.2.2-py3-none-any.whl (18 kB)
Collecting peewee>=3.16.2
  Downloading peewee-3.17.6.tar.gz (3.0 MB)
     ---------------------------------------- 3.0/3.0 MB 8.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting html5lib>=1.1
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
     -------------------------------------- 112.2/112.2 kB 6.8 MB/s eta 0:00:00
Collecting lxml>=4.9.


[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: C:\Users\10029\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
import os
import yfinance as yf
import pandas as pd

# Prepare the data file for the second task
fomc_dir = 'fomc_minutes_html'

fomc_dates = []

# Extract dates from the filenames in the directory
for filename in os.listdir(fomc_dir):
    if filename.startswith('fomcminutes') and filename.endswith('.txt'):
        date_str = filename[len('fomcminutes'):-len('.txt')]
        # Format date string to YYYY-MM-DD
        formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
        fomc_dates.append(formatted_date)

fomc_dates = sorted(fomc_dates)

print("Extracted FOMC Dates:", fomc_dates)

# Download SPY data for the extracted dates
spy_data = yf.download('SPY', start=min(fomc_dates), end=max(fomc_dates))

# Filter the data for the FOMC dates
spy_data = spy_data[spy_data.index.isin(fomc_dates)]
spy_data = spy_data[['Open', 'Close']]
spy_data['Date'] = spy_data.index

# Calculate the difference between open and close prices
spy_data['Price_Difference'] = spy_data['Close'] - spy_data['Open']

print(spy_data)

aligned_data = []

# Align FOMC minutes with SPY data
for fomc_date in fomc_dates:
    fomc_filename = f"fomcminutes{fomc_date.replace('-', '')}.txt"
    fomc_filepath = os.path.join(fomc_dir, fomc_filename)

    if os.path.exists(fomc_filepath):
        with open(fomc_filepath, 'r', encoding='utf-8') as file:
            fomc_text = file.read()

        if fomc_date in spy_data.index:
            price_difference = spy_data.loc[fomc_date, 'Price_Difference']

            aligned_data.append({
                'Date': fomc_date,
                'FOMC_Minutes_Text': fomc_text,
                'SPY_Price_Difference': price_difference
            })
    else:
        print(f"File not found: {fomc_filepath}")

# Convert the aligned data to a DataFrame
aligned_df = pd.DataFrame(aligned_data)

print(aligned_df)

# Save the aligned data to a CSV file
aligned_df.to_csv('fomc_spy_aligned_data.csv', index=False)

Extracted FOMC Dates: ['2019-01-30', '2019-03-20', '2019-05-01', '2019-06-19', '2019-07-31', '2019-09-18', '2019-10-30', '2019-12-11', '2020-01-29', '2020-03-15', '2020-04-29', '2020-06-10', '2020-07-29', '2020-09-16', '2020-11-05', '2020-12-16', '2021-01-27', '2021-03-17', '2021-04-28', '2021-06-16', '2021-07-28', '2021-09-22', '2021-11-03', '2021-12-15', '2022-01-26', '2022-03-16', '2022-05-04', '2022-06-15', '2022-07-27', '2022-09-21', '2022-11-02', '2022-12-14', '2023-02-01', '2023-03-22', '2023-05-03', '2023-06-14', '2023-07-26', '2023-09-20', '2023-11-01', '2023-12-13', '2024-01-31', '2024-03-20', '2024-05-01', '2024-06-12']


[*********************100%%**********************]  1 of 1 completed
  spy_data = spy_data[spy_data.index.isin(fomc_dates)]


                  Open       Close       Date  Price_Difference
Date                                                           
2019-01-30  265.100006  267.579987 2019-01-30          2.479980
2019-03-20  282.160004  281.549988 2019-03-20         -0.610016
2019-05-01  294.720001  291.809998 2019-05-01         -2.910004
2019-06-19  292.549988  293.059998 2019-06-19          0.510010
2019-07-31  300.989990  297.429993 2019-07-31         -3.559998
2019-09-18  300.489990  301.100006 2019-09-18          0.610016
2019-10-30  303.429993  304.140015 2019-10-30          0.710022
2019-12-11  314.029999  314.420013 2019-12-11          0.390015
2020-01-29  328.380005  326.619995 2020-01-29         -1.760010
2020-04-29  291.529999  293.209991 2020-04-29          1.679993
2020-06-10  321.420013  319.000000 2020-06-10         -2.420013
2020-07-29  322.119995  325.119995 2020-07-29          3.000000
2020-09-16  341.510010  338.820007 2020-09-16         -2.690002
2020-11-05  349.239990  350.239990 2020-

In [11]:
aligned_df = pd.read_csv('fomc_spy_aligned_data.csv')

# Specify the date you want to view
date_to_view = '2019-07-31'

# Filter the DataFrame to get the row for the specific date
row_for_date = aligned_df[aligned_df['Date'] == date_to_view]

# Display the result
print(row_for_date)

         Date                                  FOMC_Minutes_Text  \
4  2019-07-31  \nThe Federal Reserve, the central bank of the...   

   SPY_Price_Difference  
4             -3.559998  


### Longer Holding periods Data Prep

In [1]:
import os
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

fomc_dir = 'fomc_minutes_html'

fomc_dates = []

for filename in os.listdir(fomc_dir):
    if filename.startswith('fomcminutes') and filename.endswith('.txt'):
        date_str = filename[len('fomcminutes'):-len('.txt')]
        # Format date string to YYYY-MM-DD
        formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
        fomc_dates.append(formatted_date)

fomc_dates = sorted(fomc_dates)
aligned_data = []

# Fetch SPY prices and calculate the price gap
for fomc_date in fomc_dates:
    fomc_datetime = datetime.strptime(fomc_date, "%Y-%m-%d")
    next_day = fomc_datetime + timedelta(days=1)
    one_month_later = next_day + timedelta(days=30)

    spy_data = yf.download('SPY', start=next_day.strftime(
        '%Y-%m-%d'), end=one_month_later.strftime('%Y-%m-%d'))

    if not spy_data.empty:
        # Extract the opening price of the next day and the closing price one month later
        opening_price_next_day = spy_data.iloc[0]['Open']
        closing_price_one_month_later = spy_data.iloc[-1]['Close']

        # Calculate the price gap
        price_gap = closing_price_one_month_later - opening_price_next_day

        # Read the text content of the FOMC minutes
        fomc_filename = f"fomcminutes{fomc_date.replace('-', '')}.txt"
        fomc_filepath = os.path.join(fomc_dir, fomc_filename)

        if os.path.exists(fomc_filepath):
            with open(fomc_filepath, 'r', encoding='utf-8') as file:
                fomc_text = file.read()

            # Append the data to the list
            aligned_data.append({
                'Date': fomc_date,
                'FOMC_Minutes_Text': fomc_text,
                'SPY_Price_Gap': price_gap,
                'Opening_Price_Next_Day': opening_price_next_day,
                'Closing_Price_One_Month_Later': closing_price_one_month_later
            })
    else:
        print(f"No SPY data found for the period following {fomc_date}")

aligned_df = pd.DataFrame(aligned_data)

print(aligned_df)

aligned_df.to_csv('longer_fomc_spy_price_gap_data.csv', index=False)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

          Date                                  FOMC_Minutes_Text  \
0   2019-01-30  \nThe Federal Reserve, the central bank of the...   
1   2019-03-20  \nThe Federal Reserve, the central bank of the...   
2   2019-05-01  \nThe Federal Reserve, the central bank of the...   
3   2019-06-19  \nThe Federal Reserve, the central bank of the...   
4   2019-07-31  \nThe Federal Reserve, the central bank of the...   
5   2019-09-18  \nThe Federal Reserve, the central bank of the...   
6   2019-10-30  \nThe Federal Reserve, the central bank of the...   
7   2019-12-11  \nThe Federal Reserve, the central bank of the...   
8   2020-01-29  \nThe Federal Reserve, the central bank of the...   
9   2020-03-15  \nThe Federal Reserve, the central bank of the...   
10  2020-04-29  \nThe Federal Reserve, the central bank of the...   
11  2020-06-10  \nThe Federal Reserve, the central bank of the...   
12  2020-07-29  \nThe Federal Reserve, the central bank of the...   
13  2020-09-16  \nThe Federal Rese




In [2]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
     ---------------------------------------- 9.5/9.5 MB 11.4 MB/s eta 0:00:00
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp310-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 17.7 MB/s eta 0:00:00
Collecting tqdm>=4.27
  Downloading tqdm-4.66.5-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.4/78.4 kB 4.3 MB/s eta 0:00:00
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl (161 kB)
     -------------------------------------- 161.8/161.8 kB 9.5 MB/s eta 0:00:00
Collecting regex!=2019.12.17
  Downloading regex-2024.7.24-cp310-cp310-win_amd64.whl (269 kB)
     ------------------------------------- 269.7/269.7 kB 16.2 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.4-cp310-none-win_amd64.whl (285 kB)
     ------------------------------------- 285.9/285.9 kB 18.4 MB/s et


[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: C:\Users\10029\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

save_directory = './saved_roberta_model'

tokenizer.save_pretrained(save_directory)

model.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Pr

Model and tokenizer saved to ./saved_roberta_model


: 