# Predicting Stock Price Movement Through Natural Language Processing for SEC Filings

By Kevin Zhou (klz23), Peter Wu (plw53), Ashley Young (acy39)



## Dependencies

First, let us import all necessary dependencies and record the installed version of each imported package in a `requirements.txt` file, so that this code can be deployed in the webapp.

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import json
from datetime import datetime
import requests
import re
import nltk
import textblob 
from textblob import TextBlob
import pickle as pkl
from IPython.display import display
from IPython.core.magic import register_line_cell_magic
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import lightgbm 
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import xgboost
from google.colab import drive

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Get actual requirements.txt for use in the webapp
import sys
#import numpy as np # imported to test whether numpy shows up, which it does!

try:
    # Private pip helper; it is no longer available in recent pip releases,
    # so fall back to the standard library when the import fails.
    from pip._internal.utils.misc import get_installed_distributions
except ImportError:
    get_installed_distributions = None


def get_imported_packages():
    """Return ``(name, version)`` pairs for installed packages that are imported.

    The names of all installed distributions (lower-cased) are intersected
    with the keys of ``sys.modules``; ``pip`` itself is always excluded.
    Only modules whose import name equals the distribution name are matched.

    Returns:
        A list of ``(module_name, version)`` tuples sorted by module name, so
        the generated requirements file is identical from run to run.
    """
    if get_installed_distributions is not None:
        installed = {
            package.key: package.version
            for package in get_installed_distributions()
        }
    else:
        # Standard-library replacement (Python 3.8+) for the pip helper.
        from importlib import metadata

        installed = {}
        for dist in metadata.distributions():
            name = dist.metadata['Name']
            if name:
                installed[name.lower()] = dist.version

    imported_modules = set(sys.modules)

    # discard() rather than remove(): pip is only present in sys.modules
    # when it has actually been imported.
    imported_modules.discard('pip')

    return sorted(
        (m, installed[m]) for m in imported_modules if installed.get(m)
    )


def generate_requirements(filepath:str, modules):
    """Write one ``name==version`` line per entry of ``modules`` to ``filepath``.

    Args:
        filepath: Destination file; it is created or overwritten.
        modules: Iterable of ``(module, version)`` pairs.
    """
    with open(filepath, 'w') as requirements_file:
        requirements_file.writelines(
            f"{name}=={pinned}\n" for name, pinned in modules
        )


# Pin the versions of the packages imported so far into requirements.txt,
# then print the file so the result is visible in the notebook output.
generate_requirements('requirements.txt', get_imported_packages())
!cat requirements.txt

# Preparing the Dataset

In [None]:
# Getting stock price changes (by percent of change), keys are stock tickers
drive.mount('drive')
# NOTE: unpickling can execute arbitrary code; only load pickles you created.
# The context manager closes the file handle once the data has been read.
with open('drive/My Drive/ai-prac/stock_prices.pkl', 'rb') as f:
    price_change = pkl.load(f)

Mounted at drive


In [None]:
# Second price-change pickle (stock_prices30.pkl), loaded the same way as
# price_change. NOTE: unpickling can execute arbitrary code; only load
# pickles you created.
with open('drive/My Drive/ai-prac/stock_prices30.pkl', 'rb') as f:
    price_change30 = pkl.load(f)

In [None]:
# CIK -> ticker symbol lookup table, read from the spreadsheet of CIKs.
# set_index('CIK') moves the CIK column into the index in a single step,
# leaving the remaining columns (e.g. the symbol) as the frame's data.
symbol_cik = pd.read_excel('drive/My Drive/ai-prac/F500_CIKs.xlsx').set_index('CIK')

## Web Scraping

EDGAR-Corpus paper: https://arxiv.org/pdf/2109.14394.pdf

In [None]:
# Scraping using edgar-crawler from the EDGAR-CORPUS paper
# Start from a clean checkout: remove any previous clone, clone the crawler,
# install its requirements, and change into the repository so that the
# following `!python ...` cells run from inside it.
!rm -rf edgar-crawler/
!git clone https://github.com/nlpaueb/edgar-crawler.git
!pip install -r edgar-crawler/requirements.txt
%cd edgar-crawler/

# Allow programmatic templates to file system
# This allows us to add all the S&P 500 tickers to the config.json
# to be extracted as 10-Ks from EDGAR.
@register_line_cell_magic
def writetemplate(line, cell):
    """Render the cell body as a template and write it to a file.

    Args:
        line: Text following the magic name; used as the output file path.
        cell: Cell body; ``{name}`` placeholders are filled in with
            ``str.format`` from the global variables.
    """
    with open(line, 'w') as template_file:
        rendered = cell.format(**globals())
        template_file.write(rendered)

In [None]:
config_json_str_init = """
{
	"edgar_crawler": {
		"start_year": 2016,
		"end_year": 2020,
		"quarters": [1, 2, 3, 4],
		"filing_types": ["10-K"],
		"user_agent": "Your name (your email)",
		"raw_filings_folder": "RAW_FILINGS",
		"indices_folder": "INDICES",
		"filings_metadata_file": "FILINGS_METADATA.csv",
		"skip_present_indices": true
	},
	"extract_items": {
		"raw_filings_folder": "RAW_FILINGS",
		"extracted_filings_folder": "EXTRACTED_FILINGS",
		"filings_metadata_file": "FILINGS_METADATA.csv",
		"items_to_extract": [
			"1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
			"8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15"
		],
		"remove_tables": true,
		"skip_extracted_filings": true
	}
}
"""
config_json = json.loads(config_json_str_init)
config_json["edgar_crawler"]["cik_tickers"] = [ticker for ticker in price_change]

config_json_str = json.dumps(config_json)

print(config_json_str)
print(config_json_str, file=open('config.json', 'w'))

!cat config.json

{"edgar_crawler": {"start_year": 2016, "end_year": 2020, "quarters": [1, 2, 3, 4], "filing_types": ["10-K"], "user_agent": "Your name (your email)", "raw_filings_folder": "RAW_FILINGS", "indices_folder": "INDICES", "filings_metadata_file": "FILINGS_METADATA.csv", "skip_present_indices": true, "cik_tickers": ["MMM", "AOS", "ABT", "ABBV", "ABMD", "ACN", "ATVI", "ADM", "ADBE", "ADP", "AAP", "AES", "AFL", "A", "APD", "AKAM", "ALK", "ALB", "ARE", "ALGN", "ALLE", "LNT", "ALL", "GOOGL", "GOOG", "MO", "AMZN", "AMCR", "AMD", "AEE", "AAL", "AEP", "AXP", "AIG", "AMT", "AWK", "AMP", "ABC", "AME", "AMGN", "APH", "ADI", "ANSS", "AON", "APA", "AAPL", "AMAT", "APTV", "ACGL", "ANET", "AJG", "AIZ", "T", "ATO", "ADSK", "AZO", "AVB", "AVY", "BKR", "BALL", "BAC", "BBWI", "BAX", "BDX", "WRB", "BRK-B", "BBY", "BIO", "TECH", "BIIB", "BLK", "BK", "BA", "BKNG", "BWA", "BXP", "BSX", "BMY", "AVGO", "BR", "BRO", "BF-B", "CHRW", "CDNS", "CZR", "CPT", "CPB", "COF", "CAH", "KMX", "CCL", "CTLT", "CAT", "CBOE", "CBRE",

In [None]:
# List the repository contents, then run the crawler script, which uses the
# config.json written above.
!ls
!python edgar_crawler.py

config.json  edgar_crawler.py  __init__.py  logger.py  README.md
datasets     extract_items.py  LICENSE	    logs       requirements.txt
Saving log to /content/edgar-crawler/logs

Downloading EDGAR Index files
2016_QTR1.tsv downloaded
2016_QTR2.tsv downloaded
2016_QTR3.tsv downloaded
2016_QTR4.tsv downloaded
2017_QTR1.tsv downloaded
2017_QTR2.tsv downloaded
2017_QTR3.tsv downloaded
2017_QTR4.tsv downloaded
2018_QTR1.tsv downloaded
2018_QTR2.tsv downloaded
2018_QTR3.tsv downloaded
2018_QTR4.tsv downloaded
2019_QTR1.tsv downloaded
2019_QTR2.tsv downloaded
2019_QTR3.tsv downloaded
2019_QTR4.tsv downloaded
2020_QTR1.tsv downloaded
2020_QTR2.tsv downloaded
2020_QTR3.tsv downloaded
2020_QTR4.tsv downloaded
NumExpr defaulting to 2 threads.

Downloading 2384 filings...

100%|███████████████████████████████████████████████████████████| 2384/2384 [34:30<00:00,  1.15it/s]

Filings metadata exported to /content/edgar-crawler/datasets/FILINGS_METADATA.csv


In [None]:
# Run the crawler's item-extraction script on the downloaded filings
!python extract_items.py

In [None]:
# Show the extracted filings (one JSON file per filing)
!ls datasets/EXTRACTED_FILINGS

1018724_10K_2018_0001018724-19-000004.json
1018724_10K_2019_0001018724-20-000004.json
320193_10K_2019_0000320193-19-000119.json
320193_10K_2020_0000320193-20-000096.json


## Data Preprocessing

In [None]:
# Lemmatizing the text
def lemmatize(text):
  """Tokenize ``text``, lower-case each token and replace it with its WordNet lemma.

  Returns the lemmas joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()
  tokens = nltk.word_tokenize(text)
  return ' '.join(lemmatizer.lemmatize(token.lower()) for token in tokens)
  

In [None]:
# Remove everything but the word stems for each word in the text
def stem(text):
  """Tokenize ``text`` and reduce every token to its Porter stem.

  Returns the stems joined by single spaces.
  """
  stemmer = PorterStemmer()
  return ' '.join(map(stemmer.stem, nltk.word_tokenize(text)))

In [None]:
# Remove Punctuation from the text
def removePunc(text):
  """Delete every character of ``text`` that is neither a word character nor whitespace."""
  non_word_or_space = re.compile(r'[^\w\s]')
  return non_word_or_space.sub('', text)

In [None]:
def removeBreaks(text):
  """Strip escaped break sequences from ``text``.

  Removes the literal two-character sequences backslash-n, backslash-t and
  backslash-r. Actual newline, tab and carriage-return characters are left
  untouched.
  """
  for escaped_break in ('\\n', '\\t', '\\r'):
    text = text.replace(escaped_break, '')
  return text

## Sentiment Analysis
FinBERT Paper: https://arxiv.org/pdf/1908.10063.pdf

In [None]:
def compute_finbert_probabilities(text, timeout=60):
  """
  Uses FinBERT via REST to compute the sentiment probabilities
  given a string of text

  Args:
    text: The text to score.
    timeout: Seconds to wait for the service before giving up, so a stalled
      request cannot hang the notebook indefinitely.

  Return format:
  {"POSITIVE": number, "NEGATIVE": number, "NEUTRAL": number}

  Raises:
    requests.HTTPError: if the service answers with an error status.
    requests.Timeout: if no answer arrives within ``timeout`` seconds.
  """
  url = "https://finbert3.p.rapidapi.com/sentiment/en"

  # SECURITY: prefer supplying the key through the RAPIDAPI_KEY environment
  # variable. The literal below is kept only so existing runs keep working;
  # a key committed to source should be rotated and removed.
  api_key = os.environ.get(
    "RAPIDAPI_KEY", "3ccadbbca0msh476e5295a20c3d1p15ea35jsn05ff8341a2d1")

  payload = {"text": text}
  headers = {
    "content-type": "application/json",
    "X-RapidAPI-Key": api_key,
    "X-RapidAPI-Host": "finbert3.p.rapidapi.com"
  }

  response = requests.post(url, json=payload, headers=headers, timeout=timeout)
  # Fail with the HTTP status instead of an opaque KeyError / JSON error below
  response.raise_for_status()

  return response.json()["sentiment_probabilities"]


In [None]:
# Useful constants 
# Folder on the mounted drive that holds the extracted filing JSON files
FILINGS_BASE_PATH = "drive/My Drive/extracted-data/json/"

# Filing items whose text is concatenated into the analysed text block
ITEMS_TO_CONCAT = ["7"]

# All other items; their "item_<n>" columns are removed before saving
ITEMS_TO_DROP = [
  "1", "1A", "1B", "2", "3", "4", "5", "6",
  "7A", "8", "9", "9A", "9B",
  "10", "11", "12", "13", "14", "15",
]

In [None]:
# Reads in text and tokenizes the text into sentences, which are then ordered by subjectivity using the TextBlob nlp library 
def most_subjective(text, max_chars=2000):
  """Return the most subjective sentences of ``text`` within a character budget.

  The text is split into sentences, each sentence is scored with TextBlob's
  subjectivity, and sentences are taken from most to least subjective while
  their combined length stays below ``max_chars``.

  Args:
    text: The text to condense.
    max_chars: Exclusive upper bound on the summed length of the selected
      sentences (the joining spaces are not counted).

  Returns:
    The selected sentences joined by single spaces, most subjective first.
  """
  sentences = nltk.sent_tokenize(text)
  # sorted() is stable, so equally subjective sentences keep document order
  ranked = sorted(
    sentences,
    key=lambda sentence: TextBlob(sentence).sentiment.subjectivity,
    reverse=True)

  selected = []
  used_chars = 0
  for sentence in ranked:
    if used_chars + len(sentence) < max_chars:
      selected.append(sentence)
      # Only sentences that are actually kept consume budget. Charging a
      # skipped, over-long leading sentence would exhaust the budget and
      # leave the result empty.
      used_chars += len(sentence)
    elif selected:
      # Budget reached after at least one sentence was kept
      break
  return ' '.join(selected)

In [None]:

drive.mount('drive')

# Build main_df with one row per 10-K filing found under FILINGS_BASE_PATH:
# the condensed item text, the stock change of the year after the filing,
# and the FinBERT sentiment probabilities.

# main_df is read inside the loop before it is ever assigned, so start from an
# empty frame. If an earlier run already built main_df, keep it so filings
# that were already scored are skipped rather than sent to the API again.
if 'main_df' not in globals():
  main_df = pd.DataFrame(columns=['cik', 'filing_date'])

count = 0
# iterate through 10-K filings
for json_file in os.listdir(FILINGS_BASE_PATH):
  absolute_path = FILINGS_BASE_PATH + json_file
  # Load json file into a python dictionary
  with open(absolute_path) as f:
    data = json.load(f)

  # Create a pandas dataframe from the dictionary
  df = pd.DataFrame.from_dict(data, orient='index')

  # Transpose the dataframe so that the json data is in one row
  df = df.transpose()

  # Skip filings that are already present in main_df
  already_processed = ((main_df['cik'] == df['cik'][0])
                       & (main_df['filing_date'] == df['filing_date'][0]))
  if already_processed.any():
    continue

  # drop unneeded columns
  df.drop(columns=['company', 'filing_type', 'period_of_report', 'sic',
                   'state_of_inc', 'state_location', 'fiscal_year_end',
                   'filing_html_index', 'htm_filing_link',
                   'complete_text_filing_link', 'filename'],
          inplace=True)

  # Concatenate the MD&A sections into a plain string. Working on a str
  # (not on the one-element Series) lets removeBreaks do substring
  # replacement and avoids chained assignment into the frame.
  text = ''
  for item_name in ITEMS_TO_CONCAT:
    col_name = "item_" + item_name

    if col_name in df:
      text = removeBreaks(text + df[col_name][0])
      df.drop(columns=[col_name], inplace=True)

  # process the MD&A text block
  text = most_subjective(text)
  text = lemmatize(text)
  text = stem(text)
  text = removePunc(text)
  df['text'] = text

  dt = datetime.strptime(df['filing_date'][0], '%Y-%m-%d')

  next_year = dt.year + 1
  cik = int(df['cik'][0])

  ticker = symbol_cik.loc[cik]['Symbol']
  if (isinstance(ticker, pd.Series)):
    # The CIK maps to more than one symbol; skip the ambiguous filing
    continue
  ticker = ticker.replace(".", "-")
  stock_change = price_change[ticker][next_year]
  df['stock_change'] = stock_change

  # Send the text itself: str(df['text']) would be the Series repr (index,
  # dtype footer and possibly truncated), not the filing text.
  sentiment_scores = compute_finbert_probabilities(text)
  df['positivity'] = sentiment_scores['POSITIVE']
  df['neutral'] = sentiment_scores['NEUTRAL']
  df['negativity'] = sentiment_scores['NEGATIVE']
  main_df = pd.concat([df, main_df])
  print("COUNT " + str(count))
  count += 1
  display(df)



In [None]:
# Saving sentiment scores to drive as a CSV
# Keep a single row per (cik, filing_date) pair and renumber the index
main_df = main_df.drop_duplicates(
  subset = ['cik', 'filing_date'],
  keep = 'last').reset_index(drop = True)
# Remove the remaining item_<n> text columns; errors='ignore' tolerates
# filings that never had a given item
for drop_item in ITEMS_TO_DROP:
    main_df.drop(columns=["item_" + drop_item],
          inplace=True, errors='ignore')
drive.mount('drive')
# Write the CSV locally first, then copy it into the drive folder
main_df.to_csv('filings_sentiments.csv')
!cp filings_sentiments.csv "drive/My Drive/ai-prac"

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


## Regression Modeling


In [None]:
# Load the dataset of filings together with their sentiment scores
df = pd.read_csv('drive/My Drive/ai-prac/filings_sentiments_stock.csv')
# Net positivity: positive probability minus negative probability
df['net_positivity'] = df['positivity'].sub(df['negativity'])

In [None]:
# Read in the stock price change over 30 days; its 'Stock Change' column
# is copied into df further down
price_change_filing = pd.read_csv('drive/My Drive/ai-prac/stock_prices_filing.csv')

In [None]:
# Read in the stock price change over 7 days; its 'Stock Change' column
# is copied into df further down
price_change_filing7 = pd.read_csv('drive/My Drive/ai-prac/stock_prices_filing_7.csv')

In [None]:
# Read in the stock price change over 15 days; its 'Stock Change' column
# is copied into df further down
price_change_filing15 = pd.read_csv('drive/My Drive/ai-prac/stock_prices_filing_15.csv')

In [None]:
# Attach the 30-, 7- and 15-day stock price changes to the data set.
# Series assignment aligns on the row index, so every CSV is expected to
# share df's row numbering.
stock_change_sources = (
    ('stock_change_filing', price_change_filing),
    ('stock_change_filing7', price_change_filing7),
    ('stock_change_filing15', price_change_filing15),
)
for target_column, price_frame in stock_change_sources:
    df[target_column] = price_frame['Stock Change']

In [None]:
# Setting up train and test splits
# Features: the three sentiment probabilities.
# Target: the stock change over the 30 days after the filing.
X = df[['positivity', 'neutral', 'negativity']]
Y = df['stock_change_filing']

# train test split (80% train / 20% test); the fixed random_state makes the
# split, and therefore every score and pickled model below, reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0)

In [None]:
# Linear Regression: fit on the training split and report R^2 on the test split
LR = LinearRegression()
LR.fit(x_train, y_train)
predictions = LR.predict(x_test)
print(r2_score(y_test, predictions))

# Persist the fitted model; the context manager flushes and closes the file
with open('drive/My Drive/ai-prac/LR.pkl', 'wb') as model_file:
    pkl.dump(LR, model_file)

In [None]:
# Light GBM: fit on the training split and report R^2 on the test split

lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
# (the discarded lgbm.score(...) call was dropped; r2_score below reports it)
predictions = lgbm.predict(x_test)
print(r2_score(y_test, predictions))

# Persist the fitted model; the context manager flushes and closes the file
with open('drive/My Drive/ai-prac/lgbm.pkl', 'wb') as model_file:
    pkl.dump(lgbm, model_file)

In [None]:
# Light GBM Graphing
# This is the first plotting cell, so matplotlib must be imported here;
# otherwise `plt` is undefined when the notebook runs top to bottom.
import matplotlib.pyplot as plt

predictions = lgbm.predict(x_test)
# Both axes are scaled by 100 to show percentages
plt.ylabel('Actual Stock Change Percent')
plt.title('Actual vs Predicted Values')
plt.xlabel('Predicted Stock Change Percent')
plt.scatter(100 * predictions, 100 * y_test)

In [None]:
# Positivity score of each test filing against the predicted change in percent
plt.title('Positivity Score vs Predicted Stock Change Percent')
plt.xlabel('Positivity Score')
plt.ylabel('Predicted Stock Change Percent')
plt.scatter(x_test['positivity'], predictions * 100)

In [None]:
# LR Graphing
import matplotlib.pyplot as plt

# Predicted (x) against actual (y) stock change, both scaled to percent
predictions = LR.predict(x_test)
plt.title('Actual vs Predicted Values')
plt.xlabel('Predicted Stock Change Percent')
plt.ylabel('Actual Stock Change Percent')
plt.scatter(100 * predictions, 100 * y_test)

In [None]:
# Positivity score of each test filing against the predicted change in percent
plt.title('Positivity Score vs Predicted Stock Change Percent')
plt.xlabel('Positivity Score')
plt.ylabel('Predicted Stock Change Percent')
plt.scatter(x_test['positivity'], predictions * 100)

In [None]:
# Random Forest: fit on the training split and report R^2 on the test split
RF = RandomForestRegressor(n_estimators = 100, random_state = 0)
RF.fit(x_train, y_train)
predictions = RF.predict(x_test)
print(r2_score(y_test, predictions))

# Persist the fitted model; the context manager flushes and closes the file
with open('drive/My Drive/ai-prac/RF.pkl', 'wb') as model_file:
    pkl.dump(RF, model_file)

In [None]:
#RF Graphing
# Predicted (x) against actual (y) stock change, both scaled to percent
predictions = RF.predict(x_test)
plt.title('Actual vs Predicted Values')
plt.xlabel('Predicted Stock Change Percent')
plt.ylabel('Actual Stock Change Percent')
plt.scatter(100 * predictions, 100 * y_test)

In [None]:
# Positivity score of each test filing against the predicted change in percent
plt.title('Positivity Score vs Predicted Stock Change Percent')
plt.xlabel('Positivity Score')
plt.ylabel('Predicted Stock Change Percent')
plt.scatter(x_test['positivity'], predictions * 100)

In [None]:
# XGBoost: fit on the training split and report R^2 on the test split
import xgboost
xgb = xgboost.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 100, seed = 123)
xgb.fit(x_train, y_train)
# (the discarded xgb.score(...) call was dropped; r2_score below reports it)
predictions = xgb.predict(x_test)
print(r2_score(y_test, predictions))
# Persist the fitted model; the context manager flushes and closes the file
with open('drive/My Drive/ai-prac/xgb.pkl', 'wb') as model_file:
    pkl.dump(xgb, model_file)

In [None]:
#XGB Graphing
# Predicted (x) against actual (y) stock change, both scaled to percent
predictions = xgb.predict(x_test)
plt.title('Actual vs Predicted Values')
plt.xlabel('Predicted Stock Change Percent')
plt.ylabel('Actual Stock Change Percent')
plt.scatter(100 * predictions, 100 * y_test)

In [None]:
# Positivity score of each test filing against the predicted change in percent
plt.title('Positivity Score vs Predicted Stock Change Percent')
plt.xlabel('Positivity Score')
plt.ylabel('Predicted Stock Change Percent')
plt.scatter(x_test['positivity'], predictions * 100)

In [None]:
# Mean of each sentiment score over the whole data set, printed in the
# order positivity, negativity, neutral
for score_column in ('positivity', 'negativity', 'neutral'):
    print(df[score_column].mean())