# **Data Collection**

## *Pull News about each Stock*

In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Define the start and end dates
#start_date = datetime(2021, 1, 1)
#end_date = datetime(2024, 4, 10)

with open('news_data.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Stock Name','Date','Time','News Headline'])

stocks = ['sqqq','soxs','tsla','mdia','nkla','hubc','spcb','cadl','tqqq','spy']
finviz_url = 'https://finviz.com/quote.ashx?t='

for stock in stocks:
    final_url = finviz_url + stock
    request = Request(url=final_url, headers={'user-agent': 'app'}) 
    response = urlopen(request)    
    html = BeautifulSoup(response, features="lxml")
    news_table = html.find('table',id='news-table')
    news_table_row = news_table.find_all('tr')
    for news in news_table_row:
        news_headline = news.a.get_text() 
        date_scrape = news.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
            # Replace "Today" with today's date
            news_date = today_date
        else:
            date = date_scrape[0]
            time = date_scrape[1]
            # Replace "Today" with today's date
            if date.lower() == "today":
                news_date = today_date
            else:
                # Convert the scraped date to a datetime object
                news_date = datetime.strptime(date, '%b-%d-%y').date()  # Extracting only the date portion
            # Check if the news date falls within the specified range
            
            with open('news_data.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([stock, news_date, time, news_headline])


## *Pull Stock values of each company*

In [219]:
from datetime import date

stocks =['sqqq','soxs','tsla','mdia','nkla','hubc','spcb','cadl','tqqq','spy']
hists = {}

# Specify the start and end dates
start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 4, 10) 

for s in stocks:
    tkr = yf.Ticker(s)
    history = tkr.history(start=start_date, end=end_date)
    # Convert the 'Date' column to only contain the date portion
    history.index = history.index.date
    hists[s] = history

# Concatenate all data frames in hists dictionary into a single data frame
df = pd.concat(hists.values(), keys=hists.keys())

# Reset index to have separate columns for the ticker symbol and the date
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'Ticker'}, inplace=True)
df.rename(columns={'level_1': 'Date'}, inplace=True)

df.to_csv('stock_data.csv', index=False)


## *Annotate sentiments of news headlines with a pretrained model*

In [220]:
df_news = pd.read_csv('news_data.csv',encoding='ISO-8859-1')

df_news

Unnamed: 0,Stock Name,Date,Time,News Headline
0,sqqq,2023-08-05,09:08AM,The stock market's next challenge: August dold...
1,sqqq,2023-03-01,09:00AM,How to chart anchored VWAP multiple time frame...
2,sqqq,2023-02-09,02:45PM,SQQQ Inflows Soar as Investors Bet Against Tech
3,sqqq,2022-11-16,04:17PM,ETFs: Investors looking to bet for or against ...
4,sqqq,2022-09-19,11:36AM,Most Traded Leveraged ETFs for Q4 2022
...,...,...,...,...
616,spy,2023-12-29,06:30AM,Want to Get Rich? $500 a Month in This Fund Co...
617,spy,2023-12-28,04:03PM,"Money market fund cash inflows surge, recessio..."
618,spy,2023-12-26,09:00AM,SPY: The First ETF Hitting Half a Trillion in ...
619,spy,2023-12-24,07:25AM,"Got $1,000 to Invest in Stocks? Put It in This..."


## *Merge the News & Stock data*

In [221]:
df_news.set_index('Date', inplace=True)
df_news

Unnamed: 0_level_0,Stock Name,Time,News Headline
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-08-05,sqqq,09:08AM,The stock market's next challenge: August dold...
2023-03-01,sqqq,09:00AM,How to chart anchored VWAP multiple time frame...
2023-02-09,sqqq,02:45PM,SQQQ Inflows Soar as Investors Bet Against Tech
2022-11-16,sqqq,04:17PM,ETFs: Investors looking to bet for or against ...
2022-09-19,sqqq,11:36AM,Most Traded Leveraged ETFs for Q4 2022
...,...,...,...
2023-12-29,spy,06:30AM,Want to Get Rich? $500 a Month in This Fund Co...
2023-12-28,spy,04:03PM,"Money market fund cash inflows surge, recessio..."
2023-12-26,spy,09:00AM,SPY: The First ETF Hitting Half a Trillion in ...
2023-12-24,spy,07:25AM,"Got $1,000 to Invest in Stocks? Put It in This..."


In [222]:
df_news = df_news.sort_index()
df_news

Unnamed: 0_level_0,Stock Name,Time,News Headline
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-10-22,soxs,05:51PM,Leading Chip Sector Hitting Resistance
2015-11-09,soxs,02:58PM,Rate Hike Odds Shine A Light On Leveraged Gold...
2015-11-10,soxs,06:28AM,Digging Deep With Leveraged Gold Miner ETFs As...
2015-11-18,soxs,03:50PM,Gold Miners ETFs Are Getting Smoked
2015-11-21,soxs,02:02PM,Russia Rally Reveals Love For Leverage
...,...,...,...
2024-04-06,nkla,06:45AM,Massive News for Nikola Stock Investors
2024-04-06,tsla,08:42PM,Wall Street Analysts Just Trimmed Price Target...
2024-04-06,spy,05:40AM,This ETF is beating the S&P 500 and its compl...
2024-04-07,tsla,10:08PM,Tesla wants Apple's help proving a driver was ...


In [223]:
grouped = df_news.groupby(['Date', 'Stock Name'])['News Headline'].apply(list).reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_news, grouped, on=['Date', 'Stock Name'], suffixes=(',', '_grouped'))
merged_df

Unnamed: 0,Date,Stock Name,Time,"News Headline,",News Headline_grouped
0,2015-10-22,soxs,05:51PM,Leading Chip Sector Hitting Resistance,[Leading Chip Sector Hitting Resistance]
1,2015-11-09,soxs,02:58PM,Rate Hike Odds Shine A Light On Leveraged Gold...,[Rate Hike Odds Shine A Light On Leveraged Gol...
2,2015-11-10,soxs,06:28AM,Digging Deep With Leveraged Gold Miner ETFs As...,[Digging Deep With Leveraged Gold Miner ETFs A...
3,2015-11-18,soxs,03:50PM,Gold Miners ETFs Are Getting Smoked,[Gold Miners ETFs Are Getting Smoked]
4,2015-11-21,soxs,02:02PM,Russia Rally Reveals Love For Leverage,[Russia Rally Reveals Love For Leverage]
...,...,...,...,...,...
616,2024-04-06,nkla,06:45AM,Massive News for Nikola Stock Investors,[Massive News for Nikola Stock Investors]
617,2024-04-06,tsla,08:42PM,Wall Street Analysts Just Trimmed Price Target...,[Wall Street Analysts Just Trimmed Price Targe...
618,2024-04-06,spy,05:40AM,This ETF is beating the S&P 500 and its compl...,[This ETF is beating the S&P 500 and its comp...
619,2024-04-07,tsla,10:08PM,Tesla wants Apple's help proving a driver was ...,[Tesla wants Apple's help proving a driver was...


In [224]:
merged_df.drop(columns=['Time','News Headline,'], inplace=True)

In [225]:
merged_df.head(20).sort_values(by='Date')

Unnamed: 0,Date,Stock Name,News Headline_grouped
0,2015-10-22,soxs,[Leading Chip Sector Hitting Resistance]
1,2015-11-09,soxs,[Rate Hike Odds Shine A Light On Leveraged Gol...
2,2015-11-10,soxs,[Digging Deep With Leveraged Gold Miner ETFs A...
3,2015-11-18,soxs,[Gold Miners ETFs Are Getting Smoked]
4,2015-11-21,soxs,[Russia Rally Reveals Love For Leverage]
5,2015-11-23,soxs,[The One Retail ETF That Isn't Disappointing]
6,2015-11-30,soxs,[Semis Breaking Through Resistance?]
7,2015-12-03,soxs,[A New Bearish Biotech ETF And A Lazarus Act F...
8,2015-12-11,soxs,[Using Leveraged Energy ETFs As Rising Rates P...
9,2015-12-22,soxs,[Russia ETFs On The Rocks...Except For One]


In [226]:
# Convert 'Date' column to datetime type
merged_df['Date'] =  pd.to_datetime(merged_df['Date']).dt.date



In [227]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

merged_df['Sentiment'] = merged_df['News Headline_grouped'].apply(lambda x: nlp(x)[0]['label'])

merged_df

Unnamed: 0,Date,Stock Name,News Headline_grouped,Sentiment
0,2015-10-22,soxs,[Leading Chip Sector Hitting Resistance],Positive
1,2015-11-09,soxs,[Rate Hike Odds Shine A Light On Leveraged Gol...,Positive
2,2015-11-10,soxs,[Digging Deep With Leveraged Gold Miner ETFs A...,Negative
3,2015-11-18,soxs,[Gold Miners ETFs Are Getting Smoked],Negative
4,2015-11-21,soxs,[Russia Rally Reveals Love For Leverage],Positive
...,...,...,...,...
616,2024-04-06,nkla,[Massive News for Nikola Stock Investors],Neutral
617,2024-04-06,tsla,[Wall Street Analysts Just Trimmed Price Targe...,Neutral
618,2024-04-06,spy,[This ETF is beating the S&P 500 and its comp...,Neutral
619,2024-04-07,tsla,[Tesla wants Apple's help proving a driver was...,Neutral


In [228]:
merged_df['Sentiment'].value_counts()

Sentiment
Neutral     427
Positive    134
Negative     60
Name: count, dtype: int64

In [229]:
df_stock = pd.read_csv('stock_data.csv')

df_stock['Date'] = pd.to_datetime(df_stock['Date']).dt.date

In [230]:
df_stock.head(20).sort_values(by='Date')

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
0,sqqq,2021-01-04,69.503035,76.221662,69.410366,73.395203,17903060,0.0,0.0,0.0
1,sqqq,2021-01-05,74.136564,74.136564,71.49545,71.58812,10119320,0.0,0.0,0.0
2,sqqq,2021-01-06,74.877942,75.665642,71.541791,74.553589,20570880,0.0,0.0,0.0
3,sqqq,2021-01-07,72.561167,72.561167,68.529986,68.99334,12022620,0.0,0.0,0.0
4,sqqq,2021-01-08,67.603284,69.086022,66.213224,66.444901,17438240,0.0,0.0,0.0
5,sqqq,2021-01-11,68.344643,69.734703,67.556943,69.317688,13487820,0.0,0.0,0.0
6,sqqq,2021-01-12,69.271357,71.449117,68.529989,69.595703,11161520,0.0,0.0,0.0
7,sqqq,2021-01-13,69.503039,69.873723,67.695963,68.298317,8393100,0.0,0.0,0.0
8,sqqq,2021-01-14,67.927627,69.642034,67.278928,69.317688,7090560,0.0,0.0,0.0
9,sqqq,2021-01-15,69.549364,71.819794,68.807996,70.985756,12317060,0.0,0.0,0.0


* I am doing right join to keep more data textual data.

In [231]:
# Merge srock sets on date & stock name

data = df_stock.merge(merged_df, how='inner', left_on=['Ticker','Date'], right_on=['Stock Name','Date'])
data

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Stock Name,News Headline_grouped,Sentiment
0,sqqq,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral
1,sqqq,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral
2,sqqq,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral
3,sqqq,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral
4,sqqq,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,spy,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral
426,spy,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive
427,spy,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral
428,spy,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral


In [232]:
# make ticker and stock name the same

data.drop(columns=['Ticker'], inplace=True)

data.rename(columns={'Stock Name': 'Ticker'}, inplace=True)

data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Ticker,News Headline_grouped,Sentiment
0,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral
1,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral
2,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral
3,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral
4,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
425,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral
426,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive
427,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral
428,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral


### **Preproessing**

In [239]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

data['lower_case'] = data['News Headline_grouped'].apply(lambda x: [word.lower() for word in x])

# Removing Punctuation
data['punc'] = data['lower_case'].apply(lambda x: [word for word in x if word.isalnum()])


# Print the processed DataFrame
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Ticker,News Headline_grouped,Sentiment,lower_case,punc
0,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral,[rise of online ordering will continue after c...,[]
1,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral,[influencers with andy serwer: bruce greenwald],[]
2,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral,[florida gov. desantis will lose in supreme co...,[]
3,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral,[influencers with andy serwer: alphonso david],[]
4,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral,[hrc's alphonso david has a message for those ...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral,[missed out on the bull market recovery? 3 etf...,[]
426,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive,[ivv's $3.4b tops inflows; spy leads outflows ...,[]
427,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral,[heres whats happening in markets today: april...,[]
428,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral,[why the fed is wading into uncharted waters: ...,[]


In [242]:
# Removing Stopwords
stop_words = set(stopwords.words('english'))
data['stopwords'] = data['lower_case'].apply(lambda x: [word for word in x if word not in stop_words])


# Stemming
stemmer = PorterStemmer()
data['stemmed_data'] = data['stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

# Lemmatization
lemmatizer = WordNetLemmatizer()
data['lemmatized_data'] = data['stemmed_data'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


data


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Ticker,News Headline_grouped,Sentiment,lower_case,punc,stopwords,stemmed_data,lemmatized_data
0,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral,[rise of online ordering will continue after c...,[],[rise of online ordering will continue after c...,[rise of online ordering will continue after c...,[rise of online ordering will continue after c...
1,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral,[influencers with andy serwer: bruce greenwald],[],[influencers with andy serwer: bruce greenwald],[influencers with andy serwer: bruce greenwald],[influencers with andy serwer: bruce greenwald]
2,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral,[florida gov. desantis will lose in supreme co...,[],[florida gov. desantis will lose in supreme co...,[florida gov. desantis will lose in supreme co...,[florida gov. desantis will lose in supreme co...
3,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral,[influencers with andy serwer: alphonso david],[],[influencers with andy serwer: alphonso david],[influencers with andy serwer: alphonso david],[influencers with andy serwer: alphonso david]
4,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral,[hrc's alphonso david has a message for those ...,[],[hrc's alphonso david has a message for those ...,[hrc's alphonso david has a message for those ...,[hrc's alphonso david has a message for those ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral,[missed out on the bull market recovery? 3 etf...,[],[missed out on the bull market recovery? 3 etf...,[missed out on the bull market recovery? 3 etf...,[missed out on the bull market recovery? 3 etf...
426,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive,[ivv's $3.4b tops inflows; spy leads outflows ...,[],[ivv's $3.4b tops inflows; spy leads outflows ...,[ivv's $3.4b tops inflows; spy leads outflows ...,[ivv's $3.4b tops inflows; spy leads outflows ...
427,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral,[heres whats happening in markets today: april...,[],[heres whats happening in markets today: april...,[heres whats happening in markets today: april...,[heres whats happening in markets today: april...
428,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral,[why the fed is wading into uncharted waters: ...,[],[why the fed is wading into uncharted waters: ...,[why the fed is wading into uncharted waters: ...,[why the fed is wading into uncharted waters: ...
