# **Data Collection**

## *Pull News about each Stock*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Define the start and end dates
#start_date = datetime(2021, 1, 1)
#end_date = datetime(2024, 4, 10)

with open('news_data.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Stock Name','Date','Time','News Headline'])

stocks = ['TSLA','AMD','NVDA','MU','PLTR','NIO','AAPL','MARA','WBA','CLSK','F','CCL','SOFI','T','BAC','GOOGL','VALE','GOLD','INTC','AMZN','WBD','GOOG','PFE','SQQQ','SOXS','MDIA','NKLA','HUBC','SPCB','CADL','TQQQ','SQY','NURE','REZ','PICK','BLCN','SLX','LIT','FBZ','BLOK','FEMS','TUR','INDS','KCE','EWD','SPHB','SLYV','RNSC','PSR','FNDE','NFTY']
finviz_url = 'https://finviz.com/quote.ashx?t='

for stock in stocks:
    final_url = finviz_url + stock
    request = Request(url=final_url, headers={'user-agent': 'app'}) 
    response = urlopen(request)    
    html = BeautifulSoup(response, features="lxml")
    news_table = html.find('table',id='news-table')
    news_table_row = news_table.find_all('tr')
    for news in news_table_row:
        news_headline = news.a.get_text() 
        date_scrape = news.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
            # Replace "Today" with today's date
            news_date = today_date
        else:
            date = date_scrape[0]
            time = date_scrape[1]
            # Replace "Today" with today's date
            if date.lower() == "today":
                news_date = today_date
            else:
                # Convert the scraped date to a datetime object
                news_date = datetime.strptime(date, '%b-%d-%y').date()  # Extracting only the date portion
            # Check if the news date falls within the specified range
            
            with open('news_data.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([stock, news_date, time, news_headline])


## *Annotate sentiments of news headlines with a pretrained model*

In [2]:
df_news = pd.read_csv('news_data.csv',encoding='ISO-8859-1')

df_news

Unnamed: 0,Stock Name,Date,Time,News Headline
0,TSLA,2024-04-08,05:05PM,Get ready for Tesla's massive Supercharger site
1,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...
2,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...
3,TSLA,2024-04-05,09:57PM,Elon Musk Says Tesla Will Unveil Robotaxi in A...
4,AMD,2024-04-08,02:24PM,If You Can Only Buy One Machine Learning Stock...
...,...,...,...,...
2408,FNDE,2013-10-03,11:38AM,A Bright Future for Fundamental Indexing
2409,FNDE,2013-08-19,10:34AM,Schwab Enters Smart-Beta ETF Space
2410,FNDE,2013-08-15,07:00AM,"Schwab Adds 6 Fundamental ETFs: FNDB, FNDX, FN..."
2411,NFTY,2024-03-29,09:41AM,10 Best India ETFs For 2024


In [3]:
df_news.set_index('Date', inplace=True)
df_news

Unnamed: 0_level_0,Stock Name,Time,News Headline
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-04-08,TSLA,05:05PM,Get ready for Tesla's massive Supercharger site
2024-04-07,TSLA,10:08PM,Tesla wants Apple's help proving a driver was ...
2024-04-06,TSLA,08:42PM,Wall Street Analysts Just Trimmed Price Target...
2024-04-05,TSLA,09:57PM,Elon Musk Says Tesla Will Unveil Robotaxi in A...
2024-04-08,AMD,02:24PM,If You Can Only Buy One Machine Learning Stock...
...,...,...,...
2013-10-03,FNDE,11:38AM,A Bright Future for Fundamental Indexing
2013-08-19,FNDE,10:34AM,Schwab Enters Smart-Beta ETF Space
2013-08-15,FNDE,07:00AM,"Schwab Adds 6 Fundamental ETFs: FNDB, FNDX, FN..."
2024-03-29,NFTY,09:41AM,10 Best India ETFs For 2024


In [4]:
df_news = df_news.sort_index()
df_news

Unnamed: 0_level_0,Stock Name,Time,News Headline
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-04-07,PSR,10:22AM,Invesco PowerShares Lists Industry's First Sui...
2010-05-03,PSR,02:27PM,How Big Was Housing Price Cycle?
2010-06-03,PSR,11:27AM,Invesco PowerShares Lists International Corpor...
2010-06-21,PSR,03:21PM,Accelerating Jumbo Mortgage Delinquencies Will...
2010-06-23,PSR,03:35AM,Discouraging Housing Data: Does It Really Matter?
...,...,...,...
2024-04-08,GOOGL,04:19PM,What To Expect From Google's Cloud Next Confer...
2024-04-08,T,07:50AM,Is Verizon A Buy As 5G Network Build-Out Gains...
2024-04-08,MU,04:25PM,"Semiconductor Stocks Arm, TSMC Advance On Posi..."
2024-04-08,AAPL,01:30PM,Could Apple Help You Retire a Millionaire?


In [5]:
grouped = df_news.groupby(['Date', 'Stock Name'])['News Headline'].apply(list).reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_news, grouped, on=['Date', 'Stock Name'], suffixes=(',', '_grouped'))
merged_df

Unnamed: 0,Date,Stock Name,Time,"News Headline,",News Headline_grouped
0,2010-04-07,PSR,10:22AM,Invesco PowerShares Lists Industry's First Sui...,[Invesco PowerShares Lists Industry's First Su...
1,2010-05-03,PSR,02:27PM,How Big Was Housing Price Cycle?,[How Big Was Housing Price Cycle?]
2,2010-06-03,PSR,11:27AM,Invesco PowerShares Lists International Corpor...,[Invesco PowerShares Lists International Corpo...
3,2010-06-21,PSR,03:21PM,Accelerating Jumbo Mortgage Delinquencies Will...,[Accelerating Jumbo Mortgage Delinquencies Wil...
4,2010-06-23,PSR,03:35AM,Discouraging Housing Data: Does It Really Matter?,[Discouraging Housing Data: Does It Really Mat...
...,...,...,...,...,...
2408,2024-04-08,GOOGL,04:19PM,What To Expect From Google's Cloud Next Confer...,[What To Expect From Google's Cloud Next Confe...
2409,2024-04-08,T,07:50AM,Is Verizon A Buy As 5G Network Build-Out Gains...,[Is Verizon A Buy As 5G Network Build-Out Gain...
2410,2024-04-08,MU,04:25PM,"Semiconductor Stocks Arm, TSMC Advance On Posi...","[Semiconductor Stocks Arm, TSMC Advance On Pos..."
2411,2024-04-08,AAPL,01:30PM,Could Apple Help You Retire a Millionaire?,[Could Apple Help You Retire a Millionaire?]


In [6]:
merged_df.drop(columns=['Time','News Headline,'], inplace=True)

In [7]:
merged_df.head(20).sort_values(by='Date')

Unnamed: 0,Date,Stock Name,News Headline_grouped
0,2010-04-07,PSR,[Invesco PowerShares Lists Industry's First Su...
1,2010-05-03,PSR,[How Big Was Housing Price Cycle?]
2,2010-06-03,PSR,[Invesco PowerShares Lists International Corpo...
3,2010-06-21,PSR,[Accelerating Jumbo Mortgage Delinquencies Wil...
4,2010-06-23,PSR,[Discouraging Housing Data: Does It Really Mat...
5,2010-08-09,PSR,[FHA Insured Mortgages: A Disaster in the Maki...
6,2010-08-25,PSR,[An Active ETF for a Real Estate Minefield?]
7,2010-11-08,PSR,[Seven Successful Active ETFs]
8,2010-12-02,PSR,[Invesco PowerShares Lists Four Financial Sect...
9,2010-12-03,PSR,[Invesco PowerShares Provides Capital Gains In...


In [8]:
# Convert 'Date' column to datetime type
merged_df['Date'] =  pd.to_datetime(merged_df['Date']).dt.date



In [9]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

merged_df['Sentiment'] = merged_df['News Headline_grouped'].apply(lambda x: nlp(x)[0]['label'])

merged_df

Unnamed: 0,Date,Stock Name,News Headline_grouped,Sentiment
0,2010-04-07,PSR,[Invesco PowerShares Lists Industry's First Su...,Neutral
1,2010-05-03,PSR,[How Big Was Housing Price Cycle?],Neutral
2,2010-06-03,PSR,[Invesco PowerShares Lists International Corpo...,Neutral
3,2010-06-21,PSR,[Accelerating Jumbo Mortgage Delinquencies Wil...,Negative
4,2010-06-23,PSR,[Discouraging Housing Data: Does It Really Mat...,Neutral
...,...,...,...,...
2408,2024-04-08,GOOGL,[What To Expect From Google's Cloud Next Confe...,Neutral
2409,2024-04-08,T,[Is Verizon A Buy As 5G Network Build-Out Gain...,Positive
2410,2024-04-08,MU,"[Semiconductor Stocks Arm, TSMC Advance On Pos...",Positive
2411,2024-04-08,AAPL,[Could Apple Help You Retire a Millionaire?],Neutral


In [10]:
merged_df['Sentiment'].value_counts()

Sentiment
Neutral     1454
Positive     627
Negative     332
Name: count, dtype: int64

* I am doing right join to keep more data textual data.

### **Preproessing**

In [239]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Tokenization
merged_df['News Headline'] = df['News Headline'].apply(lambda x: word_tokenize(x))

# Stop-word removal
stop_words = set(stopwords.words('english'))
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming
porter = PorterStemmer()
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [porter.stem(word) for word in x])

# Lemmatization
lemmatizer = WordNetLemmatizer()
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Lowercasing
# Lowercasing is usually done before other preprocessing steps, but we can do it here as well
# Convert all words to lowercase
# df['News Headline'] = df['News Headline'].apply(lambda x: [word.lower() for word in x])


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Ticker,News Headline_grouped,Sentiment,lower_case,punc
0,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral,[rise of online ordering will continue after c...,[]
1,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral,[influencers with andy serwer: bruce greenwald],[]
2,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral,[florida gov. desantis will lose in supreme co...,[]
3,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral,[influencers with andy serwer: alphonso david],[]
4,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral,[hrc's alphonso david has a message for those ...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral,[missed out on the bull market recovery? 3 etf...,[]
426,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive,[ivv's $3.4b tops inflows; spy leads outflows ...,[]
427,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral,[heres whats happening in markets today: april...,[]
428,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral,[why the fed is wading into uncharted waters: ...,[]


In [None]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer


# Bag-of-Words representation
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df['News Headline'].apply(lambda x: ' '.join(x)))

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['News Headline'].apply(lambda x: ' '.join(x)))

# Word2Vec embedding
word2vec_model = Word2Vec(sentences=df['News Headline'], vector_size=100, window=5, min_count=1, workers=4)

# BERT embedding
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_embeddings = bert_model.encode(df['News Headline'].apply(lambda x: ' '.join(x)))

print('Feature extraction techniques completed.')