# **Data Collection**

## *Pull News about each Stock*

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Define the start and end dates
#start_date = datetime(2021, 1, 1)
#end_date = datetime(2024, 4, 10)

with open('news_data.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Stock Name','Date','Time','News Headline'])

stocks = ['TSLA','AMD','NVDA','MU','PLTR','NIO','AAPL','MARA','WBA','CLSK','F','CCL','SOFI','T','BAC','GOOGL','VALE','GOLD','INTC','AMZN']
finviz_url = 'https://finviz.com/quote.ashx?t='

for stock in stocks:
    final_url = finviz_url + stock
    request = Request(url=final_url, headers={'user-agent': 'app'}) 
    response = urlopen(request)    
    html = BeautifulSoup(response, features="lxml")
    news_table = html.find('table',id='news-table')
    news_table_row = news_table.find_all('tr')
    for news in news_table_row:
        news_headline = news.a.get_text() 
        date_scrape = news.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
            # Replace "Today" with today's date
            news_date = today_date
        else:
            date = date_scrape[0]
            time = date_scrape[1]
            # Replace "Today" with today's date
            if date.lower() == "today":
                news_date = today_date
            else:
                # Convert the scraped date to a datetime object
                news_date = datetime.strptime(date, '%b-%d-%y').date()  # Extracting only the date portion
            # Check if the news date falls within the specified range
            
            with open('news_data.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([stock, news_date, time, news_headline])


## *Annotate sentiments of news headlines with a pretrained model*

In [20]:
df_news = pd.read_csv('news_data.csv',encoding='ISO-8859-1')

df_news

Unnamed: 0,Stock Name,Date,Time,News Headline
0,TSLA,2024-04-09,08:32AM,Elon Musk is trying highlight the value that r...
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...
4,AMD,2024-04-09,07:15AM,3 Stocks That Can Help You Get Richer in 2024
...,...,...,...,...
561,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...
562,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now
563,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney
564,AMZN,2024-04-05,07:22PM,Amazon continues construction on Puget Sound t...


In [21]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

df_news['Sentiment'] = df_news['News Headline'].apply(lambda x: nlp(x)[0]['label'])


In [22]:
df_news

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment
0,TSLA,2024-04-09,08:32AM,Elon Musk is trying highlight the value that r...,Neutral
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...,Neutral
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...,Neutral
4,AMD,2024-04-09,07:15AM,3 Stocks That Can Help You Get Richer in 2024,Positive
...,...,...,...,...,...
561,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...,Neutral
562,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now,Neutral
563,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney,Positive
564,AMZN,2024-04-05,07:22PM,Amazon continues construction on Puget Sound t...,Neutral


In [27]:
grouped = df_news.groupby(['Date', 'Stock Name','Sentiment'])['News Headline'].apply(list).reset_index()

# Merge the grouped DataFrame with the original DataFrame
merged_df = pd.merge(df_news, grouped, on=['Date', 'Stock Name'], suffixes=('', '_grouped'))


# find the duplicate rows



Unnamed: 0,Date,Stock Name,Sentiment,News Headline
0,2022-07-06,CLSK,Neutral,[CleanSpark stock pulls back after reporting J...
1,2022-07-14,CLSK,Neutral,"[CleanSpark buys 1,061 bitcoin miners at a 'su..."
2,2022-07-28,CLSK,Neutral,[Want to Ride the Bitcoin Rally? Here Are 2 Bi...
3,2022-07-29,CLSK,Neutral,[CleanSpark Executives to Discuss Fiscal Third...
4,2022-08-03,CLSK,Positive,[CleanSpark Announces July 2022 Bitcoin Mining...
...,...,...,...,...
561,2024-04-09,NVDA,Neutral,[Schwab Investors Scoop Up This Secret AI Gem ...
562,2024-04-09,PLTR,Positive,[Buy Alert: Cloud Deal Signals Clear Skies Ahe...
563,2024-04-09,SOFI,Positive,[Is SoFi Stock a Buy?]
564,2024-04-09,T,Positive,[2 Ultra-High-Yield Dividend Stocks to Buy and...


In [8]:
#merged_df.drop(columns=['Time','News Headline,'], inplace=True)

In [None]:
#merged_df.head(20).sort_values(by='Date')

In [8]:
# Convert 'Date' column to datetime type
#merged_df['Date'] =  pd.to_datetime(merged_df['Date']).dt.date



In [24]:
df_news['Sentiment'].value_counts()

Sentiment
Neutral     321
Positive    172
Negative     73
Name: count, dtype: int64

### **Preproessing**

In [239]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Tokenization
merged_df['News Headline'] = merged['News Headline'].apply(lambda x: word_tokenize(x))

# Stop-word removal
stop_words = set(stopwords.words('english'))
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming
porter = PorterStemmer()
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [porter.stem(word) for word in x])

# Lemmatization
lemmatizer = WordNetLemmatizer()
merged_df['News Headline'] = df['News Headline'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Lowercasing
# Lowercasing is usually done before other preprocessing steps, but we can do it here as well
# Convert all words to lowercase
# df['News Headline'] = df['News Headline'].apply(lambda x: [word.lower() for word in x])


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,Ticker,News Headline_grouped,Sentiment,lower_case,punc
0,2021-05-14,57.316837,57.641186,54.722057,55.278084,17475000,0.0,0.0,0.0,sqqq,[Rise of online ordering will continue after C...,Neutral,[rise of online ordering will continue after c...,[]
1,2021-05-27,51.385910,51.895598,51.015226,51.802929,8863020,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Bruce Greenwald],Neutral,[influencers with andy serwer: bruce greenwald],[]
2,2021-06-09,49.208148,49.903178,48.883802,49.810509,8356560,0.0,0.0,0.0,sqqq,[Florida Gov. DeSantis will lose in Supreme Co...,Neutral,[florida gov. desantis will lose in supreme co...,[]
3,2021-06-10,49.671499,49.903176,48.188769,48.235104,19425620,0.0,0.0,0.0,sqqq,[Influencers with Andy Serwer: Alphonso David],Neutral,[influencers with andy serwer: alphonso david],[]
4,2021-06-11,48.235099,48.559448,47.864418,47.864418,7880380,0.0,0.0,0.0,sqqq,[HRC's Alphonso David has a message for those ...,Neutral,[hrc's alphonso david has a message for those ...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,2024-03-28,523.210022,524.609985,522.780029,523.070007,96294900,0.0,0.0,0.0,spy,[Missed Out on the Bull Market Recovery? 3 ETF...,Neutral,[missed out on the bull market recovery? 3 etf...,[]
426,2024-04-02,518.239990,518.979980,516.479980,518.840027,74230300,0.0,0.0,0.0,spy,[IVV's $3.4B Tops Inflows; SPY Leads Outflows ...,Positive,[ivv's $3.4b tops inflows; spy leads outflows ...,[]
427,2024-04-03,517.719971,520.950012,517.669983,519.409973,59155800,0.0,0.0,0.0,spy,[Heres Whats Happening in Markets Today: April...,Neutral,[heres whats happening in markets today: april...,[]
428,2024-04-04,523.520020,523.869995,512.760010,513.070007,96858100,0.0,0.0,0.0,spy,[Why the Fed is wading into uncharted waters: ...,Neutral,[why the fed is wading into uncharted waters: ...,[]


In [None]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer


# Bag-of-Words representation
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df['News Headline'].apply(lambda x: ' '.join(x)))

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['News Headline'].apply(lambda x: ' '.join(x)))

# Word2Vec embedding
word2vec_model = Word2Vec(sentences=df['News Headline'], vector_size=100, window=5, min_count=1, workers=4)

# BERT embedding
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_embeddings = bert_model.encode(df['News Headline'].apply(lambda x: ' '.join(x)))

print('Feature extraction techniques completed.')