# **Data Collection**

## *Pull News about each Stock*

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Define the start and end dates
#start_date = datetime(2021, 1, 1)
#end_date = datetime(2024, 4, 10)

with open('news_data.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Stock Name','Date','Time','News Headline'])

stocks = ['TSLA','AMD','NVDA','MU','PLTR','NIO','AAPL','MARA','WBA','CLSK','F','CCL','SOFI','T','BAC','GOOGL','VALE','GOLD','INTC','AMZN']
finviz_url = 'https://finviz.com/quote.ashx?t='

for stock in stocks:
    final_url = finviz_url + stock
    request = Request(url=final_url, headers={'user-agent': 'app'}) 
    response = urlopen(request)    
    html = BeautifulSoup(response, features="lxml")
    news_table = html.find('table',id='news-table')
    news_table_row = news_table.find_all('tr')
    for news in news_table_row:
        news_headline = news.a.get_text() 
        date_scrape = news.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
            # Replace "Today" with today's date
            news_date = today_date
        else:
            date = date_scrape[0]
            time = date_scrape[1]
            # Replace "Today" with today's date
            if date.lower() == "today":
                news_date = today_date
            else:
                # Convert the scraped date to a datetime object
                news_date = datetime.strptime(date, '%b-%d-%y').date()  # Extracting only the date portion
            # Check if the news date falls within the specified range
            
            with open('news_data.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([stock, news_date, time, news_headline])


## *Annotate sentiments of news headlines with a pretrained model*

In [84]:
df_news = pd.read_csv('news_data.csv',encoding='ISO-8859-1')

df_news

Unnamed: 0,Stock Name,Date,Time,News Headline
0,TSLA,2024-04-09,10:03AM,Tesla settles lawsuit in fatal crash where dri...
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...
4,AMD,2024-04-09,09:51AM,How's AT&T Handling Things?
...,...,...,...,...
562,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...
563,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now
564,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney
565,AMZN,2024-04-05,07:22PM,Amazon continues construction on Puget Sound t...


In [85]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

df_news['Sentiment'] = df_news['News Headline'].apply(lambda x: nlp(x)[0]['label'])


In [86]:
df_news

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment
0,TSLA,2024-04-09,10:03AM,Tesla settles lawsuit in fatal crash where dri...,Neutral
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...,Neutral
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...,Neutral
4,AMD,2024-04-09,09:51AM,How's AT&T Handling Things?,Neutral
...,...,...,...,...,...
562,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...,Neutral
563,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now,Neutral
564,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney,Positive
565,AMZN,2024-04-05,07:22PM,Amazon continues construction on Puget Sound t...,Neutral


In [35]:
#grouped = df_news.groupby(['Date', 'Stock Name','Sentiment'])['News Headline'].apply(list).reset_index()
#
## Merge the grouped DataFrame with the original DataFrame
#merged_df = pd.merge(df_news, grouped, on=['Date', 'Stock Name','Sentiment'], suffixes=(',', '_grouped'))
#
#merged_df



In [8]:
#merged_df.drop(columns=['Time','News Headline,'], inplace=True)

In [None]:
#merged_df.head(20).sort_values(by='Date')

In [8]:
# Convert 'Date' column to datetime type
#merged_df['Date'] =  pd.to_datetime(merged_df['Date']).dt.date



In [87]:
df_news['Sentiment'].value_counts()

Sentiment
Neutral     326
Positive    168
Negative     73
Name: count, dtype: int64

### **Preproessing**

In [88]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Tokenization
# Tokenize the text

df_news['Tokenized_Text'] = df_news['News Headline'].apply(word_tokenize)

# Stop-word removal
# Remove stop words
stop_words = set(stopwords.words('english'))
df_news['Tokenized_Text'] = df_news['Tokenized_Text'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming
# Perform stemming
stemmer = PorterStemmer()
df_news['Stemmed_Text'] = df_news['Tokenized_Text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Lemmatization
# Perform lemmatization
lemmatizer = WordNetLemmatizer()
df_news['Lemmatized_Text'] = df_news['Tokenized_Text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Lowercasing
# Convert text to lowercase

df_news['Lowercased_Text'] = df_news['News Headline'].str.lower()

print('Text preprocessing steps completed successfully.')

Text preprocessing steps completed successfully.


In [89]:
df_news.head()

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment,Tokenized_Text,Stemmed_Text,Lemmatized_Text,Lowercased_Text
0,TSLA,2024-04-09,10:03AM,Tesla settles lawsuit in fatal crash where dri...,Neutral,"[Tesla, settles, lawsuit, fatal, crash, driver...","[tesla, settl, lawsuit, fatal, crash, driver, ...","[Tesla, settle, lawsuit, fatal, crash, driver,...",tesla settles lawsuit in fatal crash where dri...
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral,"[Tesla, Agrees, Settle, Lawsuit, Autopilots, I...","[tesla, agre, settl, lawsuit, autopilot, invol...","[Tesla, Agrees, Settle, Lawsuit, Autopilots, I...",tesla agrees to settle lawsuit over autopilots...
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...,Neutral,"[Tesla, wants, Apple, 's, help, proving, drive...","[tesla, want, appl, 's, help, prove, driver, p...","[Tesla, want, Apple, 's, help, proving, driver...",tesla wants apple's help proving a driver was ...
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...,Neutral,"[Wall, Street, Analysts, Trimmed, Price, Targe...","[wall, street, analyst, trim, price, target, 1...","[Wall, Street, Analysts, Trimmed, Price, Targe...",wall street analysts just trimmed price target...
4,AMD,2024-04-09,09:51AM,How's AT&T Handling Things?,Neutral,"['s, &, Handling, Things, ?]","['s, &, handl, thing, ?]","['s, &, Handling, Things, ?]",how's at&t handling things?


## **Feature Extraction**

### Bag of Words

In [91]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # type: ignore


count_vectorizer = CountVectorizer(stop_words='english')
bow_features = count_vectorizer.fit_transform(df_news['Lowercased_Text'])
bow_features_df = pd.DataFrame(bow_features.toarray(), columns=count_vectorizer.get_feature_names_out())

bow_features_df.head()

Unnamed: 0,000,061,10,100,100m,10x,11,11th,12,13,...,year,years,yield,york,youll,youngest,zdge,zedge,zeus,zoomed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* There are so many columns, it is unlikely we will be able to use this as an analysis. Instead, we could have a look at words that appear frequently.

In [92]:

# Create a mask so we only get the terms that have a frequency greater than 5 
bow_frequent_words = list(bow_features_df.sum()[bow_features_df.sum() > 5].index)

bow_features_df[bow_frequent_words]

Unnamed: 0,000,10,11,12,13,14,15,20,2022,2023,...,vale,vs,walgreens,wall,watch,wba,webcast,week,world,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
564,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* This approach causes a significant dimensionality problem - the more documents have the larger the vocabulary, and the longer the vectors.
* Additionally, the vectors would also contain many 0s, thereby resulting in a huge sparse feature matrix.
* If the model comes across a new word it has not seen yet it will probably end up ignoring this word

### TF-IDF

In [93]:
# TF-IDF

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(df_news['Lowercased_Text'])
tfidf_features_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_features_df

Unnamed: 0,000,061,10,100,100m,10x,11,11th,12,13,...,year,years,yield,york,youll,youngest,zdge,zedge,zeus,zoomed
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.308234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
563,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
564,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
565,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* As above with the Bag of Words, the data is high dimensional and any useful analysis would require selecting the columns with the highest TF-IDF.

### Word2Vec

### BERT

In [71]:
import torch
from transformers import BertTokenizer, BertModel

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


df_news['BERT_Embeddings'] = df_news['Lowercased_Text'].apply(lambda x: model(**tokenizer(x, return_tensors='pt')).last_hidden_state)

df_news.head()

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment,Tokenized_Text,Stemmed_Text,Lemmatized_Text,Lowercased_Text,BERT_Embeddings
0,TSLA,2024-04-09,08:32AM,Elon Musk is trying highlight the value that r...,Neutral,"[Elon, Musk, trying, highlight, value, robotax...","[elon, musk, tri, highlight, valu, robotaxi, c...","[Elon, Musk, trying, highlight, value, robotax...",elon musk is trying highlight the value that r...,"[[[tensor(-0.4156, grad_fn=<UnbindBackward0>),..."
1,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral,"[Tesla, Agrees, Settle, Lawsuit, Autopilots, I...","[tesla, agre, settl, lawsuit, autopilot, invol...","[Tesla, Agrees, Settle, Lawsuit, Autopilots, I...",tesla agrees to settle lawsuit over autopilots...,"[[[tensor(-0.4517, grad_fn=<UnbindBackward0>),..."
2,TSLA,2024-04-07,10:08PM,Tesla wants Apple's help proving a driver was ...,Neutral,"[Tesla, wants, Apple, 's, help, proving, drive...","[tesla, want, appl, 's, help, prove, driver, p...","[Tesla, want, Apple, 's, help, proving, driver...",tesla wants apple's help proving a driver was ...,"[[[tensor(-0.5963, grad_fn=<UnbindBackward0>),..."
3,TSLA,2024-04-06,08:42PM,Wall Street Analysts Just Trimmed Price Target...,Neutral,"[Wall, Street, Analysts, Trimmed, Price, Targe...","[wall, street, analyst, trim, price, target, 1...","[Wall, Street, Analysts, Trimmed, Price, Targe...",wall street analysts just trimmed price target...,"[[[tensor(-0.2533, grad_fn=<UnbindBackward0>),..."
4,AMD,2024-04-09,07:15AM,3 Stocks That Can Help You Get Richer in 2024,Positive,"[3, Stocks, Help, Get, Richer, 2024]","[3, stock, help, get, richer, 2024]","[3, Stocks, Help, Get, Richer, 2024]",3 stocks that can help you get richer in 2024,"[[[tensor(-0.2304, grad_fn=<UnbindBackward0>),..."
