# **Data Collection**

## *Pull News about each Stock*

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Define the start and end dates
#start_date = datetime(2021, 1, 1)
#end_date = datetime(2024, 4, 10)

with open('news_data.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    csv_writer = csv.writer(csvfile)
    # Write the header row
    csv_writer.writerow(['Stock Name','Date','Time','News Headline'])

stocks = ['TSLA','AMD','NVDA','MU','PLTR','NIO','AAPL','MARA','WBA','CLSK','F','CCL','SOFI','T','BAC','GOOGL','VALE','GOLD','INTC','AMZN']
finviz_url = 'https://finviz.com/quote.ashx?t='

for stock in stocks:
    final_url = finviz_url + stock
    request = Request(url=final_url, headers={'user-agent': 'app'}) 
    response = urlopen(request)    
    html = BeautifulSoup(response, features="lxml")
    news_table = html.find('table',id='news-table')
    news_table_row = news_table.find_all('tr')
    for news in news_table_row:
        news_headline = news.a.get_text() 
        date_scrape = news.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
            # Replace "Today" with today's date
            news_date = today_date
        else:
            date = date_scrape[0]
            time = date_scrape[1]
            # Replace "Today" with today's date
            if date.lower() == "today":
                news_date = today_date
            else:
                # Convert the scraped date to a datetime object
                news_date = datetime.strptime(date, '%b-%d-%y').date()  # Extracting only the date portion
            # Check if the news date falls within the specified range
            
            with open('news_data.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([stock, news_date, time, news_headline])


## *Annotate sentiments of news headlines with a pretrained model*

In [54]:
df_news = pd.read_csv('news_data.csv',encoding='ISO-8859-1')

df_news

Unnamed: 0,Stock Name,Date,Time,News Headline
0,TSLA,2024-04-10,10:00AM,Cathie Wood Is Dumping Nvidia and Buying Tesla...
1,TSLA,2024-04-09,07:03PM,Analyst revises Tesla stock price target after...
2,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...
3,AMD,2024-04-09,11:29PM,Google unveils first Arm-based data center CPU...
4,AMD,2024-04-08,08:30PM,Better Artificial Intelligence (AI) Stock: Qua...
...,...,...,...,...
555,AMZN,2024-04-09,05:25PM,"Sephora, Ulta, and e.l.f. among the top beauty..."
556,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...
557,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now
558,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney


In [55]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

df_news['Sentiment'] = df_news['News Headline'].apply(lambda x: nlp(x)[0]['label'])


In [56]:
df_news

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment
0,TSLA,2024-04-10,10:00AM,Cathie Wood Is Dumping Nvidia and Buying Tesla...,Negative
1,TSLA,2024-04-09,07:03PM,Analyst revises Tesla stock price target after...,Neutral
2,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral
3,AMD,2024-04-09,11:29PM,Google unveils first Arm-based data center CPU...,Negative
4,AMD,2024-04-08,08:30PM,Better Artificial Intelligence (AI) Stock: Qua...,Positive
...,...,...,...,...,...
555,AMZN,2024-04-09,05:25PM,"Sephora, Ulta, and e.l.f. among the top beauty...",Neutral
556,AMZN,2024-04-08,07:03PM,Analyst unveils new Amazon price target as sto...,Neutral
557,AMZN,2024-04-07,06:46PM,Here Are My 3 Top Tech Stocks to Buy Right Now,Neutral
558,AMZN,2024-04-06,10:00AM,Best Stock to Buy Right Now: Amazon vs. Disney,Positive


In [57]:
df_news['Sentiment'].value_counts()

Sentiment
Neutral     317
Positive    168
Negative     75
Name: count, dtype: int64

### **Preproessing**

In [58]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Handle emojis and specific characters
    text = re.sub(r'[:;=x][\-o\*]?[\)\(\[\]dpo\@\>\<\}3]', '', text)  # Removes typical emojis/smiley faces
    # Handle hashtags and mentions
    text = re.sub(r'#[\w-]+', 'hashtag', text)  # Replace hashtags
    text = re.sub(r'@[\w-]+', 'mention', text)  # Replace mentions
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # Rejoin tokens
    return ' '.join(tokens)

# Apply preprocessing
df_news['processed_text'] = df_news['News Headline'].apply(preprocess_text)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [59]:
df_news.head()

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment,processed_text
0,TSLA,2024-04-10,10:00AM,Cathie Wood Is Dumping Nvidia and Buying Tesla...,Negative,cathie wood dumping nvidia buying tesla : maki...
1,TSLA,2024-04-09,07:03PM,Analyst revises Tesla stock price target after...,Neutral,analyst revise tesla stock price target robota...
2,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral,tesla agrees settle lawsuit autopilot involvem...
3,AMD,2024-04-09,11:29PM,Google unveils first Arm-based data center CPU...,Negative,google unveils first arm-based data center cpu...
4,AMD,2024-04-08,08:30PM,Better Artificial Intelligence (AI) Stock: Qua...,Positive,better artificial intelligence ( ai ) stock : ...


## **Feature Extraction**

### Bag of Words

In [60]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # type: ignore


count_vectorizer = CountVectorizer(stop_words='english')
X_bow = count_vectorizer.fit_transform(df_news['processed_text'])
bow_features_df = pd.DataFrame(X_bow.toarray(), columns=count_vectorizer.get_feature_names_out())

bow_features_df.head()

Unnamed: 0,000,061,10,100,100m,10x,11,11th,12,13,...,yahoo,year,yield,york,youll,youngest,zdge,zedge,zeus,zoomed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [61]:
# TF-IDF

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(df_news['processed_text'])
tfidf_features_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_features_df

Unnamed: 0,000,061,10,100,100m,10x,11,11th,12,13,...,yahoo,year,yield,york,youll,youngest,zdge,zedge,zeus,zoomed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### BERT

In [29]:
import torch
from transformers import BertTokenizer, BertModel

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Move model to the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()


def batch_encode(texts, tokenizer, max_length=512):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

def get_bert_embeddings(texts, tokenizer, model, batch_size=16):
    # Split texts into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    embeddings = []
    
    for batch in batches:
        encoded_inputs = batch_encode(batch, tokenizer)
        input_ids = encoded_inputs['input_ids'].to(device)
        attention_mask = encoded_inputs['attention_mask'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Extract the embeddings for the `[CLS]` token (first token) representing the whole sequence
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)
    
    # Concatenate all batch embeddings
    return np.concatenate(embeddings)

# Example usage
texts = df_news['processed_text'].tolist()
embeddings = get_bert_embeddings(texts, tokenizer, model)

# Since embeddings is a numpy array, you can decide how to add it to your DataFrame. For instance:
# Assuming you want to keep just the first embedding dimension as an example
df_news['BERT_embedding'] = embeddings[:, 0].tolist()

# Display the DataFrame to verify
df_news.head()

Unnamed: 0,Stock Name,Date,Time,News Headline,Sentiment,processed_text,BERT_embedding
0,TSLA,2024-04-10,08:31AM,"Tesla strike in Sweden continues, union says, ...",Neutral,"tesla strike sweden continues , union say , co...",-0.360706
1,TSLA,2024-04-09,07:03PM,Analyst revises Tesla stock price target after...,Neutral,analyst revise tesla stock price target robota...,-0.192663
2,TSLA,2024-04-08,10:46PM,Tesla Agrees to Settle Lawsuit Over Autopilots...,Neutral,tesla agrees settle lawsuit autopilot involvem...,-0.460014
3,AMD,2024-04-09,11:29PM,Google unveils first Arm-based data center CPU...,Negative,google unveils first arm-based data center cpu...,-0.635717
4,AMD,2024-04-08,08:30PM,Better Artificial Intelligence (AI) Stock: Qua...,Positive,better artificial intelligence ( ai ) stock : ...,-0.637123


* We are taking BOW embeddings.

## **Model Selection and Training**

In [62]:
from sklearn.model_selection import train_test_split

# Assuming `df_news['sentiment']` is your target variable and is binary (0 for negative, 1 for positive sentiment)
# Convert sentiment to numeric if it's not already

df_news['sentiment_numeric'] = df_news['Sentiment'].map({'Negative': 0, 'Neutral': 1, 'Positive': 2})

y = df_news['sentiment_numeric'].values

X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)


In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train with BoW features
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))


Training Accuracy: 0.8995535714285714
Testing Accuracy: 0.4732142857142857
Classification Report:
               precision    recall  f1-score   support

           0       0.11      0.18      0.14        11
           1       0.73      0.48      0.58        73
           2       0.35      0.57      0.43        28

    accuracy                           0.47       112
   macro avg       0.40      0.41      0.38       112
weighted avg       0.57      0.47      0.50       112



## **Deployment and Interface**

In [64]:
import tkinter as tk
from tkinter import ttk

# Function to classify and display the sentiment
def classify_headline():
    headline = entry.get()
    processed = preprocess_text(headline)
    vectorized = count_vectorizer.transform([processed])
    prediction = model.predict(vectorized)
    result_label.config(text = f"Sentiment: {'Positive' if prediction == 2 else 'Neutral' if prediction == 1 else 'Negative'}")

# GUI setup
root = tk.Tk()
root.title("Stock News Sentiment Analysis")

frame = ttk.Frame(root, padding="30")
frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

entry = ttk.Entry(frame, width=50)
entry.grid(row=0, column=0, sticky=(tk.W, tk.E))

button = ttk.Button(frame, text="Classify", command=classify_headline)
button.grid(row=1, column=0, sticky=tk.W, pady=10)

result_label = ttk.Label(frame, text="Sentiment: ")
result_label.grid(row=2, column=0, sticky=tk.W)

root.mainloop()
