In [84]:
#scraping modules
from bs4 import BeautifulSoup
import requests

#nlp modules
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler


In [10]:
content = requests.get('https://finance.yahoo.com/')
content = content.text
soup = BeautifulSoup(content, 'html.parser')

In [16]:
#retrieve the information from "hero-headlines hero-latest-news yf-13r5oof" class of the div tag
headlines = soup.find_all('div', class_='hero-headlines hero-latest-news yf-13r5oof')
headlines = headlines[0].find_all('h3')
headlines = [headline.text for headline in headlines]
headlines[1:]

['Alphabet’s Revenue Boosted by Cloud Computing, Search Ads',
 "FTC orders 8 companies to provide information on 'surveillance pricing' practices",
 'Aussie Risks Bigger Losses Versus Yen as CPI Coincides With BOJ',
 'Canadian Wildfires Threaten Oil Output, Prompt Evacuations',
 'Tech Falls in Late Hours as Earnings Underwhelm: Markets Wrap',
 'Boeing resumes deliveries of 737 MAX airplanes to China',
 'Embraer bumps up estimate for global traffic growth in next two decades',
 'How a perfect storm sent church insurance rates skyrocketing']

In [104]:
preprocessed_headlines = []
nlp = spacy.load('en_core_web_sm')
for headline in headlines[1:]:
    doc = nlp(headline)
    preprocessed_headline = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    preprocessed_headlines.append(preprocessed_headline)

In [44]:
df = pd.read_csv('stock_data.csv')

In [23]:
df.head()


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [45]:
df.Sentiment.value_counts()

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64

In [46]:
#each sentiment should have at least 2106 samples with random sampling
min_len = 2106
negative_sentiment = df[df.Sentiment == -1]
positive_sentiment = (df[df.Sentiment == 1]).sample(min_len, random_state=42)

#combine the two dataframes
df = pd.concat([negative_sentiment, positive_sentiment])
#shuffle the dataframe
df = df.sample(frac=1, random_state=2022).reset_index(drop=True)

In [47]:
df.Sentiment.value_counts()

Sentiment
 1    2106
-1    2106
Name: count, dtype: int64

In [48]:
import tensorflow as tf
from openai import OpenAI

In [49]:
#creating embeddings of each headline
client = OpenAI(api_key="sk-None-Qjfap14kDqSsywo3VRJnT3BlbkFJBA9uwGJbjgAPRSulATXr")

def create_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

In [50]:
df.Text = df.Text.apply(lambda x: create_embedding(x))

In [101]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Sentiment, test_size=0.2, random_state=2022)

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

#Multinomial Naive Bayes does not accept negative values so we need to use MinMaxScaler
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train_2d)
scaled_X_test = scaler.transform(X_test_2d)

# #train the naive bayes model
nb_model= MultinomialNB()
nb_model.fit(scaled_X_train, y_train)

# #test the model
nb_model.score(scaled_X_test, y_test)

# #predict the sentiment of the datasets
y_pred = nb_model.predict(scaled_X_test)
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          -1       0.76      0.70      0.73       427\n           1       0.72      0.77      0.75       416\n\n    accuracy                           0.74       843\n   macro avg       0.74      0.74      0.74       843\nweighted avg       0.74      0.74      0.74       843\n'

In [102]:
#train using the KNN model
knnModel = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knnModel.fit(X_train_2d, y_train)
y_pred_2 = knnModel.predict(X_test_2d)
knnModel.score(X_test_2d, y_test)
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

          -1       0.75      0.81      0.78       427
           1       0.79      0.73      0.76       416

    accuracy                           0.77       843
   macro avg       0.77      0.77      0.77       843
weighted avg       0.77      0.77      0.77       843



In [105]:
#predict the sentiment of the headlines
preprocessed_headlines = [create_embedding(headline) for headline in preprocessed_headlines]
preprocessed_headlines = np.stack(preprocessed_headlines)
preprocessed_headlines

array([[ 0.03979261, -0.02636505,  0.02917938, ...,  0.00992715,
        -0.02437682, -0.00392045],
       [-0.02019515, -0.02305547, -0.02824178, ..., -0.00375418,
        -0.01577107,  0.01867069],
       [-0.03196119, -0.00024379, -0.02240189, ..., -0.01452055,
        -0.01435348, -0.00330689],
       ...,
       [ 0.0017534 , -0.02372099,  0.03179681, ...,  0.02396017,
         0.02556407,  0.00353669],
       [-0.01323737, -0.02190926, -0.01689742, ..., -0.00143787,
        -0.00065477, -0.01119056],
       [ 0.01855049, -0.01775123, -0.02129632, ..., -0.00076421,
        -0.00555612, -0.00355798]])

In [112]:
#below is a dictionary that maps entity names to their respective sentiment
predictions_nb = nb_model.predict(preprocessed_headlines)
predictions_nb

array([1, 1, 1, 1, 1, 1, 1, 1])

In [113]:
predictions_knn = knnModel.predict(preprocessed_headlines)
predictions_knn

array([-1, -1, -1,  1,  1, -1,  1,  1])

In [114]:
#knn accuracy is higher than the naive bayes model
knn_res = {}
for i, headline in enumerate(headlines[1:]):
    doc = nlp(headline)
    for ent in doc.ents:
        knn_res[ent.text] = predictions_knn[i]
knn_res

{'Alphabet': -1,
 'Cloud Computing': -1,
 'FTC': -1,
 '8': -1,
 'Aussie Risks Bigger Losses Versus Yen': -1,
 'BOJ': -1,
 'Canadian': 1,
 'Evacuations': 1,
 'Tech Falls': 1,
 'Late Hours': 1,
 'Boeing': -1,
 '737': -1,
 'MAX': -1,
 'China': -1,
 'next two decades': 1}