In [51]:
#scraping modules
from bs4 import BeautifulSoup
import requests

#nlp modules
import spacy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
import joblib


In [36]:
content = requests.get('https://finance.yahoo.com/')
content = content.text
soup = BeautifulSoup(content, 'html.parser')

In [45]:
#retrieve the information from "hero-headlines hero-latest-news yf-13r5oof" class of the div tag
headlines = soup.find_all('div', class_='hero-headlines hero-second-col yf-13r5oof')
headlines = headlines[0].find_all('h3')
headlines = [headline.text for headline in headlines]
headlines[1:]

['Tesla slides after earnings fall short',
 'Ford stock tumbles after big earnings miss',
 "Krugman: Fed rate cut 'shouldn't matter much' in election",
 "'Fear gauge' hits 3-month high as stocks sell off",
 "Warner Bros. loses 'critical' NBA rights in $77 billion deal",
 'One year after it disappeared, Overstock.com is coming back']

In [46]:
preprocessed_headlines = []
nlp = spacy.load('en_core_web_sm')
for headline in headlines[1:]:
    doc = nlp(headline)
    preprocessed_headline = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    preprocessed_headlines.append(preprocessed_headline)

In [7]:
df = pd.read_csv('stock_data.csv')

In [8]:
df.head()


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [9]:
df.Sentiment.value_counts()

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64

In [10]:
#each sentiment should have at least 2106 samples with random sampling
min_len = 2106
negative_sentiment = df[df.Sentiment == -1]
positive_sentiment = (df[df.Sentiment == 1]).sample(min_len, random_state=42)

#combine the two dataframes
df = pd.concat([negative_sentiment, positive_sentiment])
#shuffle the dataframe
df = df.sample(frac=1, random_state=2022).reset_index(drop=True)

In [11]:
df.Sentiment.value_counts()

Sentiment
 1    2106
-1    2106
Name: count, dtype: int64

In [12]:
import tensorflow as tf
from openai import OpenAI

In [13]:
#creating embeddings of each headline
client = OpenAI(api_key="sk-None-Qjfap14kDqSsywo3VRJnT3BlbkFJBA9uwGJbjgAPRSulATXr")

def create_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

In [14]:
df.Text = df.Text.apply(lambda x: create_embedding(x))

In [21]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Sentiment, test_size=0.2, random_state=2022)

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

#Multinomial Naive Bayes does not accept negative values so we need to use MinMaxScaler
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train_2d)
scaled_X_test = scaler.transform(X_test_2d)

# #train the naive bayes model
nb_model= MultinomialNB()
nb_model.fit(scaled_X_train, y_train)

# #test the model
nb_model.score(scaled_X_test, y_test)

# #predict the sentiment of the datasets
y_pred = nb_model.predict(scaled_X_test)
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n          -1       0.76      0.70      0.73       427\n           1       0.72      0.77      0.75       416\n\n    accuracy                           0.74       843\n   macro avg       0.74      0.74      0.74       843\nweighted avg       0.74      0.74      0.74       843\n'

In [52]:
#train using the KNN model
knnModel = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knnModel.fit(X_train_2d, y_train)
y_pred_2 = knnModel.predict(X_test_2d)
knnModel.score(X_test_2d, y_test)
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

          -1       0.75      0.81      0.78       427
           1       0.79      0.73      0.76       416

    accuracy                           0.77       843
   macro avg       0.77      0.77      0.77       843
weighted avg       0.77      0.77      0.77       843



In [55]:
#save the model
joblib.dump(nb_model, 'nb_model.pkl')
joblib.dump(knnModel, 'knn_model.pkl')

['nb_model.pkl']

In [54]:
#predict the sentiment of the headlines
preprocessed_headlines = [create_embedding(headline) for headline in preprocessed_headlines]
preprocessed_headlines = np.stack(preprocessed_headlines)
preprocessed_headlines

array([[ 0.00072856, -0.06871434, -0.04320753, ...,  0.03365227,
        -0.00196457,  0.01583406],
       [-0.0265568 , -0.05101701,  0.02133052, ...,  0.02090512,
        -0.00299296,  0.02222688],
       [-0.04785164,  0.0178241 ,  0.0280527 , ..., -0.00452249,
         0.01070332, -0.01771017],
       [-0.01129631, -0.01783906, -0.00475356, ...,  0.00327633,
         0.02389986, -0.00778396],
       [-0.01143675,  0.0047288 ,  0.00331243, ..., -0.00981264,
        -0.01178423,  0.01755549],
       [-0.01705313, -0.03579557,  0.00813337, ..., -0.02260159,
        -0.00324388,  0.02308217]])

In [58]:
#below is a dictionary that maps entity names to their respective sentiment
loaded_nb_model = joblib.load('nb_model.pkl')
predictions_nb = loaded_nb_model.predict(preprocessed_headlines)
predictions_nb

array([ 1,  1, -1, -1,  1,  1])

In [59]:
loaded_knn_model = joblib.load('knn_model.pkl')
predictions_knn = loaded_nb_model.predict(preprocessed_headlines)
predictions_knn

array([ 1,  1, -1, -1,  1,  1])

In [60]:
#knn accuracy is higher than the naive bayes model
knn_res = {}
for i, headline in enumerate(headlines[1:]):
    doc = nlp(headline)
    for ent in doc.ents:
        knn_res[ent.text] = predictions_knn[i]
knn_res

{'Tesla': 1,
 'Ford': 1,
 'Krugman': -1,
 'Fed': -1,
 '3-month': -1,
 'Warner Bros.': 1,
 'NBA': 1,
 '$77 billion': 1,
 'One year': 1,
 'Overstock.com': 1}

In [72]:
#matching the return values with the actual companies
df_sp500 = pd.read_csv('sp500.csv')
df_sp500.head()
#using only Symbol and Security
df_sp500 = df_sp500[['Symbol', 'Security']]
df_sp500.head()
df_sp500.dropna(inplace=True)

In [73]:
#mapping the entities to the actual companies
new_knn_res = {}
for entity, sentiment in knn_res.items():
    for i, row in df_sp500.iterrows():
        if entity in row.Security:
            new_knn_res[row.Symbol] = sentiment

In [74]:
new_knn_res

{'TSLA': 1, 'F': 1, 'FRT': -1, 'FDX': -1, 'WBD': 1}