In [21]:
import pandas as pd
import numpy as np
import yfinance
import requests
import lxml
import pandas_datareader
from pandas_datareader import data as pdr
import yfinance as yf
from torch.nn.functional import softmax
from tqdm import tqdm
from utilities import *

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenizer and the sequence-classification model are loaded from the
# same Hugging Face checkpoint, so the identifier is spelled out only once.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [23]:
# Ticker symbols whose news headlines are searched and scored in this notebook.
stocks = [
    "AAPL", "MSFT", "AMZN", "GOOG", "TSLA", "NVDA", "UNH", "JNJ",
    "PG", "V", "HD", "BAC", "XOM", "MA", "DIS", "PFE",
    "CVX", "ABBV", "KO", "CSCO", "AVGO", "VZ", "PEP", "WMT",
    "BRK", "JPM", "ADBE", "INTC", "AMD", "QCOM", "MCD", "NFLX",
    "NKE", "UPS", "AMAT", "AXP", "GS",
]

Performing news sentiment analysis

In [24]:
# Build one combined frame holding, for every ticker in `stocks`, the merged
# financial data and the per-date news sentiment, then write it to CSV.
#
# Per-stock frames are collected in a list and concatenated once at the end
# rather than re-concatenating the growing `big_df` on every iteration.
sentiment_frames = []
try:
    for stock in stocks:
        try:
            news = read_news(stock)
            finances = financial_dataset(stock, num_of_labels=2)
            merged_df = merge_fin_news(finances, news)
            print(f"Performing sentiment analysis for stock {stock}")
            sentimentized_df = sentim_analyzer(merged_df, tokenizer, model)
            # average sentiment for each unique date.
            sentimentized_df = merge_dates(sentimentized_df)
            # A stock can come back with zero scored dates (e.g. no headlines
            # were found for it); such empty frames add nothing to the result.
            if not sentimentized_df.empty:
                sentiment_frames.append(sentimentized_df)
        except Exception as e:
            # Best effort: report the failing ticker and continue with the rest.
            print(f"An error occurred while processing stock {stock}: {str(e)}")
finally:
    # `finally` also runs when the loop is interrupted (KeyboardInterrupt is
    # not an `Exception`), so the stocks processed so far are still combined
    # and written to disk before the interruption propagates.
    # pd.concat raises on an empty list, hence the explicit fallback.
    big_df = (
        pd.concat(sentiment_frames, axis=0, ignore_index=True)
        if sentiment_frames
        else pd.DataFrame()
    )
    big_df.to_csv('Datasets/sp500_news.csv')

The bot found 469 headlines from analyst_ratings_processed.csv, regarding AAPL stock
The bot found 32 headlines from raw_partner_headlines.csv, regarding AAPL stock
The bot found 501 headlines in total, regarding AAPL stock
[*********************100%***********************]  1 of 1 completed
AAPL financial dataframe dimensions  (2769, 6)
Positive changes : 1467
Negative changes : 1302
No changes : 0
Performing sentiment analysis for stock AAPL


100%|██████████| 462/462 [00:18<00:00, 24.49it/s]


 Dataframe now contains sentiment score for 65 different dates.
The bot found 0 headlines from analyst_ratings_processed.csv, regarding MSFT stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding MSFT stock
The bot found 0 headlines in total, regarding MSFT stock
[*********************100%***********************]  1 of 1 completed
MSFT financial dataframe dimensions  (2769, 6)
Positive changes : 1446
Negative changes : 1323
No changes : 0
Performing sentiment analysis for stock MSFT


0it [00:00, ?it/s]

 Dataframe now contains sentiment score for 0 different dates.





The bot found 330 headlines from analyst_ratings_processed.csv, regarding AMZN stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding AMZN stock
The bot found 330 headlines in total, regarding AMZN stock
[*********************100%***********************]  1 of 1 completed
AMZN financial dataframe dimensions  (2769, 6)
Positive changes : 1476
Negative changes : 1293
No changes : 0
Performing sentiment analysis for stock AMZN


100%|██████████| 318/318 [00:12<00:00, 24.66it/s]


 Dataframe now contains sentiment score for 32 different dates.
The bot found 1209 headlines from analyst_ratings_processed.csv, regarding GOOG stock
The bot found 5 headlines from raw_partner_headlines.csv, regarding GOOG stock
The bot found 1214 headlines in total, regarding GOOG stock
[*********************100%***********************]  1 of 1 completed
GOOG financial dataframe dimensions  (2769, 6)
Positive changes : 1457
Negative changes : 1312
No changes : 0
Performing sentiment analysis for stock GOOG


100%|██████████| 1184/1184 [00:48<00:00, 24.23it/s]


 Dataframe now contains sentiment score for 352 different dates.
The bot found 1930 headlines from analyst_ratings_processed.csv, regarding TSLA stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding TSLA stock
The bot found 1930 headlines in total, regarding TSLA stock
[*********************100%***********************]  1 of 1 completed
TSLA financial dataframe dimensions  (2647, 6)
Positive changes : 1367
Negative changes : 1280
No changes : 0
Performing sentiment analysis for stock TSLA


100%|██████████| 1840/1840 [01:19<00:00, 23.26it/s]


 Dataframe now contains sentiment score for 235 different dates.
The bot found 3133 headlines from analyst_ratings_processed.csv, regarding NVDA stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding NVDA stock
The bot found 3133 headlines in total, regarding NVDA stock
[*********************100%***********************]  1 of 1 completed
NVDA financial dataframe dimensions  (2769, 6)
Positive changes : 1454
Negative changes : 1315
No changes : 0
Performing sentiment analysis for stock NVDA


100%|██████████| 3057/3057 [02:13<00:00, 22.87it/s]


 Dataframe now contains sentiment score for 1122 different dates.
The bot found 230 headlines from analyst_ratings_processed.csv, regarding UNH stock
The bot found 170 headlines from raw_partner_headlines.csv, regarding UNH stock
The bot found 400 headlines in total, regarding UNH stock
[*********************100%***********************]  1 of 1 completed
UNH financial dataframe dimensions  (2769, 6)
Positive changes : 1464
Negative changes : 1305
No changes : 0
Performing sentiment analysis for stock UNH


100%|██████████| 389/389 [00:16<00:00, 23.56it/s]


 Dataframe now contains sentiment score for 126 different dates.
The bot found 2927 headlines from analyst_ratings_processed.csv, regarding JNJ stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding JNJ stock
The bot found 2927 headlines in total, regarding JNJ stock
[*********************100%***********************]  1 of 1 completed
JNJ financial dataframe dimensions  (2769, 6)
Positive changes : 1430
Negative changes : 1339
No changes : 0
Performing sentiment analysis for stock JNJ


100%|██████████| 2800/2800 [02:00<00:00, 23.16it/s]


 Dataframe now contains sentiment score for 1252 different dates.
The bot found 1303 headlines from analyst_ratings_processed.csv, regarding PG stock
The bot found 7 headlines from raw_partner_headlines.csv, regarding PG stock
The bot found 1310 headlines in total, regarding PG stock
[*********************100%***********************]  1 of 1 completed
PG financial dataframe dimensions  (2769, 6)
Positive changes : 1426
Negative changes : 1343
No changes : 0
Performing sentiment analysis for stock PG


100%|██████████| 1252/1252 [00:53<00:00, 23.58it/s]


 Dataframe now contains sentiment score for 593 different dates.
The bot found 10 headlines from analyst_ratings_processed.csv, regarding V stock
The bot found 28 headlines from raw_partner_headlines.csv, regarding V stock
The bot found 38 headlines in total, regarding V stock
[*********************100%***********************]  1 of 1 completed
V financial dataframe dimensions  (2769, 6)
Positive changes : 1507
Negative changes : 1262
No changes : 0
Performing sentiment analysis for stock V


100%|██████████| 37/37 [00:01<00:00, 24.33it/s]


 Dataframe now contains sentiment score for 16 different dates.
The bot found 2617 headlines from analyst_ratings_processed.csv, regarding HD stock
The bot found 1841 headlines from raw_partner_headlines.csv, regarding HD stock
The bot found 4458 headlines in total, regarding HD stock
[*********************100%***********************]  1 of 1 completed
HD financial dataframe dimensions  (2769, 6)
Positive changes : 1479
Negative changes : 1290
No changes : 0
Performing sentiment analysis for stock HD


100%|██████████| 4116/4116 [02:37<00:00, 26.10it/s]


 Dataframe now contains sentiment score for 1339 different dates.
The bot found 1796 headlines from analyst_ratings_processed.csv, regarding BAC stock
The bot found 5 headlines from raw_partner_headlines.csv, regarding BAC stock
The bot found 1801 headlines in total, regarding BAC stock
[*********************100%***********************]  1 of 1 completed
BAC financial dataframe dimensions  (2769, 6)
Positive changes : 1392
Negative changes : 1377
No changes : 0
Performing sentiment analysis for stock BAC


100%|██████████| 1723/1723 [01:15<00:00, 22.93it/s]


 Dataframe now contains sentiment score for 830 different dates.
The bot found 593 headlines from analyst_ratings_processed.csv, regarding XOM stock
The bot found 5 headlines from raw_partner_headlines.csv, regarding XOM stock
The bot found 598 headlines in total, regarding XOM stock
[*********************100%***********************]  1 of 1 completed
XOM financial dataframe dimensions  (2769, 6)
Positive changes : 1358
Negative changes : 1411
No changes : 0
Performing sentiment analysis for stock XOM


100%|██████████| 582/582 [00:26<00:00, 22.09it/s]


 Dataframe now contains sentiment score for 291 different dates.
The bot found 2188 headlines from analyst_ratings_processed.csv, regarding MA stock
The bot found 8 headlines from raw_partner_headlines.csv, regarding MA stock
The bot found 2196 headlines in total, regarding MA stock
[*********************100%***********************]  1 of 1 completed
MA financial dataframe dimensions  (2769, 6)
Positive changes : 1531
Negative changes : 1238
No changes : 0
Performing sentiment analysis for stock MA


100%|██████████| 2026/2026 [01:18<00:00, 25.74it/s]


 Dataframe now contains sentiment score for 1044 different dates.
The bot found 10 headlines from analyst_ratings_processed.csv, regarding DIS stock
The bot found 933 headlines from raw_partner_headlines.csv, regarding DIS stock
The bot found 943 headlines in total, regarding DIS stock
[*********************100%***********************]  1 of 1 completed
DIS financial dataframe dimensions  (2769, 6)
Positive changes : 1452
Negative changes : 1317
No changes : 0
Performing sentiment analysis for stock DIS


100%|██████████| 869/869 [00:33<00:00, 25.76it/s]


 Dataframe now contains sentiment score for 133 different dates.
The bot found 1798 headlines from analyst_ratings_processed.csv, regarding PFE stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding PFE stock
The bot found 1798 headlines in total, regarding PFE stock
[*********************100%***********************]  1 of 1 completed
PFE financial dataframe dimensions  (2769, 6)
Positive changes : 1372
Negative changes : 1397
No changes : 0
Performing sentiment analysis for stock PFE


100%|██████████| 1716/1716 [01:14<00:00, 23.03it/s]


 Dataframe now contains sentiment score for 730 different dates.
The bot found 0 headlines from analyst_ratings_processed.csv, regarding CVX stock
The bot found 0 headlines from raw_partner_headlines.csv, regarding CVX stock
The bot found 0 headlines in total, regarding CVX stock
[*********************100%***********************]  1 of 1 completed
CVX financial dataframe dimensions  (2769, 6)
Positive changes : 1407
Negative changes : 1362
No changes : 0
Performing sentiment analysis for stock CVX


0it [00:00, ?it/s]

 Dataframe now contains sentiment score for 0 different dates.





The bot found 539 headlines from analyst_ratings_processed.csv, regarding ABBV stock
The bot found 214 headlines from raw_partner_headlines.csv, regarding ABBV stock
The bot found 753 headlines in total, regarding ABBV stock
[*********************100%***********************]  1 of 1 completed
ABBV financial dataframe dimensions  (2015, 6)
Positive changes : 1072
Negative changes : 943
No changes : 0
Performing sentiment analysis for stock ABBV


100%|██████████| 724/724 [00:34<00:00, 21.02it/s]


 Dataframe now contains sentiment score for 384 different dates.
The bot found 2785 headlines from analyst_ratings_processed.csv, regarding KO stock
The bot found 77 headlines from raw_partner_headlines.csv, regarding KO stock
The bot found 2862 headlines in total, regarding KO stock
[*********************100%***********************]  1 of 1 completed
KO financial dataframe dimensions  (2769, 6)
Positive changes : 1431
Negative changes : 1338
No changes : 0
Performing sentiment analysis for stock KO


 31%|███▏      | 846/2705 [00:34<01:15, 24.66it/s]


KeyboardInterrupt: 

In [None]:
# Inspect the combined frame built by the processing cell.
big_df

Unnamed: 0,date,stock,Open,Close,Volume,Positive,Negative,Neutral,Price_change
0,2020-03-09,AAPL,65.937500,66.542503,286744800,0.046127,0.411465,0.542409,-1
1,2020-03-10,AAPL,69.285004,71.334999,285290000,0.070845,0.449025,0.480130,1
2,2020-03-11,AAPL,69.347504,68.857498,255598800,0.190995,0.453761,0.355244,-1
3,2020-03-12,AAPL,63.985001,62.057499,418474000,0.204221,0.447518,0.348261,-1
4,2020-03-13,AAPL,66.222504,69.492500,370732000,0.315863,0.218127,0.466010,1
...,...,...,...,...,...,...,...,...,...
5127,2020-05-28,HD,249.520004,245.139999,5259300,0.145661,0.047349,0.806990,-1
5128,2020-06-02,HD,248.630005,252.710007,5301000,0.349589,0.185349,0.465062,1
5129,2020-06-04,HD,249.770004,248.949997,4759400,0.033993,0.045346,0.920662,-1
5130,2020-06-05,HD,252.339996,254.899994,5054300,0.211447,0.082010,0.706543,1


In [None]:
# Write the current contents of big_df to the same CSV path used by the processing cell.
big_df.to_csv('Datasets/sp500_news.csv')

In [None]:
# Distinct values of the `date` column across all stocks in the combined frame.
dates = big_df["date"].unique()

In [None]:
# Number of distinct dates present in big_df.
len(dates)

2216

In [None]:
# (rows, columns) of the combined frame.
big_df.shape

(5132, 9)

In [None]:
# Fetch quotes for the ^GSPC symbol over the requested 2009-01-01 to 2021-01-01 window.
# NOTE(review): the output recorded for this cell reports a failed download
# ("No data found for this date range") and the resulting frame is empty;
# confirm the data source works before relying on `sp500` downstream.
sp500 = pdr.get_data_yahoo(
    "^GSPC",
    start="2009-01-01",
    end="2021-01-01",
)

[*********************100%***********************]  1 of 1 completed

1 Failed download:
- ^GSPC: No data found for this date range, symbol may be delisted


In [None]:
# Show the frame returned by the ^GSPC download to check what was retrieved.
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
