In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

In [5]:
# Define the path to the overview of companies
file_path = Path.cwd()/"Nasdaq mkt. cap 50m+.csv"
cwd = Path.cwd()

# Read the csv file
df = pd.read_csv(file_path)
df = df[["Symbol", "Market Cap"]]
df = df.reset_index(drop = True)

In [6]:
# Create a dictionary of the symbols and market caps
dictionary = dict(zip(df["Symbol"],df["Market Cap"]))

In [7]:
# Read final df and map market cap
data = pd.read_csv(cwd/'Final_df.csv', sep = ';')
data = data.drop("Unnamed: 0", axis = 1)
data["Market Cap"] = data["Ticker"].map(dictionary)

final_df = data.copy()

In [8]:
# Import df_all_news_from_API.csv
df_all_news_from_API = pd.read_csv(cwd/'df_all_news_final.csv', sep = ';')

In [9]:
# df_all_news_from_API

# Merge data and df_all_news_from_API data using Ticker as key
# data_merged = pd.merge(df_all_news_from_API,data, left_on = 'ticker', right_on = 'Ticker', how = "left")
df_all_news_from_API["Market Cap"] = df_all_news_from_API["ticker"].map(dictionary)


In [10]:
data_merged = df_all_news_from_API

# Filter all tickers with 1000 news, since the maximum number of news from EODHD is 1000 per ticker
# Which would give a period with no news for the ticker
df_filter = data_merged.groupby("ticker")['title'].count().sort_values(ascending = False).reset_index()
df_filter = df_filter[(df_filter['title'] > 999)]
tickers = df_filter['ticker'].reset_index(drop = True)

tickers

0       AMD
1      INTC
2     CMCSA
3      NFLX
4      ATVI
5       AAL
6      MRNA
7      SBUX
8      META
9      QCOM
10      PEP
11     COST
12     GOOG
13    GOOGL
14     AMZN
15     COIN
16     MSFT
17     PYPL
18     NVDA
19       MU
20     TSLA
21     ABNB
22     AAPL
23      UAL
Name: ticker, dtype: object

In [11]:
# datetime
data['date'] = pd.to_datetime(data['date'])


# For while to print plots of each ticker to see for how long the news are available
# If they only are missing 1 month they will still be included in the analysis
data1 = data.copy()

for ticker in tickers:
    data = data1[data1['Ticker'] == ticker]
    
    plt.figure(figsize = (20,10))
    plt.plot(data['date'],data['LLM_score_sum'], label = ticker)
    plt.legend()
    plt.savefig(ticker + '.png')
    plt.close()



In [12]:
# ATVI
tickers_excluded = tickers.drop([2,4,9,15])
tickers_excluded.reset_index(drop = True)

# Tickers excluded saved to csv
tickers_excluded.to_csv(cwd/'tickers_excluded.csv', sep = ';')


In [13]:
tickers_excluded

0       AMD
1      INTC
3      NFLX
5       AAL
6      MRNA
7      SBUX
8      META
10      PEP
11     COST
12     GOOG
13    GOOGL
14     AMZN
16     MSFT
17     PYPL
18     NVDA
19       MU
20     TSLA
21     ABNB
22     AAPL
23      UAL
Name: ticker, dtype: object

In [14]:
# List with excluded companies
exclusion_list = tickers_excluded.tolist()

master_df = final_df[~final_df.Ticker.isin(exclusion_list)]


Unnamed: 0,date,open,high,low,adjusted_close,volume,RSI,MFI,volatility,returns,prev_returns,returns_pred,Ticker,LLM_score_sum,LLM_score_mean,Market Cap
0,2023-06-02,8.01,8.0900,7.8700,8.0600,74700,66.502362,48.430352,0.029908,2.025316,-1.250000,0.868486,AADI,0.0,0.0,1.456480e+08
2,2023-06-02,2.21,2.2700,2.1600,2.2700,148200,26.678504,47.236530,0.089979,2.714932,-0.450450,2.643172,AAOI,0.0,0.0,4.767921e+08
3,2023-06-02,88.82,91.6800,88.7300,91.5033,243389,36.479318,50.327996,0.016983,4.459876,1.223862,-0.458563,AAON,0.0,0.0,5.236064e+09
5,2023-06-02,33.23,34.9400,32.8800,34.4575,462984,43.577750,72.456425,0.023756,6.524891,2.946755,-3.091635,ABCB,0.0,0.0,2.983360e+09
6,2023-06-02,3.09,3.4500,3.0200,3.3100,236500,29.923428,31.135061,0.059234,8.169935,-0.970874,21.752266,ABEO,0.0,0.0,7.723832e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494863,2022-06-01,33.26,33.3400,32.2600,32.9700,557900,68.832023,57.020542,0.033060,0.487656,-3.300914,3.639672,ZUMZ,-1.0,-1.0,3.625572e+08
494864,2022-06-01,4.64,4.7050,4.4900,4.5700,257036,45.655482,44.451853,0.035933,-1.295896,-1.279318,3.501094,ZVRA,0.0,0.0,1.829617e+08
494865,2022-06-01,6.85,7.1100,6.6450,6.9800,996500,59.232450,56.263023,0.052836,3.869048,-2.183406,-3.295129,ZYME,0.0,0.0,5.080219e+08
494866,2022-06-01,0.97,0.9899,0.9021,0.9050,237207,39.897928,87.930630,0.087510,-5.483029,-1.875384,7.182320,ZYNE,0.0,0.0,7.243086e+07


In [15]:
# Save master_df to csv
master_df.to_csv(cwd/'master_df.csv', sep = ';')