# Data collection (Financial data only)

The very first step is to collect data!
The data will be collected and saved in three folders:
- #### End of day data
Historical end of day data for stock prices.

- #### Fundamental data
Fundamental data for a given stock i.e eps and outstanding shares.


In [1]:
!pip install eodhd

Collecting numpy==1.21.6 (from eodhd)
  Using cached numpy-1.21.6-cp310-cp310-win_amd64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.25.2
    Uninstalling numpy-1.25.2:
      Successfully uninstalled numpy-1.25.2


ERROR: Could not install packages due to an OSError: [WinError 5] Adgang nægtet: 'C:\\Users\\Soren\\.conda\\envs\\pydata-book\\Lib\\site-packages\\numpy\\~libs\\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll'
Consider using the `--user` option or check the permissions.



In [1]:
import pandas as pd
import numpy as np
#import pandas_datareader.data as web
from datetime import datetime
from pathlib import Path
import requests
import json
import tqdm
import time
import os
import tqdm
from eodhd import APIClient
import datetime
import matplotlib.pyplot as plt
import pprint
from bs4 import BeautifulSoup as BS


In [2]:
api = "64d77f6d3a60a5.24835840" #API key
eod_path = Path.cwd()/"Eod" #path to create folder where end of day stock prices will be stored
fundamentals_path = Path.cwd()/"Fundamentals"
SingleStock_data = Path.cwd()/"SingleStock_data"
Path.mkdir(eod_path, exist_ok=True)
Path.mkdir(fundamentals_path, exist_ok = True)
Path.mkdir(SingleStock_data, exist_ok=True)

## Define functions

In [4]:
def stock_eod(ticker):
    """
    Get stock data End of Day prices.
    
    Args(str): ticker name of the stock
    
    Returns a pandas dataframe outer-merged on 'datetime' 
    """
    
    ###MAKE API CLALL###
    Timeseries = requests.get(f'https://eodhistoricaldata.com/api/eod/{ticker}.US?order=d&from=2022-01-01&to=2023-07-31&period=d&&api_token={api}&fmt=json')
    
    ### convert json files to pandas dataframes
    timeseries_df = pd.DataFrame(Timeseries.json())
    timeseries_df["date"] = pd.to_datetime(timeseries_df["date"])  #change date format to datetime
    
    stock_df = timeseries_df
    
    return stock_df

def stock_fundamental(ticker):
    
    """
    Collect fundamentals for a stock given its ticker
    
    Args(str): ticker
    
    Returns a pandas dataframe with fundamentals data
    """
    
    #Get all fundamental data for a given ticker 
    url = f'https://eodhistoricaldata.com/api/fundamentals/{ticker}.US?api_token={api}&order=d&filter=outstandingShares::quarterly,Earnings::History'
    resp = requests.get(url) #make request
    json = resp.json() #convert to json

    #get outstandingShares data and convert to dataframe
    out = pd.DataFrame(json["outstandingShares::quarterly"]).T
    out = out.drop(["date", "sharesMln"], axis = 1).rename(columns = {"dateFormatted":"date", "shares":"outstandingShares"})
    out["date"] = pd.to_datetime(out["date"]) #change dateformat to datetime
    out.set_index('date', inplace =True)
    out = out.resample('D').ffill() #Fill values in between days
    out.sort_index(ascending=False, inplace=True)
    out.reset_index(inplace=True)

    #get earnings data and convert to dataframe
    earn = pd.DataFrame(json["Earnings::History"]).T
    earn = earn.drop(["reportDate", "beforeAfterMarket"], axis = 1).reset_index().drop("index",axis = 1)
    earn["date"] = pd.to_datetime(earn["date"]) #change dateformat to datetime
    earn.set_index('date', inplace =True)
    earn = earn.resample('D').ffill() #Fill values in between days
    earn.sort_index(ascending=False, inplace=True)
    earn.reset_index(inplace=True)

    fundamental = earn.merge(out, how = "left", on = "date")
    fundamental = fundamental[(fundamental["date"]<='2023-07-31') & (fundamental["date"]>='2022-01-01')].reset_index()
    
    #write a status to see if all data was gotten. Used in logfile later
    status = "No"
    if fundamental.shape == (577,8):
        status = "Yes"
    else:
        status = "No"
    
    return fundamental, status


def df_to_csv(df, name):
    """
    Save a pandas dataframe into a csv file
    
    Args(pd.DataFrame, str): dataframe and the name of the file it should return.
    When working with stock data name should be the ticker of the specified stock.
    
    requirements = requests, pandas, json should be installed and imported.
    
    returns 0, creates a csv file in /Data/Financial data
    """

    pathname = Path.cwd()/f"SingleStock_data/{name}.csv"
    
    df.to_csv(pathname, index = False)
    
    return None

def log(ticker, df, logfile, accept,output_path=os.getcwd()):
    #open or create the csv file
    if os.path.isfile(logfile): #if log file exist, open and allow changes
        log = open(logfile,'a')
    else:
        log = open(logfile,'w')
        header = ['Timestamp', 'Company', 'shape', 'Accept']
        log.write(";".join(header)+"\n") #Make the headers and jump to the new line
    
    #Gather log information
    company = f"{ticker}"
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #local time
    shape = df.shape
    
    #Open the log file and append the gathered log information
    with open(logfile, 'a') as log:
        log.write(f'{timestamp};{company};{shape};{accept}' + "\n") #Append the information and jump to the new line.

## Get data

In [5]:
# Part of the name that is constant
nasdaq_part = 'nasdaq_screener'

Path_ = os.getcwd()

cwd = Path.cwd()

file_list = os.listdir(Path_)

# Filter files based on the target part of the name since the name can change
filtered_files = [file_name for file_name in file_list if nasdaq_part in file_name and file_name.endswith('.csv')]

# Checking if there is a file with the specified name
if len(filtered_files) == 1:
    selected_file = filtered_files[0]
    full_path = os.path.join(Path_, selected_file)
    print("Selected file:", full_path)
elif len(filtered_files) > 1:
    print("Multiple files found matching the criteria.")
else:
    print("No matching file found.")
    
# Read the file into a dataframe
nasdaq = pd.read_csv(full_path)

Selected file: c:\Users\Soren\Documents\GitHub\IntroSocial23\Exam final version\Scripts and data\nasdaq_screener_1692728308111.csv


In [6]:
#Get list with all US-listed companies in NASDAQ exchange over 50m market cap
nasdaq = nasdaq.sort_values("Market Cap", ascending = False).reset_index().drop("index", axis = 1)
tickers = nasdaq["Symbol"].values
# It is important to notice that depending on the day this list will change, due delistings, reslistings and new listings.
len(tickers)

2174

In [7]:
#User new list of tickers instead
#Note: this block was made after the whole code below was run for the first time. We had to get the data all over again and removed some of the original tickers shown above

tickers = pd.read_csv("TickerNames_2022.csv")["Symbol"].values
tickers = tickers
len(tickers)

2022

In [15]:

logfile = Path.cwd()/'financial_data_logs.csv'
missing = []
index_missing = []
fundamentals_failed = []
fundamentals_index = []

for i, ticker in tqdm.tqdm(enumerate(tickers)):
    try:
        #Get End-of-Day stock price data
        eod = stock_eod(ticker)
        #Get fundamentals
        fund, status = stock_fundamental(ticker)
        if status == "No":
            fundamentals_failed.append(ticker)
            fundamentals_index.append(ticker)
        
        try:
            eod.to_csv(cwd/f'Eod/{ticker}.csv', index = False)  #save end of day data only 
            fund.to_csv(cwd/f'Fundamentals/{ticker}.csv', index = False)  #save fundamental data only

            #merge fundamental and end of day data on date
            stock = eod.merge(fund, how = "left", on = "date")
            df_to_csv(stock,ticker) #save all data for the single stock
            log(ticker, stock, logfile, status) #track all succeded logs
        except:
            print(f"Data for company {ticker} was retrieved from API but not saved")
                
    except:
        print(f'Company {ticker} was not retrived from API')
        missing.append(ticker)
        index_missing.append(i)
        time.sleep(5)
            
    time.sleep(0.5)


2it [00:04,  2.42s/it]

Company GOOG was not retrived from API


5it [00:17,  3.34s/it]

In [12]:
missing_values = pd.DataFrame([missing, index_missing]).T
missing_values.to_csv("missing_stocks.csv", index = False)
fundamentals_missing = pd.DataFrame([fundamentals_failed,fundamentals_index]).T

NameError: name 'missing' is not defined

### Validate datacollection
As shown above there are 70 stocks that could not be retrieved. After some consideration it was decided to drop these stocks because of two reason.
Some of them are just to small or were listed after 2023-06-01, meaning there will be some missing data.

The last argument needs to be validated for all the stocks there is data on, as if some stocks have less available dates or na values then they should be reevaluated.

Therefore, the next block will be about evaluation of stocks

In [13]:
#create a validation log that saves the information of all stocks
def log_val(ticker, df, logfile, output_path=os.getcwd()):
    #open or create the csv file
    if os.path.isfile(logfile): #if log file exist, open and allow changes
        log = open(logfile,'a')
    else:
        log = open(logfile,'w')
        header = ['Timestamp', 'Ticker', 'Shape', 'Accept', 'Nr. NaN values']
        log.write(";".join(header)+"\n") #Make the headers and jump to the new line
    
    accept = "Yes"
    accept_cond = (267,14)
    #Gather log information
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #local time
    shape = df.shape
    if df.shape != accept_cond:
        accept = "No"
    nan_val = sum(df.isna().sum().values)
    
    #Open the log file and append the gathered log information
    with open(logfile, 'a') as log:
        log.write(f'{timestamp};{ticker};{shape};{accept};{nan_val}' + "\n") #Append the information and jump to the new line.
        
#create a validation log that saves the information of all stocks fundamentals
def log_val_fund(ticker, df, logfile, output_path=os.getcwd()):
    #open or create the csv file
    if os.path.isfile(logfile): #if log file exist, open and allow changes
        log = open(logfile,'a')
    else:
        log = open(logfile,'w')
        header = ['Timestamp', 'Ticker', 'Shape', 'Accept', 'Nr. NaN values']
        log.write(";".join(header)+"\n") #Make the headers and jump to the new line
    
    accept = "Yes"
    accept_cond = (365,8) #the shape it should have in the research period
    #Gather log information
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #local time
    shape = df.shape
    if df.shape != accept_cond:
        accept = "No"
    nan_val = sum(df.isna().sum().values)
    
    #Open the log file and append the gathered log information
    with open(logfile, 'a') as log:
        log.write(f'{timestamp};{ticker};{shape};{accept};{nan_val}' + "\n") #Append the information and jump to the new line.

#create a validation log that saves the information of all stocks eod
def log_val_eod(ticker, df, logfile, output_path=os.getcwd()):
    #open or create the csv file
    if os.path.isfile(logfile): #if log file exist, open and allow changes
        log = open(logfile,'a')
    else:
        log = open(logfile,'w')
        header = ['Timestamp', 'Ticker', 'Shape', 'Accept', 'Nr. NaN values']
        log.write(";".join(header)+"\n") #Make the headers and jump to the new line
    
    accept = "Yes"
    accept_cond = (267,7)
    #Gather log information
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #local time
    shape = df.shape
    if df.shape != accept_cond:
        accept = "No"
    nan_val = sum(df.isna().sum().values)
    
    #Open the log file and append the gathered log information
    with open(logfile, 'a') as log:
        log.write(f'{timestamp};{ticker};{shape};{accept};{nan_val}' + "\n") #Append the information and jump to the new line.
        


In [9]:
tickers #all tickers
missing_values = pd.read_csv("missing_stocks.csv")
idxs = missing_values["1"].values #index for missing companies

#### Validation of Eod data collection

In [10]:
aapl = pd.read_csv(eod_path/"AAPL.csv")
aapl["date"]=pd.to_datetime(aapl["date"])
aapl[(aapl["date"]>'2022-05-10') & (aapl["date"]<'2023-06-03')].shape

(267, 7)

In [14]:
#Get data for each stock and check it has the right shape. The right shape will be the same shape as apple stock has (has date on all cells)
aapl = pd.read_csv(eod_path/"AAPL.csv")
print(f"The right shape for at stock is {aapl.shape}")
logval = Path.cwd()/f'Stock_validation_eod.csv'

for i in tqdm.tqdm(range(len(tickers))):
    #check if i stock is one of the 70 missing
    
    df = pd.read_csv(eod_path/f"{tickers[i]}.csv")
    df["date"]=pd.to_datetime(df["date"])
    df = df[(df["date"]>'2022-05-10') & (df["date"]<'2023-06-03')]
    log_val_eod(tickers[i], df, logval)
    

The right shape for at stock is (395, 7)


100%|██████████| 2022/2022 [00:10<00:00, 199.66it/s]


In [None]:
#The commented code below is used in case some companies as missing - this is no longer the case after data extraction number 2
'''    
abval = 0
for j in range(len(idxs)):
        if i==0:
            continue
        elif i==idxs[j]:
            abval = 1
            #print("Missing company found with index: ",i)
    if abval == 1:
        continue'''

In [15]:
#Get log-validation file created above
validation_eod = pd.read_csv("Stock_validation_eod.csv", sep = ";")

#Find all stocks that were not accepted due to data shortage
eod_del = validation_eod[validation_eod["Accept"]=="No"]["Ticker"].values

'''if sum(eod_del)==0:
    print(f"There's complete data for all {len(tickers)} companies!")'''

'if sum(eod_del)==0:\n    print(f"There\'s complete data for all {len(tickers)} companies!")'

In [16]:
#Now we have 70 companies that could not be retrieved for API and 96 that were listed after the given date period.
#Its time to remove these companies from the dataset

#Remove Non-retrieved-data from tickers list:
tickers_new = np.delete(tickers, idxs)

#Remove tickers with incomple datasets from tickers_list
index_eod_remove = []
for i in tqdm.tqdm(range(len(tickers_new))):
    for j in range(len(eod_del)):
        if tickers_new[i] == eod_del[j]:
            index_eod_remove.append(i)
tickers_final = np.delete(tickers_new, index_eod_remove)
len(tickers_final)

#Save as csv file
pd.DataFrame(tickers_final, columns = ["Ticker"]).to_csv("Tickers_final.csv", index = False)

IndexError: index 2099 is out of bounds for axis 0 with size 2022

In [171]:
#Now its time to remove 
#as this code is a bit dangerous i commented it
#THINK CAREFULLY before using!!! it will delete a lot of data

'''eod_succed = []
fundamentals_succed = []
StockSingle_succed = []
check = []

#Delete all files from stocks with data less than one year
for ticker in tqdm.tqdm(tickers_new):
    for j in range(len(eod_del)):
        if ticker==eod_del[j]:
            check.append(ticker)
            if os.path.exists(eod_path/f'{ticker}.csv'):
                os.remove((eod_path/f'{ticker}.csv'))
                eod_succed.append(1)
            else:
                eod_succed.append(0)

            if os.path.exists(fundamentals_path/f'{ticker}.csv'):
                os.remove((fundamentals_path/f'{ticker}.csv'))
                fundamentals_succed.append(1)
            else:
                fundamentals_succed.append(0)

            if os.path.exists(SingleStock_data/f'{ticker}.csv'):
                os.remove((SingleStock_data/f'{ticker}.csv'))
                StockSingle_succed.append(1)
            else:
                StockSingle_succed.append(0)

print("Eod's removed: ",sum(eod_succed))
print("Fundamentals's removed: ",sum(fundamentals_succed))
print("StockSingle's removed: ",sum(StockSingle_succed))'''


100%|██████████| 2118/2118 [00:00<00:00, 15272.68it/s]

Eod's removed:  96
Fundamentals's removed:  96
StockSingle's removed:  96





#### Validation of fundamental data collection

In [18]:
#tickers_final = np.delete(tickers, idxs)
aapl = pd.read_csv(fundamentals_path/"AAPL.csv")
print(f"The right shape for at stock is {aapl.shape}")
logval = Path.cwd()/f'Stock_validation_fundamentals.csv'

for i in tqdm.tqdm(range(len(tickers))):
    df = pd.read_csv(fundamentals_path/f"{tickers[i]}.csv")
    df["date"] = pd.to_datetime(df["date"])
    df = df[(df["date"]>'2022-06-01')&(df["date"]<'2023-06-02')]
    log_val_fund(tickers[i], df, logval)


The right shape for at stock is (577, 8)


100%|██████████| 2022/2022 [00:11<00:00, 178.03it/s]


In [19]:
#Get log-validation file created above
validation_fund = pd.read_csv("Stock_validation_fundamentals.csv", sep = ";")

#Save a variable containing the tickers of datasets that were not accepted (Shape != (368,8))
BadShape = validation_fund[validation_fund["Accept"]=="No"]
BadShape


Unnamed: 0,Timestamp,Ticker,Shape,Accept,Nr. NaN values
94,2023-08-22 21:38:28,SABRP,"(0, 8)",No,0.0
95,2023-08-22 21:38:28,HBANP,"(0, 8)",No,0.0
125,2023-08-22 21:38:28,FITBI,"(0, 8)",No,0.0
135,2023-08-22 21:38:28,FITBP,"(0, 8)",No,0.0
148,2023-08-22 21:38:28,AGNCN,"(0, 8)",No,0.0
...,...,...,...,...,...
1979,2023-08-22 21:38:38,BAFN,"(303, 8)",No,543.0
1994,2023-08-22 21:38:39,MOVE,"(303, 8)",No,545.0
1996,2023-08-22 21:38:39,DUNE,"(303, 8)",No,907.0
1997,2023-08-22 21:38:39,DUNEU,"(0, 8)",No,0.0


### Single Stock data (merged date) validation

In [20]:
print(f"The right shape for at stock is {aapl.shape}")
logval = Path.cwd()/f'Stock_validation_merged.csv'

for i in tqdm.tqdm(range(len(tickers))):
    df = pd.read_csv(SingleStock_data/f"{tickers[i]}.csv")
    df["date"] = pd.to_datetime(df["date"])
    df = df[(df["date"]>'2022-05-10')&(df["date"]<'2023-06-03')]
    log_val(tickers[i], df, logval)


The right shape for at stock is (577, 8)


100%|██████████| 2022/2022 [00:12<00:00, 164.66it/s]


In [21]:
#Get log-validation file created above
validation_merge = pd.read_csv("Stock_validation_merged.csv", sep = ";")

#Save a variable containing the tickers of datasets that were not accepted (Shape != (368,8))
BadShape = validation_merge[validation_merge["Accept"]=="No"]


In [22]:
validation_merge[validation_merge["Accept"]=="No"]

Unnamed: 0,Timestamp,Ticker,Shape,Accept,Nr. NaN values
1075,2023-08-22 21:39:02,MOND,"(266, 14)",No,0
1213,2023-08-22 21:39:03,LIFW,"(258, 14)",No,89
1355,2023-08-22 21:39:03,IVCP,"(265, 14)",No,965
1667,2023-08-22 21:39:05,LCA,"(266, 14)",No,968
1722,2023-08-22 21:39:06,BGXX,"(263, 14)",No,899
1783,2023-08-22 21:39:06,FXNC,"(264, 14)",No,477
1837,2023-08-22 21:39:06,CSBR,"(266, 14)",No,0
1972,2023-08-22 21:39:07,ITAQ,"(265, 14)",No,967
