In [173]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web
from datetime import datetime
from pathlib import Path
import requests
import nasdaqdatalink
import json
import tqdm
import time
import os
import tqdm
from eodhd import APIClient
import datetime
import matplotlib.pyplot as plt

In [150]:
# EODHD API token. It is read from the environment so the secret is never stored in
# (or committed with) the notebook: export EODHD_API_TOKEN before starting the kernel.
api = os.environ.get("EODHD_API_TOKEN", "")
if not api:
    # Fail fast: without a token every request in the download loop would fail.
    raise RuntimeError("Set the EODHD_API_TOKEN environment variable to your EODHD API token")

# Output folders, created in the current working directory if they do not exist yet.
eod_path = Path.cwd()/"Eod"  # end-of-day stock prices, one CSV per ticker
fundamentals_path = Path.cwd()/"Fundamentals"  # fundamentals, one CSV per ticker
SingleStock_data = Path.cwd()/"SingleStock_data"  # merged prices + fundamentals, one CSV per ticker
for _folder in (eod_path, fundamentals_path, SingleStock_data):
    _folder.mkdir(exist_ok=True)

## Define functions

In [3]:
def convert_json(response):
    """
    Build a pandas dataframe from the 'values' entry of a JSON API response.

    Args: response object exposing a .json() method whose payload has a 'values' key

    returns: pandas dataframe built from payload['values']
    """
    payload = response.json()
    records = payload['values']
    return pd.DataFrame(records)

def stock_eod(ticker, start="2022-05-31", end="2023-06-03"):
    """
    Get End of Day prices for a US-listed stock from the EOD Historical Data API.

    Args:
        ticker (str): ticker name of the stock (".US" is appended for the request)
        start (str): first date of the requested window, 'YYYY-MM-DD'
        end (str): last date of the requested window, 'YYYY-MM-DD'

    Returns a pandas dataframe built from the JSON response, with the "date"
    column converted to datetime. Rows are requested in descending date order.

    Raises:
        requests.HTTPError: if the API answers with an error status
        requests.Timeout: if the API does not answer within 30 seconds
        KeyError: if the response contains no "date" field (e.g. an empty result)
    """
    # Uses the module-level `api` token defined in the configuration cell.
    response = requests.get(
        f"https://eodhistoricaldata.com/api/eod/{ticker}.US",
        params={
            "order": "d",
            "from": start,
            "to": end,
            "period": "d",
            "api_token": api,
            "fmt": "json",
        },
        timeout=30,  # never let one stuck request block a multi-hour download loop
    )
    # Surface HTTP errors explicitly instead of failing later while parsing the body.
    response.raise_for_status()

    stock_df = pd.DataFrame(response.json())
    stock_df["date"] = pd.to_datetime(stock_df["date"])  # change date format to datetime

    return stock_df

def stock_fundamental(ticker, start="2022-05-31", end="2023-06-02"):
    """
    Collect fundamentals (earnings history and outstanding shares) for a stock.

    The quarterly records are merged on their date, forward-filled to daily
    frequency and restricted to the window [start, end].

    Args:
        ticker (str): ticker name of the stock (".US" is appended for the request)
        start (str): first date kept in the result, 'YYYY-MM-DD'
        end (str): last date kept in the result, 'YYYY-MM-DD'

    Returns a pandas dataframe with daily fundamentals data, sorted by descending
    date, with "date" as a column and an extra leading "index" column.

    Raises:
        requests.HTTPError: if the API answers with an error status
        requests.Timeout: if the API does not answer within 30 seconds
        KeyError: if an expected section or column is missing from the response
    """
    # Get the filtered fundamental data for the ticker (uses the module-level `api` token)
    url = f'https://eodhistoricaldata.com/api/fundamentals/{ticker}.US?api_token={api}&order=d&filter=outstandingShares::quarterly,Earnings::History'
    resp = requests.get(url, timeout=30)  # bounded wait so one request cannot hang the loop
    resp.raise_for_status()  # surface HTTP errors explicitly
    payload = resp.json()  # named `payload` so the imported `json` module is not shadowed

    # Outstanding shares: transpose so each record is a row, keep the first 6 rows
    out = pd.DataFrame(payload["outstandingShares::quarterly"]).T
    out = out.drop(["date", "sharesMln"], axis=1)[:6].rename(
        columns={"dateFormatted": "date", "shares": "outstandingShares"}
    )
    out["date"] = pd.to_datetime(out["date"])  # change date format to datetime

    # Earnings history: keep rows 4..9
    # NOTE(review): presumably the first rows are not-yet-reported quarters - confirm the offset.
    earn = pd.DataFrame(payload["Earnings::History"]).T
    earn = earn.drop(["reportDate", "beforeAfterMarket"], axis=1)[4:10].reset_index(drop=True)
    earn["date"] = pd.to_datetime(earn["date"])  # change date format to datetime

    # Merge both dataframes and expand from quarterly to daily rows
    fund = earn.merge(out, how="inner", on="date").set_index("date")
    fundamental = (
        fund.resample("D").ffill()      # forward-fill each quarter's values day by day
        .sort_index(ascending=False)    # newest date first
        .reset_index()                  # date back as a column - needed for merging later
    )

    in_window = (fundamental["date"] <= end) & (fundamental["date"] >= start)
    # This reset_index() keeps the old positional index as an "index" column.
    # It is intentionally left as-is so the saved column count does not change.
    return fundamental[in_window].reset_index()


def df_to_csv(df, name):
    """
    Write a pandas dataframe to <current working directory>/SingleStock_data/<name>.csv.

    Args(pd.DataFrame, str): the dataframe to save and the file name without extension.
    When working with stock data name should be the ticker of the specified stock.

    The dataframe index is not written. The SingleStock_data folder must already exist.

    returns None
    """
    target = Path.cwd().joinpath(f"SingleStock_data/{name}.csv")
    df.to_csv(target, index=False)

def log(ticker, df, logfile, output_path=os.getcwd()):
    """
    Append one line about a processed ticker to a semicolon-separated log file.

    The file is created with a header line the first time it is written.

    Args:
        ticker (str): ticker that was just processed
        df: object whose len() is logged (normally the ticker's dataframe)
        logfile (str or Path): path of the log file
        output_path: unused; kept so existing calls stay valid

    Returns None
    """
    needs_header = not os.path.isfile(logfile)

    # Gather log information
    status_code = f"last call made happened in ticker {ticker}"
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # local time
    length = len(df)

    # A single handle, always closed by the context manager. The original opened a
    # second handle with a misspelled `enconding` keyword, which raised TypeError
    # as soon as the log file already existed.
    with open(logfile, 'a', encoding="utf-8") as handle:
        if needs_header:
            header = ['timestamp', 'Status', 'length', 'output_file']
            handle.write(";".join(header) + "\n")
        # NOTE(review): the header has four columns but each row has three, so
        # 'output_file' stays empty. Row format is left unchanged on purpose.
        handle.write(f'{timestamp};{status_code};{length}' + "\n")

## Get data

In [7]:
# Load the list of US-listed NASDAQ companies with a market cap above 50m,
# ordered from the largest to the smallest market cap.
cwd = Path.cwd()  # current working directory
nasdaq = (
    pd.read_csv("Nasdaq mkt. cap 50m+.csv")
    .sort_values("Market Cap", ascending=False)
    .reset_index(drop=True)
)
tickers = nasdaq["Symbol"].values
len(tickers)

2188

In [5]:

logfile = Path.cwd()/'financial_data_logs.csv'
missing = []        # tickers whose download failed
index_missing = []  # positions of those tickers in `tickers`

# Download prices and fundamentals for every ticker and store them as CSV files.
for i, ticker in tqdm.tqdm(enumerate(tickers), total=len(tickers)):
    try:
        eod = stock_eod(ticker)            # End-of-Day stock price data
        fund = stock_fundamental(ticker)   # fundamentals
    except Exception as exc:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and the reason for the failure is reported instead of being swallowed.
        print(f'Company {ticker} was not retrived from API ({exc!r})')
        missing.append(ticker)
        index_missing.append(i)
        time.sleep(60)  # long pause after a failure - presumably to back off from the API
    else:
        try:
            eod.to_csv(cwd/f'Eod/{ticker}.csv', index = False)  # save end of day data only
            fund.to_csv(cwd/f'Fundamentals/{ticker}.csv', index = False)  # save fundamental data only

            # merge fundamental and end of day data on date
            stock = eod.merge(fund, how = "inner", on = "date")
            df_to_csv(stock, ticker)  # save all data for the single stock
        except Exception as exc:
            print(f"Data for company {ticker} was retrieved from API but not saved ({exc!r})")

    time.sleep(0.5)  # short pause between consecutive tickers
    

87it [03:11,  2.31s/it]

Company HBANM was not retrived from API


160it [06:52,  2.08s/it]

Company PARAP was not retrived from API


170it [08:12,  2.85s/it]

Company AGNCL was not retrived from API


216it [10:54,  2.22s/it]

Company CHKEW was not retrived from API


226it [12:15,  2.91s/it]

Company CHKEZ was not retrived from API


244it [13:53,  2.12s/it]

Company CHKEL was not retrived from API


273it [15:53,  2.05s/it]

Company ONBPO was not retrived from API


275it [16:57, 14.69s/it]

Company ONBPP was not retrived from API


289it [18:32,  3.24s/it]

Company CGABL was not retrived from API


391it [23:25,  2.32s/it]

Company OXLCN was not retrived from API


397it [24:38,  5.15s/it]

Company OXLCZ was not retrived from API


470it [28:19,  2.13s/it]

Company DHCNI was not retrived from API


508it [30:41,  2.26s/it]

Company RTLPO was not retrived from API


522it [32:14,  2.43s/it]

Company FULTP was not retrived from API


568it [35:07,  2.86s/it]

Company FTAIN was not retrived from API


571it [36:13, 11.13s/it]

Company PACWP was not retrived from API


576it [37:24,  7.95s/it]

Company CSQ was not retrived from API


669it [41:46,  2.04s/it]

Company BPOPM was not retrived from API


741it [45:17,  2.08s/it]

Company FRMEP was not retrived from API


856it [50:44,  2.18s/it]

Company CDZIP was not retrived from API


872it [52:19,  2.32s/it]

Company MBINM was not retrived from API


882it [53:44,  3.54s/it]

Company SIGIP was not retrived from API


891it [55:05,  3.24s/it]

Company WAFDP was not retrived from API


895it [56:13,  8.56s/it]

Company RWAYZ was not retrived from API


898it [57:20, 13.15s/it]

Company RWAYL was not retrived from API


915it [58:59,  2.39s/it]

Company MBINO was not retrived from API


923it [1:00:18,  3.79s/it]

Company TRINL was not retrived from API


928it [1:01:32,  7.38s/it]

Company TCBIO was not retrived from API


940it [1:03:01,  2.80s/it]

Company HROWM was not retrived from API


948it [1:04:21,  3.86s/it]

Company FOSLL was not retrived from API


964it [1:05:57,  2.31s/it]

Company LIFWZ was not retrived from API


978it [1:07:30,  3.08s/it]

Company RUMBW was not retrived from API


997it [1:09:27,  3.73s/it]

Company MBINN was not retrived from API


998it [1:10:30, 21.25s/it]

Company GAINN was not retrived from API


1004it [1:11:43,  7.66s/it]

Company HPKEW was not retrived from API


1021it [1:13:23,  2.68s/it]

Company RILYO was not retrived from API


1023it [1:14:27, 15.06s/it]

Company GAINZ was not retrived from API


1056it [1:16:44,  2.42s/it]

Company RILYK was not retrived from API


1068it [1:18:11,  2.56s/it]

Company METCB was not retrived from API


1071it [1:19:17, 11.23s/it]

Company CNOBP was not retrived from API


1079it [1:20:36,  4.38s/it]

Company OPINL was not retrived from API


1131it [1:23:35,  2.46s/it]

Company SSSSL was not retrived from API


1138it [1:24:52,  4.71s/it]

Company GEGGL was not retrived from API


1172it [1:27:12,  2.16s/it]

Company CHSCL was not retrived from API


1198it [1:29:18,  2.51s/it]

Company MSBIP was not retrived from API


1224it [1:31:16,  2.22s/it]

Company TBLD was not retrived from API


1230it [1:32:32,  5.73s/it]

Company BWBBP was not retrived from API


1236it [1:33:45,  5.81s/it]

Company CHSCM was not retrived from API


1243it [1:35:05,  5.13s/it]

Company TFINP was not retrived from API


1285it [1:37:50,  2.47s/it]

Company CHSCN was not retrived from API


1324it [1:40:18,  2.37s/it]

Company THWWW was not retrived from API


1341it [1:41:57,  2.26s/it]

Company SEATW was not retrived from API


1381it [1:44:23,  2.18s/it]

Company CCLDO was not retrived from API


1389it [1:45:40,  3.62s/it]

Company GREEL was not retrived from API


1395it [1:46:53,  5.43s/it]

Company ATLCL was not retrived from API


1418it [1:48:57,  3.60s/it]

Company EVLVW was not retrived from API


1444it [1:50:57,  2.20s/it]

Company CHSCO was not retrived from API


1456it [1:52:28,  2.95s/it]

Company OFSSH was not retrived from API


1491it [1:54:56,  2.65s/it]

Company INDIW was not retrived from API


1539it [1:57:43,  2.35s/it]

Company WESTW was not retrived from API


1614it [2:01:24,  2.09s/it]

Company FGBIP was not retrived from API


1726it [2:06:42,  2.16s/it]

Company AVPTW was not retrived from API


1742it [2:08:19,  2.20s/it]

Company SCLXW was not retrived from API


1901it [2:15:25,  2.29s/it]

Company SOUNW was not retrived from API


1924it [2:17:14,  2.06s/it]

Company LNZAW was not retrived from API


1975it [2:20:09,  2.18s/it]

Company EOSEW was not retrived from API


2015it [2:22:40,  2.24s/it]

Company CMPOW was not retrived from API


2099it [2:26:51,  2.21s/it]

Company ADVWW was not retrived from API


2113it [2:28:21,  2.30s/it]

Company GCMGW was not retrived from API


2165it [2:31:19,  2.17s/it]

Company MYPSW was not retrived from API


2188it [2:33:14,  4.20s/it]


In [126]:
# Collect the tickers that could not be downloaded and their positions in `tickers`
# into a two-column frame (column 0: ticker, column 1: position).
# NOTE(review): `missing` and `index_missing` only exist after the download loop has run
# in the current kernel. A later cell reads "missing_stocks.csv" - presumably a saved copy
# of this frame; confirm where that file is written.
missing_values = pd.DataFrame([missing, index_missing]).T

NameError: name 'missing' is not defined

### Validate data collection
As shown above, there are 70 stocks that could not be retrieved. After some consideration it was decided to drop these stocks, for two reasons:
some of them are simply too small, and others were listed after 2023-06-01, meaning their data would be incomplete.

The second reason also needs to be checked for every stock that was retrieved: if a stock has fewer available dates or contains NaN values, it should be re-evaluated.

In [144]:
def _log_validation(ticker, df, logfile, accept_cond):
    """
    Append one validation line for `df` to a semicolon-separated log file.

    The file is created with a header line the first time it is written. A frame
    is accepted ("Yes") only when its shape equals `accept_cond`, otherwise "No".
    The total number of NaN cells in the frame is logged as well.
    """
    needs_header = not os.path.isfile(logfile)

    # Gather log information
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # local time
    shape = df.shape
    accept = "Yes" if shape == accept_cond else "No"
    nan_val = sum(df.isna().sum().values)

    # One handle only, always closed by the context manager (no leaked file objects).
    with open(logfile, 'a') as handle:
        if needs_header:
            header = ['Timestamp', 'Ticker', 'Shape', 'Accept', 'Nr. NaN values']
            handle.write(";".join(header) + "\n")
        handle.write(f'{timestamp};{ticker};{shape};{accept};{nan_val}' + "\n")


#create a validation log that saves the information of all stocks
def log_val(ticker, df, logfile, output_path=os.getcwd()):
    """
    Log the validation of a merged single-stock frame; expected shape is (254, 14).

    Args: ticker (str), df (pd.DataFrame), logfile (str or Path);
    output_path is unused and kept so existing calls stay valid.
    """
    _log_validation(ticker, df, logfile, (254, 14))


#create a validation log that saves the information of all stocks fundamentals
def log_val_fund(ticker, df, logfile, output_path=os.getcwd()):
    """
    Log the validation of a fundamentals frame; expected shape is (368, 8).

    Args: ticker (str), df (pd.DataFrame), logfile (str or Path);
    output_path is unused and kept so existing calls stay valid.
    """
    _log_validation(ticker, df, logfile, (368, 8))


#create a validation log that saves the information of all stocks eod
def log_val_eod(ticker, df, logfile, output_path=os.getcwd()):
    """
    Log the validation of an end-of-day price frame; expected shape is (254, 7).

    Args: ticker (str), df (pd.DataFrame), logfile (str or Path);
    output_path is unused and kept so existing calls stay valid.
    """
    _log_validation(ticker, df, logfile, (254, 7))
        


In [127]:
# `tickers` (all tickers) must already be loaded by the ticker-list cell above.
tickers
# Reload the tickers that could not be retrieved; column "1" holds their positions in `tickers`.
missing_values = pd.read_csv("missing_stocks.csv")
idxs = missing_values.loc[:, "1"].values

#### Validation of Eod data collection

In [110]:
# Check that every downloaded EOD file has the expected shape. The reference shape
# is the one of the Apple file, which has data on every date.
aapl = pd.read_csv(eod_path/"AAPL.csv")
print(f"The right shape for at stock is {aapl.shape}")
logval = Path.cwd()/'Stock_validation_eod.csv'

for i in tqdm.tqdm(range(len(tickers))):
    # Skip the stocks that could not be retrieved (they have no file).
    # NOTE(review): position 0 is never treated as missing, exactly as in the
    # original guard - confirm this is intended.
    if i != 0 and any(i == idx for idx in idxs):
        continue
    df = pd.read_csv(eod_path/f"{tickers[i]}.csv")
    log_val_eod(tickers[i], df, logval)


The right shape for at stock is (254, 7)


100%|██████████| 2188/2188 [00:18<00:00, 120.05it/s]


In [177]:
# Load the EOD validation log written by the validation loop
validation_eod = pd.read_csv("Stock_validation_eod.csv", sep=";")

# Tickers whose EOD data was rejected (Accept == "No") because of missing data
eod_del = validation_eod.loc[validation_eod["Accept"].eq("No"), "Ticker"].values

In [135]:
# At this point 70 companies could not be retrieved from the API and 96 were
# rejected by the EOD validation. Both groups are removed from the ticker list.

# Drop the tickers that were never retrieved
tickers_new = np.delete(tickers, idxs)

# Positions (in tickers_new) of the tickers with incomplete EOD datasets
index_eod_remove = [
    pos
    for pos in tqdm.tqdm(range(len(tickers_new)))
    for rejected in eod_del
    if tickers_new[pos] == rejected
]
tickers_final = np.delete(tickers_new, index_eod_remove)
len(tickers_final)

# Save the final ticker list as a csv file
pd.DataFrame(tickers_final, columns=["Ticker"]).to_csv("Tickers_final.csv", index=False)

100%|██████████| 2118/2118 [00:00<00:00, 53602.78it/s]


2022

In [171]:
# Remove the files of every stock that was rejected by the EOD validation.

eod_succed = []
fundamentals_succed = []
StockSingle_succed = []
check = []

# Each data folder paired with the list that records whether a file was removed (1) or not found (0)
_targets = (
    (eod_path, eod_succed),
    (fundamentals_path, fundamentals_succed),
    (SingleStock_data, StockSingle_succed),
)

#Delete all files from stocks with data less than one year
for ticker in tqdm.tqdm(tickers_new):
    for rejected in eod_del:
        if ticker != rejected:
            continue
        check.append(ticker)
        for folder, outcomes in _targets:
            csv_file = folder/f'{ticker}.csv'
            if os.path.exists(csv_file):
                os.remove(csv_file)
                outcomes.append(1)
            else:
                outcomes.append(0)

print("Eod's removed: ",sum(eod_succed))
print("Fundamentals's removed: ",sum(fundamentals_succed))
print("StockSingle's removed: ",sum(StockSingle_succed))


100%|██████████| 2118/2118 [00:00<00:00, 15272.68it/s]

Eod's removed:  96
Fundamentals's removed:  96
StockSingle's removed:  96





#### Validation of fundamental data collection

In [181]:
# NOTE(review): the reference shape printed here comes from the EOD file of Apple,
# while the loop below validates the fundamentals files - confirm this is intended.
aapl = pd.read_csv(eod_path/"AAPL.csv")
print(f"The right shape for at stock is {aapl.shape}")
logval = Path.cwd()/'Stock_validation_fundamentals.csv'

# Log shape and NaN count of the fundamentals file of every remaining ticker
for ticker in tqdm.tqdm(tickers_final):
    df = pd.read_csv(fundamentals_path/f"{ticker}.csv")
    log_val_fund(ticker, df, logval)

The right shape for at stock is (254, 7)


100%|██████████| 2022/2022 [00:05<00:00, 341.27it/s]


In [188]:
# Load the fundamentals validation log written by the validation loop
validation_fund = pd.read_csv("Stock_validation_fundamentals.csv", sep=";")

# Rows of the datasets that were not accepted (Shape != (368,8)), then how often each shape occurs
BadShape = validation_fund.loc[validation_fund["Accept"].eq("No")]
BadShape["Shape"].value_counts()

(0, 8)      285
(305, 8)    141
(123, 8)    138
(215, 8)     24
(31, 8)      15
(338, 8)      3
(64, 8)       2
(246, 8)      2
(185, 8)      1
Name: Shape, dtype: int64

In [190]:
# Show the rows of the fundamentals validation log that were not accepted
BadShape

Unnamed: 0,Timestamp,Ticker,Shape,Accept,Nr. NaN values
2,2023-08-16 15:03:26,GOOG,"(305, 8)",No,0.0
5,2023-08-16 15:03:26,NVDA,"(0, 8)",No,0.0
8,2023-08-16 15:03:26,AVGO,"(0, 8)",No,0.0
9,2023-08-16 15:03:26,PEP,"(305, 8)",No,0.0
10,2023-08-16 15:03:26,COST,"(0, 8)",No,0.0
...,...,...,...,...,...
2015,2023-08-16 15:03:31,VWE,"(123, 8)",No,123.0
2016,2023-08-16 15:03:31,RCAT,"(0, 8)",No,0.0
2018,2023-08-16 15:03:31,ESOA,"(305, 8)",No,642.0
2019,2023-08-16 15:03:31,CURI,"(305, 8)",No,0.0


### Get missing data