In [None]:
# The aim of this module is to extract data from Yahoo Finance and export it to pipe-delimited CSV files

# Exports are:
# R_ALL_TICKER >> list of all tickers in scope
# R_ALL_INFO >> point in time information
# R_ALL_FIN >> financial information, balance sheet, income statement and cashflow
# R_ALL_Prices >> historical prices over last x months
# R_ALL_Div >> dividend price over time


In [1]:
import pandas as pd
import yfinance as yf
import numpy as np


# File name convention
# R_ = raw extract 
# E_ = Entry files into enrichment layer
# C_ = enriched layer with calculation
# I_ = insights layer, designed for model baselines
# V_ = validation

In [2]:
#ASX Tickers
# Download the ASX listed-companies CSV (its first line is skipped via
# skiprows=1) and derive ticker symbols by appending the ".AX" suffix.
RAW_TICKER_ASX = pd.read_csv(
    'https://www.asx.com.au/asx/research/ASXListedCompanies.csv',
    skiprows=1,
)
RAW_TICKER_ASX = RAW_TICKER_ASX.assign(
    Ticker=lambda listing: listing['ASX code'] + ".AX"
)
# Rows without an ASX code produce a missing ticker and are left out of the list.
TICKER_ASX = RAW_TICKER_ASX['Ticker'].dropna().tolist()

In [None]:
# Nasdaq Tickers
#ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt
#ftp.nasdaqtrader.com/SymbolDirectory/otherlisted.txt

# Load the pipe-delimited Nasdaq symbol directory from a local copy
# (download locations are listed in the comments above).
RAW_TICKER_NDQ = pd.read_csv('/Users/joezhou/Downloads/nasdaqlisted.txt',sep="|")
RAW_TICKER_NDQ["Ticker"] = RAW_TICKER_NDQ['Symbol']

# Nasdaq symbol-directory files end with a trailer row whose first field reads
# "File Creation Time: ...". That cell is not null, so dropna() alone would let
# it through as a ticker; exclude it explicitly. The raw frame is left untouched.
_ndq_is_trailer = (
    RAW_TICKER_NDQ["Ticker"].astype(str).str.startswith("File Creation Time")
)
TICKER_NDQ = list(RAW_TICKER_NDQ.loc[~_ndq_is_trailer, "Ticker"].dropna())

In [4]:
#London Stock Exchange Tickers
# Read the "SETS" sheet of the London Stock Exchange securities workbook
# (the first three rows are skipped) and build tickers as mnemonic + ".L".
RAW_TICKER_LSE = pd.read_excel(
    r'https://docs.londonstockexchange.com/sites/default/files/documents/list_of_sets_securities_14.xls',
    sheet_name='SETS',
    skiprows=3,
)
RAW_TICKER_LSE = RAW_TICKER_LSE.assign(
    Ticker=lambda listing: listing['Mnemonic'] + ".L"
)
# Rows without a mnemonic produce a missing ticker and are left out of the list.
TICKER_LSE = RAW_TICKER_LSE['Ticker'].dropna().tolist()

In [None]:
#NYSE Tickers
# RAW_TICKER_NYSE = pd.read_excel (r'https://www.theice.com/publicdocs/data/NYSE_Equity_Index_Ticker_List.xlsx', sheet_name='SETS',skiprows=3)

#Note: Need to find a better data source

In [5]:
#Append list for data extraction layer
# sorted() returns a NEW list. The previous `TICKER_ALL = TICKER_ASX` followed by
# `.sort()` aliased the exchange list and silently reordered TICKER_ASX in place.
TICKER_ALL = sorted(TICKER_ASX)
# TICKER_ALL = (TICKER_ASX + TICKER_NDQ + TICKER_LSE)

# Split the sorted universe into four roughly equal batches so the slow
# per-ticker downloads can be run (and saved) one batch at a time.
four_split = np.array_split(TICKER_ALL, 4)
TICKER_ALL_1, TICKER_ALL_2, TICKER_ALL_3, TICKER_ALL_4 = four_split

#Remove Duplicates
# TICKER_ALL = list(dict.fromkeys(TICKER_ALL))


# Export all ticker list for referencing purposes
# NOTE(review): the column name "colummn" is misspelled; it is kept unchanged
# because previously written pickles / readers may rely on it - confirm before renaming.
df_x = pd.DataFrame(TICKER_ALL, columns=["colummn"])
df_x.to_pickle("/Users/joezhou/Downloads/R_ALL_TICKER.pkl")


In [None]:
#Sample top 5 for testing code purposes
# TICKER_ASXs = TICKER_ASX[:5]
# TICKER_NDQs = TICKER_NDQ[:5]
# TICKER_LSEs = TICKER_LSE[:5]

# TICKER_ALL = (TICKER_ASXs + TICKER_NDQs + TICKER_LSEs)

In [None]:
#Download part 1: Company Information

# Accumulator: one row per successfully downloaded ticker.
INF_DF = pd.DataFrame()

def Info_Extract(TickName):
    """Fetch the Yahoo Finance ``info`` record for one ticker and add it to ``INF_DF``.

    The mapping returned by ``yf.Ticker(TickName).info`` becomes one new row of
    the module-level ``INF_DF``; its keys become the columns.

    Parameters
    ----------
    TickName : str
        Ticker symbol handed to ``yf.Ticker``.

    Returns
    -------
    tuple
        Always the empty tuple ``()``. The result is delivered by rebinding the
        global ``INF_DF``.

    Notes
    -----
    Any failure (including an empty ``info`` payload) is reported with a printed
    message that includes the exception, and is never raised, so one bad ticker
    does not abort a download loop.
    """
    global INF_DF
    try:
        INF = yf.Ticker(TickName).info
        if not INF:
            # Nothing to store; report it instead of claiming success.
            raise ValueError("empty info payload")

        # DataFrame.append was removed in pandas 2.0, so build a one-row frame
        # and concatenate it instead.
        info_row = pd.DataFrame([INF])
        if INF_DF.empty:
            INF_DF = info_row
        else:
            INF_DF = pd.concat([INF_DF, info_row], ignore_index=True)

        print("finished Info:", TickName)

    except Exception as exc:
        # Best-effort download: keep going, but show why this ticker failed.
        print("Error with Info:", TickName, "-", repr(exc))

    return ()

# Download all
# for company in TICKER_ALL:
#    Info_Extract(company)


# download partial
# for company in TICKER_ALL_1:
#    Info_Extract(company)
# INF_DF_1 = INF_DF

# for company in TICKER_ALL_2:
#    Info_Extract(company)
# INF_DF_2 = INF_DF

# for company in TICKER_ALL_3:
#    Info_Extract(company)
# INF_DF_3 = INF_DF

# for company in TICKER_ALL_4:
#    Info_Extract(company)
# INF_DF_4 = INF_DF

# INF_DF_Master = pd.concat([INF_DF_1, INF_DF_2, INF_DF_3, INF_DF_4], ignore_index=True)

# for company in TICKER_ALL:
    # Info_Extract(company)
# INF_DF.to_pickle("/Users/joezhou/Downloads/R_ALL_INFO.pkl")

# INF_DF_1.to_pickle("/Users/joezhou/Downloads/R_ALL_INFO_1.pkl")
# INF_DF_2.to_pickle("/Users/joezhou/Downloads/R_ALL_INFO_2.pkl")
# INF_DF_3.to_pickle("/Users/joezhou/Downloads/R_ALL_INFO_3.pkl")
# INF_DF_4.to_pickle("/Users/joezhou/Downloads/R_ALL_INFO_4.pkl")


In [6]:
#Download part 2: financial data

# Accumulator: long-format rows (one per metric/date) for every downloaded ticker.
FINANCIALS_DF = pd.DataFrame()

def _statement_to_long(statement, TickName, data_type):
    """Reshape one wide statement (metrics as index, dates as columns) to long rows."""
    out_columns = ["Metric", "Date", "Value", "TickName", "Financial Data Type"]
    if statement is None or statement.empty:
        # No data published for this statement: contribute zero rows.
        return pd.DataFrame(columns=out_columns)

    long_rows = (
        statement
        # Name the index up front so the id column is "Metric" whether or not
        # the source index already carries a name (reset_index would otherwise
        # emit that name instead of "index").
        .rename_axis("Metric")
        .reset_index()
        .melt(id_vars=["Metric"], var_name="Date", value_name="Value")
    )
    long_rows['TickName'] = TickName
    long_rows['Financial Data Type'] = data_type
    return long_rows

def Financials_Extract(TickName):
    """Download the three financial statements for one ticker into ``FINANCIALS_DF``.

    Income statement, balance sheet and cashflow are read from
    ``yf.Ticker(TickName)``, melted to long format (columns ``Metric``,
    ``Date``, ``Value``, ``TickName``, ``Financial Data Type``) and appended to
    the module-level ``FINANCIALS_DF``.

    Parameters
    ----------
    TickName : str
        Ticker symbol handed to ``yf.Ticker``.

    Returns
    -------
    tuple
        Always the empty tuple ``()``. The result is delivered by rebinding the
        global ``FINANCIALS_DF``.

    Notes
    -----
    The three statements are added together in a single concatenation, so a
    ticker reported as an error contributes no rows at all. Failures are
    printed (with the exception) and never raised.
    """
    global FINANCIALS_DF
    try:
        tick = yf.Ticker(TickName)

        statements = [
            _statement_to_long(tick.financials, TickName, "Income Statement"),
            _statement_to_long(tick.balance_sheet, TickName, "Balance Sheet"),
            _statement_to_long(tick.cashflow, TickName, "Cashflow"),
        ]

        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        # Empty frames are left out so they cannot disturb dtypes of the result.
        frames = [frame for frame in [FINANCIALS_DF, *statements] if not frame.empty]
        if frames:
            FINANCIALS_DF = pd.concat(frames, ignore_index=True)

        print("finished Financial:", TickName)

    except Exception as exc:
        # Best-effort download: keep going, but show why this ticker failed.
        print("Error with Financial:", TickName, "-", repr(exc))

    return ()

# download all at once
# for company in TICKER_ALL:
#    Financials_Extract(company)


#  download partial
# for company in TICKER_ALL_1:
#    Financials_Extract(company)
# FINANCIALS_DF_1 = FINANCIALS_DF
# FINANCIALS_DF_1.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_1.pkl")

# Start the batch from an empty accumulator so the snapshot below holds only
# batch-2 tickers. Without the reset, rows from any batch run earlier in this
# cell would be included and then duplicated when the per-batch frames are
# concatenated into a master frame.
FINANCIALS_DF = pd.DataFrame()
for company in TICKER_ALL_2:
    Financials_Extract(company)
FINANCIALS_DF_2 = FINANCIALS_DF
FINANCIALS_DF_2.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_2.pkl")

# for company in TICKER_ALL_3:
#    Financials_Extract(company)
# FINANCIALS_DF_3 = FINANCIALS_DF
# FINANCIALS_DF_3.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_3.pkl")

# for company in TICKER_ALL_4:
#    Financials_Extract(company)
# FINANCIALS_DF_4 = FINANCIALS_DF
# FINANCIALS_DF_4.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_4.pkl")

# INF_DF_Master = pd.concat([FINANCIALS_DF_1, FINANCIALS_DF_2, FINANCIALS_DF_3, FINANCIALS_DF_4], ignore_index=True)

# for company in TICKER_ALL:
    # Financials_Extract(company)
# FINANCIALS_DF.to_csv('/Users/joezhou/Downloads/R_ALL_FIN.csv', sep='|', index=False)
# FINANCIALS_DF.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN.pkl")



finished Financial: 14D.AX
finished Financial: 1AD.AX
finished Financial: 1AE.AX
finished Financial: 1AG.AX
finished Financial: 1MC.AX
finished Financial: 1ST.AX
finished Financial: 1VG.AX
finished Financial: 29M.AX
finished Financial: 2BE.AX
finished Financial: 360.AX
finished Financial: 3DA.AX
finished Financial: 3DP.AX
finished Financial: 3MF.AX
finished Financial: 3PL.AX
finished Financial: 4DS.AX
finished Financial: 4DX.AX
finished Financial: 5EA.AX
finished Financial: 5GG.AX
finished Financial: 88E.AX
finished Financial: 8CO.AX
finished Financial: 8IH.AX
finished Financial: 8VI.AX
finished Financial: 92E.AX
finished Financial: 99L.AX
finished Financial: 9SP.AX
finished Financial: A1G.AX
finished Financial: A1M.AX
finished Financial: A2B.AX
finished Financial: A2M.AX
finished Financial: A3D.AX
finished Financial: A4N.AX
finished Financial: A8G.AX
finished Financial: AAC.AX
finished Financial: AAJ.AX
finished Financial: AAP.AX
finished Financial: AAR.AX
finished Financial: AAU.AX
E

In [7]:
# FINANCIALS_DF_1.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_1.pkl")
# FINANCIALS_DF_3.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_3.pkl")
# FINANCIALS_DF_4.to_pickle("/Users/joezhou/Downloads/R_ALL_FIN_4.pkl")

In [None]:
#Download part 3: Download historical share prices
# Accumulator: one row per ticker per price bar.
HIST_PRICE_DF = pd.DataFrame()

def Prices_Extract(TickName, period="6mo"):
    """Download the price history for one ticker and add it to ``HIST_PRICE_DF``.

    The frame returned by ``yf.Ticker(TickName).history(period=period)`` has its
    index moved into a column, gets a ``TickName`` column, and is appended to
    the module-level ``HIST_PRICE_DF``.

    Parameters
    ----------
    TickName : str
        Ticker symbol handed to ``yf.Ticker``.
    period : str, optional
        Look-back window forwarded to ``history``; defaults to ``"6mo"``, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        Always the empty tuple ``()``. The result is delivered by rebinding the
        global ``HIST_PRICE_DF``.

    Notes
    -----
    Failures are printed (with the exception) and never raised. A ticker with
    no price rows adds nothing.
    """
    global HIST_PRICE_DF

    try:
        history = yf.Ticker(TickName).history(period=period)

        # An unnamed index surfaces as "index" after reset_index and is then
        # relabelled "Metric"; a named index keeps its own name.
        price_rows = history.reset_index().rename(columns={'index': 'Metric'})
        price_rows['TickName'] = TickName

        # DataFrame.append was removed in pandas 2.0; concatenate instead and
        # leave empty frames out so they cannot disturb dtypes of the result.
        frames = [frame for frame in (HIST_PRICE_DF, price_rows) if not frame.empty]
        if frames:
            HIST_PRICE_DF = pd.concat(frames, ignore_index=True)

        print("finished Price:", TickName)

    except Exception as exc:
        # Best-effort download: keep going, but show why this ticker failed.
        print("Error with Price:", TickName, "-", repr(exc))

    return ()


# for company in TICKER_ALL:
    # Prices_Extract(company)

# HIST_PRICE_DF.to_csv('/Users/joezhou/Downloads/R_ALL_Prices.csv', sep='|', index=False)
# HIST_PRICE_DF.to_pickle("/Users/joezhou/Downloads/R_ALL_Prices.pkl")

# HIST_PRICE_DF_1.to_csv('/Users/joezhou/Downloads/R_ALL_Prices_1.csv', sep='|', index=False)
# HIST_PRICE_DF_2.to_csv('/Users/joezhou/Downloads/R_ALL_Prices_2.csv', sep='|', index=False)
# HIST_PRICE_DF_3.to_csv('/Users/joezhou/Downloads/R_ALL_Prices_3.csv', sep='|', index=False)
# HIST_PRICE_DF_4.to_csv('/Users/joezhou/Downloads/R_ALL_Prices_4.csv', sep='|', index=False)

In [None]:
#Part 4: historical dividend information

#DIV = tick.dividends  


# Accumulator: one row per ticker per dividend payment.
HIST_DIV_DF = pd.DataFrame()

def Dividend_Extract(TickName):
    """Download the dividend history for one ticker and add it to ``HIST_DIV_DF``.

    ``yf.Ticker(TickName).dividends`` has its index moved into a column, gets a
    ``TickName`` column, and is appended to the module-level ``HIST_DIV_DF``.

    Parameters
    ----------
    TickName : str
        Ticker symbol handed to ``yf.Ticker``.

    Returns
    -------
    tuple
        Always the empty tuple ``()``. The result is delivered by rebinding the
        global ``HIST_DIV_DF``.

    Notes
    -----
    Failures are printed (with the exception) and never raised. A ticker with
    no dividend rows adds nothing.
    """
    global HIST_DIV_DF
    try:
        # dividend history
        dividends = yf.Ticker(TickName).dividends

        # An unnamed index surfaces as "index" after reset_index and is then
        # relabelled "Metric"; a named index keeps its own name.
        dividend_rows = dividends.reset_index().rename(columns={'index': 'Metric'})
        dividend_rows['TickName'] = TickName

        # DataFrame.append was removed in pandas 2.0; concatenate instead and
        # leave empty frames out so they cannot disturb dtypes of the result.
        frames = [frame for frame in (HIST_DIV_DF, dividend_rows) if not frame.empty]
        if frames:
            HIST_DIV_DF = pd.concat(frames, ignore_index=True)

        print("finished Dividend", TickName)

    except Exception as exc:
        # Best-effort download: keep going, but show why this ticker failed.
        print("Error with Dividend", TickName, "-", repr(exc))

    return ()

# Collect dividends for every ticker in scope; each call adds its rows to
# HIST_DIV_DF and reports its own success or failure.
for company in TICKER_ALL:
    Dividend_Extract(company)

# Persist the combined result as a pipe-delimited CSV without the row index.
HIST_DIV_DF.to_csv(
    '/Users/joezhou/Downloads/R_ALL_Div.csv',
    sep='|',
    index=False,
)



In [None]:
#Inject data into a SQLite database or some kind of cloud database

In [None]:
#Insert data
#DF_SHORT.to_sql('test', db_sql, if_exists='replace')

#pd.read_sql('select * from test', db_sql)

In [None]:
# pip install yfinance==0.1.62

# ticker = yf.Ticker('AAPL')
# aapl_df = ticker.history(period="1y")
# aapl_df['Close'].plot(title="APPLE's stock price")

# aapl_df = yf.download('AAPL', 
                    #   start='2022-01-01', 
                    #   end='2022-03-16', 
                    #   progress=False,
# )
# aapl_df.head()