In [1]:
import yfinance as yf
import pandas as pd
import time

# Load the CSV file (specifying the separator)
df = pd.read_csv('ASX_200_companies.csv', sep=';')

# Extract the tickers and company names
tickers = df['Index'].tolist()  # Getting the tickers in column 'Index'
company_names = df['Company name'].tolist()  # Getting the company names

# Create a dictionary to map tickers to company names
ticker_to_name = dict(zip(tickers, company_names))

# List to store tuples of (ticker, company name) for companies with download failures
failed_downloads = []

# Loop through each ticker and download its stock data
for ticker in tickers:
    attempt = 0
    while attempt < 3:  # Limit to 3 attempts
        try:
            # Download stock data from Yahoo Finance for each ticker
            stock_data = yf.download(ticker + '.AX', start='2020-01-01', end='2025-01-01')  # Use '.AX' for ASX tickers
            
            # If the stock data is empty, skip it
            if stock_data.empty:
                print(f"No data found for {ticker} ({ticker_to_name[ticker]}), skipping.")
                failed_downloads.append((ticker, ticker_to_name[ticker]))  # Add ticker and company name to failures
                break  # Exit the retry loop
            
            # Save the data to a CSV file named after the ticker
            stock_data.to_csv(f'{ticker}_data.csv')
            
            print(f"Downloaded and saved data for {ticker} ({ticker_to_name[ticker]})")
            break  # Successful download, exit retry loop

        except Exception as e:
            attempt += 1
            print(f"Attempt {attempt} for {ticker} failed: {e}")
            if attempt < 3:
                print("Retrying...")
                time.sleep(3)  # Wait before retrying
            
            if attempt == 3:  # After 3 failed attempts
                failed_downloads.append((ticker, ticker_to_name[ticker]))  # Record failure


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CBA (Commonwealth Bank)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BHP (BHP)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CSL (CSL)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WBC (Westpac)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ANZ (Australia & New Zealand Banking Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for FMG (Fortescue)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NAB (National Australia Bank)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for MQG (Macquarie Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for GMG (Goodman Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WOW (Woolworths)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WES (Wesfarmers)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TLS (Telstra)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for RIO (Rio Tinto)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WDS (Woodside Energy)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TCL (Transurban)


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/SQ2.AX?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=SQ2.AX&crumb=fMYVHGriFW4
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SQ2.AX']: AttributeError("'NoneType' object has no attribute 'update'")


No data found for SQ2 (Block Inc. CDI), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ALL (Aristocrat Leisure)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for COL (Coles Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SCG (Scentre Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for S32 (South32)


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/NCM.AX?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=NCM.AX&crumb=fMYVHGriFW4
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['NCM.AX']: AttributeError("'NoneType' object has no attribute 'update'")


No data found for NCM (Newcrest), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SUN (Suncorp)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for QBE (QBE Insurance)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BXB (Brambles)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for FPH (Fisher & Paykel Healthcare)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for COH (Cochlear)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ASX (ASX)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for STO (Santos)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for RHC (Ramsay Health Care)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for AMC (Amcor)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ORG (Origin Energy)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IAG (Insurance Australia Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SHL (Sonic Healthcare)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for DXS (Dexus)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded and saved data for BSL (BlueScope)





Downloaded and saved data for APA (APA Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TWE (Treasury Wine Estates)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for REA (REA Group)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AIA.AX']: YFPricesMissingError('possibly delisted; no price data found  (1d 2020-01-01 -> 2025-01-01)')


No data found for AIA (Auckland Airport), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for DMP (Domino's Pizza Enterprises)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CPU (Computershare)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TAH (Tabcorp)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for VCX (Vicinity Centres)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for MGR (Mirvac)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for QAN (Qantas)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for GPT (GPT Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for EVN (Evolution Mining)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SGP (Stockland)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BLD.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for BLD (Boral), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for MPL (Medibank)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for JHX (James Hardie)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for LLC (Lendlease)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SOL (Soul Patts)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ALD (Ampol)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WOR (Worley)
Downloaded and saved data for AZJ (Aurizon)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CHC (Charter Hall)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ORI (Orica)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for HVN (Harvey Norman)
Downloaded and saved data for XRO (Xero)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SVW.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for SVW (Seven Group Holdings), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SEK (Seek)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TPG (TPG Telecom)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for MFG (Magellan Financial Group)
Downloaded and saved data for IEL (IDP Education)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for FBU (Fletcher Building)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ALQ (ALS)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ALX (Atlas Arteria)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NST (Northern Star Resources)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BEN (Bendigo & Adelaide Bank)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WTC (Wisetech Global)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BOQ (Bank of Queensland)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for RMD (ResMed)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IGO (Independence Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CWY (Cleanaway)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for AGL (AGL Energy)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ANN (Ansell)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CAR (CAR Group)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AWC.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for AWC (Alumina), skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for VEA (Viva Energy)
Downloaded and saved data for IPL (Incitec Pivot)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WHC (Whitehaven Coal)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for QUB (Qube Holdings)
Downloaded and saved data for A2M (a2 Milk Company)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for APE (Eagers Automotive)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SGR (Star Entertainment Group)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['LNK.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for LNK (Link Admin), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for DOW (Downer Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BRG (Breville Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for AMP (AMP)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ORA (Orora)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CGF (Challenger)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ALU.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for ALU (Altium), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for RWC (Reliance Worldwide Corporation)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ARB (ARB Corporation)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CIA (Champion Iron)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['OZL.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for OZL (Oz Minerals), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ILU (Iluka Resources)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for FLT (Flight Centre)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BKW (Brickworks)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BPT (Beach Energy)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for GOZ (Growthpoint Properties)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CTD (Corporate Travel Management)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CLW (Charter Hall Long WALE REIT)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['VUK.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for VUK (Virgin Money UK), skipping.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['CSR.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for CSR (CSR), skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NEC (Nine Entertainment)
Downloaded and saved data for DHG (Domain Group)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded and saved data for MIN (Mineral Resources)





Downloaded and saved data for BWP (BWP Trust)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BAP (Bapcor)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['PDL.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for PDL (Pendal Group), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for RRL (Regis Resources)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for HLS (Healius)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IFL (Insignia Financial)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PME (Pro Medicus)
Downloaded and saved data for NHF (Nib)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PMV (Premier Investments)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ABP.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for ABP (Abacus Property Group), skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for JBH (JB Hi-Fi)
Downloaded and saved data for SDF (Steadfast Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for MTS (Metcash)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CMW (Cromwell Property Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SBM (St Barbara)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TNE (TechnologyOne)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SGM (Sims Metal)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for DRR (Deterra Royalties)
Downloaded and saved data for SCP (Shopping Centres Australasia Property Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CQR (Charter Hall Retail REIT)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CNU (Chorus)
Downloaded and saved data for NXT (NextDC)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['JHG.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for JHG (Janus Henderson), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WEB (Webjet)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ABC.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for ABC (Adbri), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IRE (Iress)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CCP (Credit Corp Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NUF (Nufarm)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CIP (Centuria Industrial REIT)
Downloaded and saved data for NWL (Netwealth Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for INA (Ingenia Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PPT (Perpetual)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for HUB (HUB24)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BGA (Bega Cheese)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for WPR (Waypoint REIT[17])


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ELD (Elders)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['IVC.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for IVC (InvoCare), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for AUB (AUB Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CKF (Collins Foods)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IPH (IPH)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SUL (Super Retail Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CUV (Clinuvel Pharmaceut)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ING (Inghams)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['SLR.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for SLR (Silver Lake Resource), skipping.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['CGC.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for CGC (Costa Group), skipping.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BKL.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for BKL (Blackmores), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PLS (Pilbara Minerals)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for GOR (Gold Road Res)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NAN (Nanosonics)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for EML (EML Payments)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for GNC (GrainCorp)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NSR (National Storage)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for LYC (Lynas)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['GUD.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for GUD (GUD Holdings), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NWS (News Corp Class B)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for KLS (Kelsian Group)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AKE.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for AKE (Allkem), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for ARF (Arena REIT)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AVZ.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for AVZ (AVZ Minerals), skipping.


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for BRN (Brainchip)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CCX (City Chic Collective)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CHN (Chalice Mining)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CNI (Centuria Capital)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for CRN (Coronado Global Resources)
Downloaded and saved data for CXO (Core Lithium)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for DEG (De Grey Mining)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for EDV (Endeavour Group)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for EVT (Event Hospitality and Entertainment)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for HDN (HomeCo Daily Needs REIT)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for HMC (Home Consortium)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for IMU (Imugene)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for LIC (Lifestyle Communities)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for LKE (Lake Resources)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for LTR (Liontown Resources)
Downloaded and saved data for MP1 (Megaport)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['NHC.AX']: YFPricesMissingError('possibly delisted; no price data found  (1d 2020-01-01 -> 2025-01-01)')


No data found for NHC (New Hope), skipping.


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for NIC (Nickel Industries Limited)
Downloaded and saved data for NVX (Novonix)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PBH (Pointsbet Holdings)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PDN (Paladin Energy)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PNI (Pinnacle Investment Management)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for PRU (Perseus Mining)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded and saved data for REH (Reece Group)





Downloaded and saved data for RMS (Ramelius Resources)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for SFR (Sandfire Resources)


[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TLX (Telix Pharmaceuticals)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded and saved data for TLC (The Lottery Corporation)
Downloaded and saved data for TYR (Tyro Payments)


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UMG.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for UMG (United Malt Group), skipping.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['UWL.AX']: YFTzMissingError('possibly delisted; no timezone found')


No data found for UWL (Uniti Group), skipping.


[*********************100%***********************]  1 of 1 completed

Downloaded and saved data for ZIP (Zip)





In [2]:
# Print out the list of failed downloads with ticker names
if failed_downloads:
    print("\nCompanies with failed downloads:")
    for ticker, company in failed_downloads:
        print(f"Ticker: {ticker}, Company: {company}")
else:
    print("\nAll downloads were successful.")


Companies with failed downloads:
Ticker: SQ2, Company: Block Inc. CDI
Ticker: NCM, Company: Newcrest
Ticker: AIA, Company: Auckland Airport
Ticker: BLD, Company: Boral
Ticker: SVW, Company: Seven Group Holdings
Ticker: AWC, Company: Alumina
Ticker: LNK, Company: Link Admin
Ticker: ALU, Company: Altium
Ticker: OZL, Company: Oz Minerals
Ticker: VUK, Company: Virgin Money UK
Ticker: CSR, Company: CSR
Ticker: PDL, Company: Pendal Group
Ticker: ABP, Company: Abacus Property Group
Ticker: JHG, Company: Janus Henderson
Ticker: ABC, Company: Adbri
Ticker: IVC, Company: InvoCare
Ticker: SLR, Company: Silver Lake Resource
Ticker: CGC, Company: Costa Group
Ticker: BKL, Company: Blackmores
Ticker: GUD, Company: GUD Holdings
Ticker: AKE, Company: Allkem
Ticker: AVZ, Company: AVZ Minerals
Ticker: NHC, Company: New Hope
Ticker: UMG, Company: United Malt Group
Ticker: UWL, Company: Uniti Group


In [3]:
import pandas as pd
import os

# Load the CSV file with company tickers and names
df = pd.read_csv('ASX_200_companies.csv', sep=';')

# Extract the tickers and company names
tickers = df['Index'].tolist()  # Getting the tickers in column 'Index'
company_names = df['Company name'].tolist()  # Getting the company names

# Create a dictionary to map tickers to company names
ticker_to_name = dict(zip(tickers, company_names))

# List to store successful data
combined_data = []

# Loop through each ticker to read the downloaded data
for ticker in tickers:
    try:
        # Construct the filename pattern
        file_name = f"{ticker}_data.csv"
        
        # Check if the file exists before trying to read
        if os.path.exists(file_name):
            # Read the CSV file
            stock_data = pd.read_csv(file_name, header=[0, 1], index_col=0)
            
            # Resetting the index to turn Date into a column
            stock_data.reset_index(inplace=True)

            # Check if the DataFrame is not empty
            if not stock_data.empty:
                # Flatten the MultiIndex columns
                stock_data.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
                
                # Add a new column with the company name
                stock_data['Name'] = ticker

                # Append data to the combined_data list
                combined_data.append(stock_data)
            else:
                print(f"Data for {ticker} is empty; skipping.")
        else:
            print(f"File {file_name} not found; skipping.")
        
    except Exception as e:
        print(f"Error processing {ticker}: {e}")

# Combine all DataFrames into one
if combined_data:  # Ensure there's data to concatenate
    combined_df = pd.concat(combined_data, ignore_index=True)
    
    # Reorder columns to match the required output
    combined_df = combined_df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Name']]
    
    # Save the combined DataFrame to a CSV file
    combined_df.to_csv('combined_stock_data.csv', index=False)
    print("Combined data has been saved to 'combined_stock_data.csv'.")
else:
    print("No data to combine; please check the downloaded files.")

File SQ2_data.csv not found; skipping.
File NCM_data.csv not found; skipping.
File BLD_data.csv not found; skipping.
File SVW_data.csv not found; skipping.
File AWC_data.csv not found; skipping.
File LNK_data.csv not found; skipping.
File ALU_data.csv not found; skipping.
File OZL_data.csv not found; skipping.
File VUK_data.csv not found; skipping.
File CSR_data.csv not found; skipping.
File PDL_data.csv not found; skipping.
File ABP_data.csv not found; skipping.
File JHG_data.csv not found; skipping.
File ABC_data.csv not found; skipping.
File IVC_data.csv not found; skipping.
File SLR_data.csv not found; skipping.
File CGC_data.csv not found; skipping.
File BKL_data.csv not found; skipping.
File GUD_data.csv not found; skipping.
File AKE_data.csv not found; skipping.
File AVZ_data.csv not found; skipping.
File UMG_data.csv not found; skipping.
File UWL_data.csv not found; skipping.
Combined data has been saved to 'combined_stock_data.csv'.


In [4]:
import pandas as pd

# Load ASX200 dataset
df = pd.read_csv('combined_stock_data.csv')  # Replace with the actual file name

In [5]:
print(df.head())

         Date       Open       High        Low      Close   Volume Name
0  2020-01-02  64.860162  65.209704  64.535007  64.933319  1416232  CBA
1  2020-01-03  65.819375  65.998211  65.234096  65.282867  1622784  CBA
2  2020-01-06  64.811385  64.933313  64.404942  64.843895  2129260  CBA
3  2020-01-07  65.697438  66.006332  65.177192  66.006332  2417468  CBA
4  2020-01-08  66.022597  66.046983  65.055261  65.762474  1719114  CBA


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220923 entries, 0 to 220922
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Date    220923 non-null  object 
 1   Open    220923 non-null  float64
 2   High    220923 non-null  float64
 3   Low     220923 non-null  float64
 4   Close   220923 non-null  float64
 5   Volume  220923 non-null  int64  
 6   Name    220923 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 11.8+ MB
None


In [7]:
print(df.describe())

                Open           High            Low          Close  \
count  220923.000000  220923.000000  220923.000000  220923.000000   
mean       17.709865      17.909480      17.503726      17.707719   
std        35.224421      35.555256      34.874241      35.223498   
min         0.014000       0.015000       0.013000       0.014000   
25%         2.996701       3.035078       2.953293       2.994609   
50%         6.657022       6.740781       6.565000       6.651150   
75%        17.679931      17.887562      17.450001      17.657126   
max       344.693708     344.999009     339.346039     342.769043   

             Volume  
count  2.209230e+05  
mean   3.825871e+06  
std    7.303259e+06  
min    0.000000e+00  
25%    6.496065e+05  
50%    1.671070e+06  
75%    4.244162e+06  
max    3.677158e+08  


In [8]:
# Print rows with any missing values
rows_with_missing_values = df[df.isnull().any(axis=1)]

print("Rows with missing values:\n", rows_with_missing_values)

Rows with missing values:
 Empty DataFrame
Columns: [Date, Open, High, Low, Close, Volume, Name]
Index: []


In [9]:
# Create an empty list to store company data
summary_data = []

# Iterate through each unique company name
unique_companies = df['Name'].unique()

for company in unique_companies:
    # Filter dataframe for the specific company
    company_data = df[df['Name'] == company]

    # Find relevant minimum, maximum, opening, and closing values along with their dates
    lowest_price = company_data.loc[company_data['Close'].idxmin()]
    highest_price = company_data.loc[company_data['Close'].idxmax()]

    # Append data to the summary list
    summary_data.append({
        'Company Name': company,
        'Lowest Closing Price': lowest_price['Close'],
        'Date of Lowest Price': lowest_price['Date'],
        'Opening Price on Lowest Date': lowest_price['Open'],
        'Highest Closing Price': highest_price['Close'],
        'Date of Highest Price': highest_price['Date'],
        'Opening Price on Highest Date': highest_price['Open']
    })

# Create a DataFrame from the summary list
summary_df = pd.DataFrame(summary_data)

# Save the summary DataFrame to a CSV file
summary_df.to_csv('company_stock_summary.csv', index=False)
print("Summary table saved as 'company_stock_summary.csv'.")

Summary table saved as 'company_stock_summary.csv'.


In [10]:
pd.read_csv("company_stock_summary.csv")

Unnamed: 0,Company Name,Lowest Closing Price,Date of Lowest Price,Opening Price on Lowest Date,Highest Closing Price,Date of Highest Price,Opening Price on Highest Date
0,CBA,45.106667,2020-03-23,47.974673,158.900345,2024-12-17,156.415212
1,BHP,15.549701,2020-03-16,15.870567,47.116230,2023-12-28,46.688911
2,CSL,225.740524,2023-10-30,225.163086,319.165344,2020-02-19,313.437220
3,WBC,11.383221,2020-03-23,11.690002,33.830002,2024-11-22,33.700001
4,ANZ,10.721333,2020-03-23,11.367654,32.459999,2024-11-18,32.259998
...,...,...,...,...,...,...,...
172,SFR,2.370420,2020-03-23,2.463544,11.140000,2024-10-14,10.900000
173,TLX,0.800000,2020-03-23,0.810000,25.740000,2024-12-18,25.129999
174,TLC,3.724958,2022-10-12,3.791475,5.166000,2024-11-28,5.156160
175,TYR,0.600000,2022-06-30,0.630000,4.490000,2020-02-11,4.290000


In [11]:
df['Date'] = pd.to_datetime(df['Date'])
df['Ticker'] = df['Name'].astype('category')

In [12]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA
...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP


In [13]:
# Extract components of the date for potential insights
df['Day'] = df['Date'].dt.day               # Day of the month
df['Month'] = df['Date'].dt.month           # Month of the year
df['Year'] = df['Date'].dt.year             # Year extracted from date
df['DayOfWeek'] = df['Date'].dt.dayofweek   # Day of the week (0=Monday, 6=Sunday)
df['Quarter'] = df['Date'].dt.quarter       # Financial quarter (1 to 4)
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker,Day,Month,Year,DayOfWeek,Quarter
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA,2,1,2020,3,1
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA,3,1,2020,4,1
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA,6,1,2020,0,1
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA,7,1,2020,1,1
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA,8,1,2020,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP,23,12,2024,0,4
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP,24,12,2024,1,4
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP,27,12,2024,4,4
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP,30,12,2024,0,4


In [14]:
# Create lag features to capture previous days' prices
df['Prev_Close'] = df['Close'].shift(1)     # Previous day's closing price
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker,Day,Month,Year,DayOfWeek,Quarter,Prev_Close
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA,2,1,2020,3,1,
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA,3,1,2020,4,1,64.933319
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA,6,1,2020,0,1,65.282867
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA,7,1,2020,1,1,64.843895
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA,8,1,2020,2,1,66.006332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP,23,12,2024,0,4,2.830000
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP,24,12,2024,1,4,2.960000
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP,27,12,2024,4,4,2.910000
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP,30,12,2024,0,4,3.060000


In [15]:
# Create moving averages to smooth out price trends
df['SMA_5'] = df['Close'].rolling(window=5).mean()       # 5-day Simple Moving Average
df['SMA_20'] = df['Close'].rolling(window=20).mean()     # 20-day Simple Moving Average
df['EMA_20'] = df['Close'].ewm(span=20, adjust=False).mean()  # 20-day Exponential Moving Average

df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker,Day,Month,Year,DayOfWeek,Quarter,Prev_Close,SMA_5,SMA_20,EMA_20
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA,2,1,2020,3,1,,,,64.933319
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA,3,1,2020,4,1,64.933319,,,64.966609
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA,6,1,2020,0,1,65.282867,,,64.954922
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA,7,1,2020,1,1,64.843895,,,65.055057
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA,8,1,2020,2,1,66.006332,65.365778,,65.122430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP,23,12,2024,0,4,2.830000,2.932000,3.125,3.044340
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP,24,12,2024,1,4,2.960000,2.932000,3.103,3.031546
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP,27,12,2024,4,4,2.910000,2.920000,3.093,3.034256
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP,30,12,2024,0,4,3.060000,2.952000,3.074,3.030993


In [16]:
# Calculate volatility indicators
df['Volatility'] = df['Close'].rolling(window=20).std()  # 20-day rolling standard deviation

# Calculate daily returns as a percentage change from the previous day
df['Daily_Return'] = df['Close'].pct_change()            # Daily return calculation

# One-hot encode the 'DayOfWeek' categorical feature to use in models
df = pd.get_dummies(df, columns=['DayOfWeek'], drop_first=True)  # Drop first to avoid dummy variable trap

# Create target variable for future prediction (next day's closing price)
df['Future_Close'] = df['Close'].shift(-1)               # Shift closing prices to create a target variable

# Create additional features by combining existing ones
df['Open_Close_Ratio'] = df['Open'] / df['Close']       # Ratio of opening to closing price

df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker,Day,Month,...,SMA_20,EMA_20,Volatility,Daily_Return,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,Future_Close,Open_Close_Ratio
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA,2,1,...,,64.933319,,,False,False,True,False,65.282867,0.998873
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA,3,1,...,,64.966609,,0.005383,False,False,False,True,64.843895,1.008218
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA,6,1,...,,64.954922,,-0.006724,False,False,False,False,66.006332,0.999499
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA,7,1,...,,65.055057,,0.017927,True,False,False,False,65.762474,0.995320
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA,8,1,...,,65.122430,,-0.003694,False,True,False,False,66.242065,1.003955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP,23,12,...,3.125,3.044340,0.237010,0.045936,False,False,False,False,2.910000,0.976351
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP,24,12,...,3.103,3.031546,0.235441,-0.016892,True,False,False,False,3.060000,1.013746
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP,27,12,...,3.093,3.034256,0.232653,0.051546,False,False,False,True,3.000000,0.964052
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP,30,12,...,3.074,3.030993,0.223310,-0.019608,False,False,False,False,2.960000,1.000000


In [17]:
print(df.dtypes)  # Check the data types of each column

Date                datetime64[ns]
Open                       float64
High                       float64
Low                        float64
Close                      float64
Volume                       int64
Name                        object
Ticker                    category
Day                          int32
Month                        int32
Year                         int32
Quarter                      int32
Prev_Close                 float64
SMA_5                      float64
SMA_20                     float64
EMA_20                     float64
Volatility                 float64
Daily_Return               float64
DayOfWeek_1                   bool
DayOfWeek_2                   bool
DayOfWeek_3                   bool
DayOfWeek_4                   bool
Future_Close               float64
Open_Close_Ratio           float64
dtype: object


In [18]:
# Select only the numeric columns to ensure correlation works
numeric_df = df.select_dtypes(include=['number'])  # Keeps only numeric columns
numeric_df

Unnamed: 0,Open,High,Low,Close,Volume,Day,Month,Year,Quarter,Prev_Close,SMA_5,SMA_20,EMA_20,Volatility,Daily_Return,Future_Close,Open_Close_Ratio
0,64.860162,65.209704,64.535007,64.933319,1416232,2,1,2020,1,,,,64.933319,,,65.282867,0.998873
1,65.819375,65.998211,65.234096,65.282867,1622784,3,1,2020,1,64.933319,,,64.966609,,0.005383,64.843895,1.008218
2,64.811385,64.933313,64.404942,64.843895,2129260,6,1,2020,1,65.282867,,,64.954922,,-0.006724,66.006332,0.999499
3,65.697438,66.006332,65.177192,66.006332,2417468,7,1,2020,1,64.843895,,,65.055057,,0.017927,65.762474,0.995320
4,66.022597,66.046983,65.055261,65.762474,1719114,8,1,2020,1,66.006332,65.365778,,65.122430,,-0.003694,66.242065,1.003955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2.890000,3.050000,2.860000,2.960000,8861035,23,12,2024,4,2.830000,2.932000,3.125,3.044340,0.237010,0.045936,2.910000,0.976351
220919,2.950000,2.960000,2.850000,2.910000,5828918,24,12,2024,4,2.960000,2.932000,3.103,3.031546,0.235441,-0.016892,3.060000,1.013746
220920,2.950000,3.060000,2.940000,3.060000,7311340,27,12,2024,4,2.910000,2.920000,3.093,3.034256,0.232653,0.051546,3.000000,0.964052
220921,3.000000,3.100000,2.950000,3.000000,7740629,30,12,2024,4,3.060000,2.952000,3.074,3.030993,0.223310,-0.019608,2.960000,1.000000


In [19]:
# Calculate the correlation matrix on the numeric data
correlation_matrix = numeric_df.corr()
# Display correlations with the target variable 'Future_Close'
correlation_matrix

Unnamed: 0,Open,High,Low,Close,Volume,Day,Month,Year,Quarter,Prev_Close,SMA_5,SMA_20,EMA_20,Volatility,Daily_Return,Future_Close,Open_Close_Ratio
Open,1.0,0.999931,0.999934,0.999869,-0.156917,0.000339,0.01424,0.053386,0.014153,0.999018,0.998736,0.993241,0.99509,0.337021,0.000732,0.998807,-0.010213
High,0.999931,1.0,0.999895,0.999936,-0.157,0.000337,0.013944,0.052565,0.013806,0.99897,0.998731,0.993263,0.995108,0.337515,0.000855,0.998858,-0.01327
Low,0.999934,0.999895,1.0,0.99993,-0.156883,0.000326,0.014599,0.054167,0.014578,0.998956,0.998706,0.993165,0.995025,0.336536,0.000872,0.998849,-0.013456
Close,0.999869,0.999936,0.99993,1.0,-0.15694,0.000289,0.01426,0.053496,0.014179,0.998906,0.998698,0.993167,0.995025,0.337101,0.000983,0.998906,-0.016385
Volume,-0.156917,-0.157,-0.156883,-0.15694,1.0,0.020911,-0.014265,-0.05593,-0.024462,-0.156801,-0.15679,-0.156025,-0.156246,-0.059064,-0.000906,-0.156903,0.012233
Day,0.000339,0.000337,0.000326,0.000289,0.020911,1.0,-0.007355,-0.00338,-0.00194,1.1e-05,-9.2e-05,-0.000617,-0.000375,-0.001797,-0.006605,-5.2e-05,0.015116
Month,0.01424,0.013944,0.014599,0.01426,-0.014265,-0.007355,1.0,-0.003244,0.971421,0.013808,0.013331,0.010023,0.009963,-0.082851,-0.006014,0.014149,-0.006538
Year,0.053386,0.052565,0.054167,0.053496,-0.05593,-0.00338,-0.003244,1.0,-0.000747,0.053184,0.052921,0.050597,0.050467,-0.074314,-0.006549,0.053266,-0.005199
Quarter,0.014153,0.013806,0.014578,0.014179,-0.024462,-0.00194,0.971421,-0.000747,1.0,0.013723,0.013242,0.009809,0.009889,-0.072238,-0.004632,0.014169,-0.009548
Prev_Close,0.999018,0.99897,0.998956,0.998906,-0.156801,1.1e-05,0.013808,0.053184,0.013723,1.0,0.999346,0.994177,0.995921,0.339219,-0.002628,0.99783,-0.010694


In [20]:
print(correlation_matrix['Future_Close'].sort_values(ascending=False))  # Sort and display


Future_Close        1.000000
Close               0.998906
High                0.998858
Low                 0.998849
Open                0.998807
Prev_Close          0.997830
SMA_5               0.997616
EMA_20              0.993919
SMA_20              0.992044
Volatility          0.337151
Year                0.053266
Quarter             0.014169
Month               0.014149
Daily_Return        0.000999
Day                -0.000052
Open_Close_Ratio   -0.015923
Volume             -0.156903
Name: Future_Close, dtype: float64


In [21]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name,Ticker,Day,Month,...,SMA_20,EMA_20,Volatility,Daily_Return,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,Future_Close,Open_Close_Ratio
0,2020-01-02,64.860162,65.209704,64.535007,64.933319,1416232,CBA,CBA,2,1,...,,64.933319,,,False,False,True,False,65.282867,0.998873
1,2020-01-03,65.819375,65.998211,65.234096,65.282867,1622784,CBA,CBA,3,1,...,,64.966609,,0.005383,False,False,False,True,64.843895,1.008218
2,2020-01-06,64.811385,64.933313,64.404942,64.843895,2129260,CBA,CBA,6,1,...,,64.954922,,-0.006724,False,False,False,False,66.006332,0.999499
3,2020-01-07,65.697438,66.006332,65.177192,66.006332,2417468,CBA,CBA,7,1,...,,65.055057,,0.017927,True,False,False,False,65.762474,0.995320
4,2020-01-08,66.022597,66.046983,65.055261,65.762474,1719114,CBA,CBA,8,1,...,,65.122430,,-0.003694,False,True,False,False,66.242065,1.003955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220918,2024-12-23,2.890000,3.050000,2.860000,2.960000,8861035,ZIP,ZIP,23,12,...,3.125,3.044340,0.237010,0.045936,False,False,False,False,2.910000,0.976351
220919,2024-12-24,2.950000,2.960000,2.850000,2.910000,5828918,ZIP,ZIP,24,12,...,3.103,3.031546,0.235441,-0.016892,True,False,False,False,3.060000,1.013746
220920,2024-12-27,2.950000,3.060000,2.940000,3.060000,7311340,ZIP,ZIP,27,12,...,3.093,3.034256,0.232653,0.051546,False,False,False,True,3.000000,0.964052
220921,2024-12-30,3.000000,3.100000,2.950000,3.000000,7740629,ZIP,ZIP,30,12,...,3.074,3.030993,0.223310,-0.019608,False,False,False,False,2.960000,1.000000


In [24]:
import pandas as pd

# Assuming df is your DataFrame
rows_with_missing = df[df.isnull().any(axis=1)]

print("Rows with missing values:")
print(rows_with_missing)

Rows with missing values:
             Date       Open       High        Low      Close   Volume Name  \
0      2020-01-02  64.860162  65.209704  64.535007  64.933319  1416232  CBA   
1      2020-01-03  65.819375  65.998211  65.234096  65.282867  1622784  CBA   
2      2020-01-06  64.811385  64.933313  64.404942  64.843895  2129260  CBA   
3      2020-01-07  65.697438  66.006332  65.177192  66.006332  2417468  CBA   
4      2020-01-08  66.022597  66.046983  65.055261  65.762474  1719114  CBA   
5      2020-01-09  66.250196  66.526574  65.981942  66.242065  3014295  CBA   
6      2020-01-10  66.355882  67.063095  66.335565  67.063095  2875353  CBA   
7      2020-01-13  66.737913  67.128101  66.437150  67.046814  1434635  CBA   
8      2020-01-14  67.437007  67.892221  67.282556  67.607712  2703855  CBA   
9      2020-01-15  67.550815  68.095447  67.550815  68.006035  2039328  CBA   
10     2020-01-16  68.282407  68.656334  68.184859  68.656334  3058484  CBA   
11     2020-01-17  68.8839

In [25]:
import pandas as pd

# Assuming df is your DataFrame
df_cleaned = df.dropna()

# Optionally, you can reset the index of the cleaned DataFrame
df_cleaned.reset_index(drop=True, inplace=True)

print("DataFrame after dropping rows with missing values:")
print(df_cleaned)

DataFrame after dropping rows with missing values:
             Date       Open       High        Low      Close    Volume Name  \
0      2020-01-30  68.965228  69.452965  68.835170  69.452965   3176161  CBA   
1      2020-01-31  69.558656  69.883812  69.014024  69.306664   6713764  CBA   
2      2020-02-03  68.404344  69.054649  68.331181  68.729500   2139581  CBA   
3      2020-02-04  68.607572  68.843309  68.253962  68.534409   2564013  CBA   
4      2020-02-05  69.095293  69.095293  67.697128  67.697128   3166202  CBA   
...           ...        ...        ...        ...        ...       ...  ...   
220898 2024-12-20   2.860000   2.920000   2.800000   2.830000  16979823  ZIP   
220899 2024-12-23   2.890000   3.050000   2.860000   2.960000   8861035  ZIP   
220900 2024-12-24   2.950000   2.960000   2.850000   2.910000   5828918  ZIP   
220901 2024-12-27   2.950000   3.060000   2.940000   3.060000   7311340  ZIP   
220902 2024-12-30   3.000000   3.100000   2.950000   3.000000   77406

In [28]:
import pandas as pd

# Assuming df is your DataFrame
missing_cols = df_cleaned.columns[df_cleaned.isnull().any()].tolist()

print("Columns with missing values:", missing_cols)

Columns with missing values: []


In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Assuming df is your original DataFrame
# Step 1: Create a copy of df
dff = df.copy()

# Step 2: Standardize selected features
scaler = StandardScaler()
features_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'Volatility']
scaled_features = scaler.fit_transform(dff[features_to_scale])

# Create a new DataFrame for scaled features
scaled_df = pd.DataFrame(scaled_features, columns=features_to_scale)

# Step 3: Create lagged features for 'Close'
# Adding lagged features to the copied DataFrame (dff)
dff['Close_Lag1'] = dff['Close'].shift(1)  # Lag-1
dff['Close_Lag2'] = dff['Close'].shift(2)  # Lag-2

# Save to a new variable
lagged_df = dff.copy()

# Step 4: Create polynomial features for 'Volatility' (2nd degree polynomial)
poly = PolynomialFeatures(degree=2, include_bias=False)
volatility_poly = poly.fit_transform(dff[['Volatility']])
volatility_poly_df = pd.DataFrame(volatility_poly, columns=['Volatility', 'Volatility^2'])

# Combine the polynomial features with the new DataFrame
polynomial_df = dff.join(volatility_poly_df)

# Step 5: Apply log transformation to 'Volume'
dff['Volume_Log'] = np.log1p(dff['Volume'])  # log1p handles zero values safely

# Save the transformed DataFrame after log transformation
log_transformed_df = dff.copy()

# Print transformed DataFrames for verification
print("Scaled Features DataFrame (scaled_df):")
print(scaled_df)

print("\nLagged Features DataFrame (lagged_df):")
print(lagged_df)

print("\nPolynomial Features DataFrame (polynomial_df):")
print(polynomial_df)

print("\nLog Transformed DataFrame (log_transformed_df):")
print(log_transformed_df)

ValueError: Input X contains NaN.
PolynomialFeatures does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values