#### Read the adjusted OHLCV data from the per-ticker CSV files in the YLoader directory, combine it into a single MultiIndex DataFrame, and save that DataFrame to the data directory

In [1]:
# --- SCRIPT CONFIGURATION ---
# Directory that holds the input CSV files (one file per ticker, named "<TICKER>.csv").
# This value is passed to process_csv_files() as the directory to scan.
data_dir = 'C:/Users/ping/Desktop/yloader' # <--- SET YOUR ACTUAL DATA DIRECTORY HERE
# To run against a smaller test set, point data_dir at another directory instead, e.g.:
# data_dir = 'C:/Users/ping/Desktop/test' # <--- SET YOUR TEST DATA DIRECTORY HERE

# Full path (including file name) of the Parquet file the combined DataFrame is written to.
DEST_PATH = 'C:/Users/ping/Files_win10/python/py311/stocks/data/df_OHLCV_stocks_etfs.parquet'

In [2]:
import pandas as pd
import os
# import numpy as np # Only needed if you were to use np.nan explicitly somewhere

# Canonical names for the data columns of every input CSV file.
# The loader assumes that:
# 1. CSV files do NOT have a header row.
# 2. The FIRST column in each CSV is the date (named 'Date' by the loader).
# 3. The columns FOLLOWING the date correspond, in order, to the names in this list.
# These names become the column labels of the combined DataFrame.
CANONICAL_COLUMN_NAMES = ['Adj Open', 'Adj High', 'Adj Low', 'Adj Close', 'Volume']

# --- CORE LOGIC ---
def process_csv_files(data_dir_path, canonical_data_column_names):
    """
    Combine every per-ticker CSV file in a directory into one MultiIndex DataFrame.

    Each CSV file is expected to:
    - Not have a header row.
    - Have its first column containing dates (named 'Date' internally).
    - Have subsequent columns matching `canonical_data_column_names` in order and count.

    The ticker symbol is the file name without its extension ("TICKER.csv" -> "TICKER",
    "BRK.B.csv" -> "BRK.B"). The extension is matched case-insensitively, so "TICKER.CSV"
    is accepted as well. Files are processed in sorted name order so that the log output
    is deterministic. If two files resolve to the same ticker, only the first is kept.

    Files that are empty, cannot be parsed, or whose first column is not parsed as
    dates are reported with a printed message and skipped; they do not abort the run.

    The result has a MultiIndex ('Ticker', 'Date') and is sorted by 'Ticker'
    (ascending, A-Z) and then, within each ticker, by 'Date' (newest first).

    Args:
        data_dir_path (str): The path to the directory containing the CSV files.
        canonical_data_column_names (sequence of str): Column names for the data
            fields that appear after the 'Date' column. A list or a tuple is accepted.

    Returns:
        pandas.DataFrame or None: The combined, sorted MultiIndex DataFrame, or None
        if the directory does not exist, contains no CSV files, or no file could be
        loaded successfully.
    """
    if not os.path.isdir(data_dir_path):
        print(f"Error: Directory not found: {data_dir_path}")
        return None

    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    # Compare the extension case-insensitively so "X.CSV" is not silently ignored.
    csv_files = sorted(
        f for f in os.listdir(data_dir_path)
        if os.path.splitext(f)[1].lower() == '.csv'
        and os.path.isfile(os.path.join(data_dir_path, f))
    )

    if not csv_files:
        print(f"No CSV files found in directory: {data_dir_path}")
        return None

    print(f"Found CSV files: {csv_files}")

    # Copy into a list so that a tuple argument works and comparisons below are list-vs-list.
    data_column_names = list(canonical_data_column_names)
    # The first CSV column is 'Date', followed by the canonical data columns.
    expected_csv_column_names = ['Date'] + data_column_names

    # Maps ticker -> per-ticker frame; insertion order is the processing order.
    frames_by_ticker = {}

    for filename in csv_files:
        # splitext strips only the final extension, unlike str.replace('.csv', ''),
        # which would also remove '.csv' occurring in the middle of a name.
        ticker = os.path.splitext(filename)[0]
        if ticker in frames_by_ticker:
            # Possible on case-sensitive file systems, e.g. "X.csv" next to "X.CSV".
            print(f"Warning: Ticker '{ticker}' was already loaded from another file. Skipping {filename}.")
            continue

        file_path = os.path.join(data_dir_path, filename)

        try:
            df_temp = pd.read_csv(
                file_path,
                header=None,
                names=expected_csv_column_names,
                parse_dates=['Date'],
            )
            df_temp = df_temp.set_index('Date')

            # If pandas could not parse the column as dates it leaves it as-is,
            # so the index is not a DatetimeIndex (e.g. the file has a header row).
            if not isinstance(df_temp.index, pd.DatetimeIndex):
                print(f"Warning: 'Date' column in {filename} could not be parsed as dates. Skipping {ticker}.")
                continue

            # Defensive: force the canonical column layout if it ever differs.
            if list(df_temp.columns) != data_column_names:
                print(f"Warning: Columns in {filename} ({list(df_temp.columns)}) "
                      f"do not perfectly match or align with CANONICAL_COLUMN_NAMES ({canonical_data_column_names}) "
                      f"after initial load. Reindexing to conform.")
                df_temp = df_temp.reindex(columns=data_column_names)

            frames_by_ticker[ticker] = df_temp
            print(f"Successfully processed: {filename}")

        except pd.errors.EmptyDataError:
            print(f"Warning: File {filename} is empty or contains no data. Skipping.")
        except pd.errors.ParserError as pe:
            print(f"Error parsing file {filename}: {pe}. "
                  f"Ensure it has exactly {len(expected_csv_column_names)} columns and valid data. Skipping.")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole batch.
            print(f"An unexpected error occurred while processing file {filename}: {e}. Skipping.")

    if not frames_by_ticker:
        print("No dataframes were successfully loaded or concatenated.")
        return None

    # Passing a mapping to concat uses its keys as the outer ('Ticker') index level.
    multi_index_df = pd.concat(frames_by_ticker, names=['Ticker', 'Date'])

    # Primary sort key: 'Ticker' ascending (A-Z).
    # Secondary sort key: 'Date' descending (newest dates first for each ticker).
    multi_index_df = multi_index_df.sort_index(level=['Ticker', 'Date'], ascending=[True, False])

    if multi_index_df.columns.has_duplicates:
        print("\nWarning: Duplicate columns detected in the final DataFrame. Merging them using 'max' strategy...")
        # Row-wise max over each set of same-named columns. This replaces
        # DataFrame.groupby(level=0, axis=1), whose axis=1 form is deprecated in
        # pandas 2.1+; names are sorted to keep that call's column ordering.
        multi_index_df = pd.concat(
            {
                name: multi_index_df.loc[:, multi_index_df.columns == name].max(axis=1)
                for name in sorted(multi_index_df.columns.unique())
            },
            axis=1,
        )
        print("Duplicate columns merged.")

    return multi_index_df


In [3]:
# Run the loader over the configured directory and preview the combined result.
print("Starting CSV processing...")
final_df = process_csv_files(data_dir, CANONICAL_COLUMN_NAMES)

if final_df is None:
    # The loader returns None when nothing could be loaded.
    print("CSV processing finished, but no data was loaded into the final DataFrame.")
else:
    print("\n--- MultiIndex DataFrame (First 5 Rows) ---")
    print(final_df.head())

    print("\n--- DataFrame Info ---")
    final_df.info()

    if not final_df.empty:
        available_tickers = final_df.index.get_level_values('Ticker').unique()

        # Preferred ticker to preview; fall back to the first loaded ticker if it is absent.
        example_ticker_to_show = 'NVDA'
        if example_ticker_to_show not in available_tickers:
            example_ticker_to_show = available_tickers[0] if len(available_tickers) > 0 else None

        if not example_ticker_to_show:
            print("\nNo ticker data available in the final DataFrame to display as an example.")
        else:
            print(f"\n--- First 5 rows of '{example_ticker_to_show}' data (sorted by Date descending within Ticker) ---")
            print(final_df.loc[example_ticker_to_show].head())

Starting CSV processing...
Found CSV files: ['A.csv', 'AA.csv', 'AAL.csv', 'AAON.csv', 'AAPL.csv', 'ABBV.csv', 'ABEV.csv', 'ABNB.csv', 'ABT.csv', 'ACGL.csv', 'ACI.csv', 'ACIW.csv', 'ACM.csv', 'ACN.csv', 'ACWI.csv', 'ACWV.csv', 'ACWX.csv', 'ADBE.csv', 'ADC.csv', 'ADI.csv', 'ADM.csv', 'ADMA.csv', 'ADP.csv', 'ADSK.csv', 'ADT.csv', 'AEE.csv', 'AEG.csv', 'AEM.csv', 'AEP.csv', 'AER.csv', 'AES.csv', 'AFG.csv', 'AFL.csv', 'AFRM.csv', 'AGCO.csv', 'AGG.csv', 'AGI.csv', 'AGNC.csv', 'AIG.csv', 'AIRR.csv', 'AIT.csv', 'AIZ.csv', 'AJG.csv', 'AKAM.csv', 'AL.csv', 'ALAB.csv', 'ALB.csv', 'ALC.csv', 'ALGN.csv', 'ALK.csv', 'ALL.csv', 'ALLE.csv', 'ALLY.csv', 'ALNY.csv', 'ALSN.csv', 'ALV.csv', 'AM.csv', 'AMAT.csv', 'AMCR.csv', 'AMD.csv', 'AME.csv', 'AMGN.csv', 'AMH.csv', 'AMLP.csv', 'AMP.csv', 'AMT.csv', 'AMX.csv', 'AMZN.csv', 'AN.csv', 'ANET.csv', 'ANSS.csv', 'AON.csv', 'AOS.csv', 'APA.csv', 'APD.csv', 'APG.csv', 'APH.csv', 'APO.csv', 'APP.csv', 'APPF.csv', 'APTV.csv', 'AR.csv', 'ARCC.csv', 'ARE.csv', 'ARE

In [4]:
# Show all rows for the example ticker chosen in the previous cell.
print(f'ticker to show: {example_ticker_to_show}')
# Bare trailing expression: the notebook renders the selected per-ticker frame as the cell output.
final_df.loc[example_ticker_to_show]

ticker to show: NVDA


Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-05-08,118.4800,118.6800,115.8500,117.3700,195705150
2025-05-07,113.0500,117.6800,112.2800,117.0600,207827800
2025-05-06,111.4800,114.7400,110.8200,113.5400,158525600
2025-05-05,112.9100,114.6700,112.6600,113.8200,133163200
2025-05-02,114.1800,115.4000,113.3700,114.5000,190194800
...,...,...,...,...,...
2024-01-08,49.4934,52.2554,49.4604,52.2334,642751375
2024-01-05,48.4438,49.5284,48.2879,49.0786,415194877
2024-01-04,47.7491,48.4818,47.4902,47.9800,306650106
2024-01-03,47.4672,48.1659,47.3022,47.5511,321016529


In [5]:
# Persist the combined OHLCV frame (including its ('Ticker', 'Date') index) as Parquet.
if final_df is None:
    # The loader returned None, so there is no frame to write.
    print("No OHLCV data was loaded; nothing to save.")
else:
    # The Parquet writer does not create missing parent directories.
    dest_dir = os.path.dirname(DEST_PATH)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    final_df.to_parquet(DEST_PATH, engine='pyarrow', compression='zstd', index=True)
    print(f"OHLCV data saved to: {DEST_PATH}")

OHLCV data saved to: C:/Users/ping/Files_win10/python/py311/stocks/data/df_OHLCV_stocks_etfs.parquet


In [None]:
# _df = pd.read_parquet(DEST_PATH)
# print(_df.head())