In [13]:
# import sys
# from pathlib import Path
# import pandas as pd
# import os
# from IPython.display import display, Markdown  # Assuming you use these for display


# # Set pandas display options to show more columns and rows
# pd.set_option('display.max_columns', None)  # Show all columns
# # pd.set_option('display.max_rows', 10)       # Limit to 10 rows for readability
# pd.set_option('display.width', 2000) 


# # Notebook cell
# %load_ext autoreload
# %autoreload 2

# # Get root directory (assuming notebook is in root/notebooks/)
# NOTEBOOK_DIR = Path.cwd()
# ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# # Add src directory to Python path
# sys.path.append(str(ROOT_DIR / 'src'))

# # Verify path
# print(f"Python will look in these locations:\n{sys.path}")


# # --- Execute the processor ---
# import utils


# SOURCE_PATH, _, params_list = utils.main_processor(
#     # data_dir='.\\',  # search project ..\data
#     data_dir='..\data',  # search project ..\data
#     downloads_dir='',  # None searches the Downloads dir; '' skips that search
#     downloads_limit=5,  # limit the search to the first 5 files
#     clean_name_override=None,  # override filename
#     start_file_pattern='',  # filename prefix to match (e.g. 'df_'); '' presumably disables the prefix filter
#     contains_pattern='.parquet',  # search for files whose name contains '.parquet'
# )


In [None]:
# Destination parquet file that the combined OHLCV DataFrame is written to
# by the save cell further down in this notebook.
DEST_PATH = (
    "C:/Users/ping/Files_win10/python/py311/stocks/data/df_OHLCV_stocks_etfs.parquet"
)

In [15]:
import pandas as pd
import os
# import numpy as np # Only needed if you were to use np.nan explicitly somewhere

# --- SCRIPT CONFIGURATION ---
# Directory holding the full set of per-ticker CSV files.
YLOADER_DATA_DIR = 'C:/Users/ping/Desktop/yloader'
# Small directory used for quick test runs.
TEST_DATA_DIR = 'C:/Users/ping/Desktop/test'

# Explicit switch instead of a second `data_dir = ...` line that silently
# overrode the first one. True reproduces the previous effective setting.
# NOTE(review): while this is True, the save cell writes the test subset to
# DEST_PATH -- set it to False before producing the real dataset.
USE_TEST_DATA = True

# Directory actually scanned for CSV files by process_csv_files().
data_dir = TEST_DATA_DIR if USE_TEST_DATA else YLOADER_DATA_DIR

# Define the canonical column names for the data columns in your CSV files.
# It is assumed that:
# 1. CSV files do NOT have a header row.
# 2. The FIRST column in each CSV is 'Date'.
# 3. The columns FOLLOWING 'Date' correspond, in order, to the names in this list.
CANONICAL_COLUMN_NAMES = ['Adj Open', 'Adj High', 'Adj Low', 'Adj Close', 'Volume']

# --- CORE LOGIC ---
def process_csv_files(data_dir_path, canonical_data_column_names):
    """
    Combine the header-less CSV files of a directory into one MultiIndex DataFrame.

    Each CSV file is expected to:
    - Not have a header row.
    - Have its first column containing dates (named 'Date' internally).
    - Have subsequent columns matching `canonical_data_column_names` in order and count.

    The ticker symbol is the filename without its extension ("TICKER.csv" -> "TICKER",
    "BRK.B.csv" -> "BRK.B"); the extension is matched case-insensitively.
    Files are read in sorted filename order. A file is skipped (with a printed
    message) when it is empty, cannot be parsed, has a different number of
    columns than expected, or has a first column that is not parsed as dates.

    The result has a MultiIndex ('Ticker', 'Date') and is sorted by 'Ticker'
    ascending (A-Z) and, within each ticker, by 'Date' descending (newest first).

    Args:
        data_dir_path (str): The path to the directory containing the CSV files.
        canonical_data_column_names (list of str): Column names for the data
            fields that follow the 'Date' column.

    Returns:
        pandas.DataFrame or None: The combined, sorted DataFrame. None if the
        directory does not exist, contains no CSV files, or no file could be
        loaded.
    """
    # Check if the directory exists
    if not os.path.isdir(data_dir_path):
        print(f"Error: Directory not found: {data_dir_path}")
        return None

    # os.listdir() returns entries in arbitrary order; sort for a reproducible
    # processing order and log output. Match the extension case-insensitively
    # so 'X.CSV' is picked up as well.
    csv_files = sorted(
        f for f in os.listdir(data_dir_path)
        if f.lower().endswith('.csv') and os.path.isfile(os.path.join(data_dir_path, f))
    )

    if not csv_files:
        print(f"No CSV files found in directory: {data_dir_path}")
        return None

    print(f"Found CSV files: {csv_files}")

    # Complete list of column names expected in every CSV file:
    # 'Date' first, followed by the canonical data columns.
    canonical_data_column_names = list(canonical_data_column_names)
    expected_csv_column_names = ['Date'] + canonical_data_column_names
    expected_column_count = len(expected_csv_column_names)

    all_dataframes = []
    tickers_list = []

    for filename in csv_files:
        # splitext() strips only the final extension; str.replace('.csv', '')
        # would also remove '.csv' from the middle of a name.
        ticker = os.path.splitext(filename)[0]
        file_path = os.path.join(data_dir_path, filename)

        try:
            # Read without `names=` so the real column count is visible.
            # Passing names to read_csv would silently accept files with too
            # few columns (NaN padding) or too many (implicit index).
            df_temp = pd.read_csv(file_path, header=None, parse_dates=[0])

            if df_temp.shape[1] != expected_column_count:
                print(f"Warning: {filename} has {df_temp.shape[1]} columns but "
                      f"{expected_column_count} were expected ({expected_csv_column_names}). "
                      f"Skipping {ticker}.")
                continue

            df_temp.columns = expected_csv_column_names
            df_temp = df_temp.set_index('Date')

            # If read_csv could not parse the first column as dates it is left
            # as plain objects, so the index is not a DatetimeIndex.
            if not isinstance(df_temp.index, pd.DatetimeIndex):
                print(f"Warning: 'Date' column in {filename} could not be parsed as dates. Skipping {ticker}.")
                continue

            all_dataframes.append(df_temp)
            tickers_list.append(ticker)
            print(f"Successfully processed: {filename}")

        except pd.errors.EmptyDataError:
            print(f"Warning: File {filename} is empty or contains no data. Skipping.")
        except pd.errors.ParserError as pe:
            print(f"Error parsing file {filename}: {pe}. "
                  f"Ensure it has exactly {expected_column_count} columns and valid data. Skipping.")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole batch.
            print(f"An unexpected error occurred while processing file {filename}: {e}. Skipping.")

    if not all_dataframes:
        print("No dataframes were successfully loaded or concatenated.")
        return None

    multi_index_df = pd.concat(all_dataframes, keys=tickers_list, names=['Ticker', 'Date'])

    # Primary sort key: 'Ticker' (A-Z). Secondary: 'Date' (newest first per ticker).
    multi_index_df = multi_index_df.sort_index(
        level=['Ticker', 'Date'], ascending=[True, False]
    )

    if multi_index_df.columns.has_duplicates:
        print("\nWarning: Duplicate columns detected in the final DataFrame. Merging them using 'max' strategy...")
        # groupby(axis=1) is deprecated (pandas 2.1) and removed in pandas 3;
        # group the transposed frame by column name instead.
        multi_index_df = multi_index_df.T.groupby(level=0).max().T
        print("Duplicate columns merged.")

    return multi_index_df


In [16]:
print("Starting CSV processing...")
final_df = process_csv_files(data_dir, CANONICAL_COLUMN_NAMES)

# Always bind this name so later cells can test it instead of hitting a
# NameError when no data was loaded.
example_ticker_to_show = None

if final_df is None:
    print("CSV processing finished, but no data was loaded into the final DataFrame.")
else:
    print("\n--- MultiIndex DataFrame (First 5 Rows) ---")
    print(final_df.head())

    print("\n--- DataFrame Info ---")
    final_df.info()

    if not final_df.empty:
        available_tickers = final_df.index.get_level_values('Ticker').unique()
        # Prefer NVDA as the example; otherwise fall back to the first ticker.
        # A non-empty frame always has at least one ticker.
        preferred_ticker = 'NVDA'
        if preferred_ticker in available_tickers:
            example_ticker_to_show = preferred_ticker
        else:
            example_ticker_to_show = available_tickers[0]

    if example_ticker_to_show:
        print(f"\n--- First 5 rows of '{example_ticker_to_show}' data (sorted by Date descending within Ticker) ---")
        print(final_df.loc[example_ticker_to_show].head())
    else:
        print("\nNo ticker data available in the final DataFrame to display as an example.")

Starting CSV processing...
Found CSV files: ['A.csv', 'AA.csv', 'AAL.csv']
Successfully processed: A.csv
Successfully processed: AA.csv
Successfully processed: AAL.csv

--- MultiIndex DataFrame (First 5 Rows) ---
                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
A      2025-05-07    106.69    107.60   104.79     107.52  2143700
       2025-05-06    107.25    108.21   104.36     105.24  1960600
       2025-05-05    108.10    109.25   107.46     108.37  1385500
       2025-05-02    109.24    110.33   107.45     108.63  1213100
       2025-05-01    107.25    108.23   104.10     106.46  1523100

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 753 entries, ('A', Timestamp('2025-05-07 00:00:00')) to ('AAL', Timestamp('2024-05-07 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Adj Open   753 non-null    fl

In [17]:
# The processing cell above only selects an example ticker when it loaded a
# non-empty DataFrame, so guard before indexing into it.
example_ticker_frame = None
if final_df is not None and not final_df.empty:
    print(f'ticker to show: {example_ticker_to_show}')
    example_ticker_frame = final_df.loc[example_ticker_to_show]
else:
    print("No data loaded; there is no example ticker to show.")

# Last expression of the cell: rendered as a table (nothing is shown for None).
example_ticker_frame

ticker to show: A


Unnamed: 0_level_0,Adj Open,Adj High,Adj Low,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-05-07,106.690,107.600,104.790,107.520,2143700
2025-05-06,107.250,108.210,104.360,105.240,1960600
2025-05-05,108.100,109.250,107.460,108.370,1385500
2025-05-02,109.240,110.330,107.450,108.630,1213100
2025-05-01,107.250,108.230,104.100,106.460,1523100
...,...,...,...,...,...
2024-05-13,148.426,149.150,146.093,146.669,1327505
2024-05-10,144.812,151.681,144.048,148.654,2263418
2024-05-09,142.053,144.405,141.765,144.068,1263331
2024-05-08,140.078,141.944,139.006,141.745,1434092


In [19]:
# process_csv_files() returns None when nothing could be loaded; calling
# .to_parquet on it would raise an opaque AttributeError, and writing an
# empty frame would overwrite an existing dataset with nothing.
if final_df is None or final_df.empty:
    print(f"Nothing to save: no OHLCV data was loaded, so {DEST_PATH} was left untouched.")
else:
    # index=True keeps the ('Ticker', 'Date') MultiIndex in the parquet file.
    final_df.to_parquet(DEST_PATH, index=True)
    print(f"OHLCV data saved to: {DEST_PATH}")

OHLCV data saved to: C:/Users/ping/Files_win10/python/py311/stocks/data/df_OHLCV.parquet


In [None]:
# _df = pd.read_parquet(DEST_PATH)
# print(_df.head())

                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Ticker Date                                                       
A      2025-05-07    106.69    107.60   104.79     107.52  2143700
       2025-05-06    107.25    108.21   104.36     105.24  1960600
       2025-05-05    108.10    109.25   107.46     108.37  1385500
       2025-05-02    109.24    110.33   107.45     108.63  1213100
       2025-05-01    107.25    108.23   104.10     106.46  1523100
