#### Extract the "Adj Close" column from `df_OHLCV_clean_stocks_etfs.parquet` and save it to `df_adj_close.parquet`

In [1]:
import sys
from pathlib import Path
import pandas as pd
import os
from IPython.display import display, Markdown  # Assuming you use these for display


# Widen pandas' text rendering so wide frames print without column elision.
pd.options.display.max_columns = None  # no cap on the number of columns shown
# pd.options.display.max_rows = 10     # (disabled) cap printed rows at 10
pd.options.display.width = 2500        # treat the display as 2500 characters wide


# Notebook cell
%load_ext autoreload
%autoreload 2

# Get root directory (assuming notebook is in root/notebooks/)
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add src directory to Python path
sys.path.append(str(ROOT_DIR / 'src'))

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils



Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\python311.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\Lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src']


In [2]:
# Input parquet (OHLCV table) and output parquet (Adj Close table) locations,
# both under ROOT_DIR/data.
PATH_OHLCV = ROOT_DIR / 'data' / 'df_OHLCV_clean_stocks_etfs.parquet'
PATH_ADJ_CLOSE = ROOT_DIR / 'data' / 'df_adj_close.parquet'

print(f'PATH_OHLCV: {PATH_OHLCV}')
df_OHLCV = pd.read_parquet(PATH_OHLCV)
print(f'df_OHLCV:\n{df_OHLCV.head()}\n')

# DataFrame.info() writes its report straight to stdout and returns None.
# Interpolating it into an f-string therefore prints the report first and then
# "df_OHLCV.info():\nNone". Print the label, then let info() emit the report.
print('df_OHLCV.info():')
df_OHLCV.info()

PATH_OHLCV: c:\Users\ping\Files_win10\python\py311\stocks\data\df_OHLCV_clean_stocks_etfs.parquet
df_OHLCV:
                   Adj Open  Adj High  Adj Low  Adj Close   Volume
Symbol Date                                                       
A      2025-05-14    114.95    115.50   111.28     111.52  2154239
       2025-05-13    115.43    116.88   114.82     115.42  2845300
       2025-05-12    110.81    115.71   110.45     115.55  2873100
       2025-05-09    108.96    109.86   106.79     106.93  1369500
       2025-05-08    108.00    110.65   106.55     108.70  2093300

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 503286 entries, ('A', Timestamp('2025-05-14 00:00:00')) to ('ZWS', Timestamp('2024-02-01 00:00:00'))
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Adj Open   503286 non-null  float64
 1   Adj High   503286 non-null  float64
 2   Adj Low    503286 non-null  float64
 3   Adj Close  503286 non-null  f

In [3]:
import pandas as pd

# Reshape the (Symbol, Date)-indexed OHLCV table into one 'Adj Close' column
# per symbol, indexed by date, and persist it to PATH_ADJ_CLOSE.

# 1. Select the 'Adj Close' column (this returns a Series with the MultiIndex)
adj_close_series = df_OHLCV['Adj Close']

# 2. Unstack level 0 of the MultiIndex so each symbol becomes a column;
#    level 1 (Date) remains as the row index.
# 3. Sort rows (dates) ascending.
# 4. Sort columns (tickers) alphabetically.
df_adj_close = (
    adj_close_series
    .unstack(level=0)
    .sort_index()
    .sort_index(axis=1)
)

# Display the results
print("--- Resulting DataFrame for Backtesting (df_adj_close) ---")
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df_adj_close.info()
print("\n--- First 5 rows of df_adj_close: ---")
print(df_adj_close.head())
print("\n--- Last 5 rows of df_adj_close: ---")
print(df_adj_close.tail())

# index=True keeps the Date index in the parquet file.
df_adj_close.to_parquet(PATH_ADJ_CLOSE, index=True)
print(f"\nSaved df_adj_close to 'df_adj_close.parquet' in {PATH_ADJ_CLOSE}")
# To verify the round trip, reload from the same path that was written:
#   pd.read_parquet(PATH_ADJ_CLOSE).head()

--- Resulting DataFrame for Backtesting (df_adj_close) ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 322 entries, 2024-02-01 to 2025-05-14
Columns: 1563 entries, A to ZWS
dtypes: float64(1563)
memory usage: 3.8 MB
None

--- First 5 rows of df_adj_close: ---
Symbol            A       AA    AAL     AAON     AAPL     ABBV     ABEV    ABNB      ABT     ACGL  ACHR      ACI   ACIW      ACM      ACN     ACWI      ACWV     ACWX    ADBE      ADC      ADI      ADM  ADMA      ADP    ADSK      ADT      AEE      AEG      AEM      AEP      AER      AES      AFG      AFL   AFRM     AGCO      AGG      AGI     AGNC      AIG     AIRR      AIT      AIZ      AJG    AKAM       AL      ALB      ALC    ALGN    ALK      ALL     ALLE     ALLY    ALNY     ALSN      ALV       AM     AMAT     AMCR     AMD      AME     AMGN      AMH     AMLP      AMP      AMT      AMX    AMZN      AN     ANET    ANSS      AON      AOS      APA      APD    APG      APH       APO    APP    APPF   APTV     AR     ARCC     