In [1]:
import os
import time
import datetime

def get_latest_downloaded_files(directory, num_files=10):
    """
    Lists the N most recent files in a directory, sorted by modification time.

    Only regular files directly inside ``directory`` are considered;
    subdirectories are skipped and not descended into. A file that
    disappears or cannot be read while the directory is being scanned is
    skipped instead of aborting the whole listing.

    Args:
        directory (str): The path to the directory to search.
        num_files (int): The maximum number of files to list (default: 10).
            Negative values are treated as 0.

    Returns:
        list: A list of tuples, most recently modified first, where each
              tuple contains:
              (filename, file_size_bytes, last_modified_time)
              Returns an empty list if the directory doesn't exist, is empty,
              or cannot be read.
    """

    # isdir (rather than exists) also rejects a path that points at a file
    if not os.path.isdir(directory):
        print(f"Error: Directory '{directory}' not found.")
        return []

    try:
        # A negative slice bound would silently drop the *oldest* files
        # instead of limiting the result, so clamp it to zero.
        if num_files is not None and num_files < 0:
            num_files = 0

        files = []
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if not os.path.isfile(path):  # Skip sub-directories and the like
                continue
            try:
                # One stat call provides both the size and the modification time
                info = os.stat(path)
            except OSError:
                # The file was removed or became unreadable after listdir();
                # ignore it and keep the rest of the listing.
                continue
            files.append((name, info.st_size, info.st_mtime))

        # Sort files by modification time (most recent first)
        files.sort(key=lambda item: item[2], reverse=True)

        return files[:num_files]  # Return the top N files
    except OSError as e:
        print(f"Error accessing directory: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []


def _format_file_size(size):
    """Returns a byte count as a human-readable string in bytes, KB or MB."""
    size_kb = size / 1024
    size_mb = size_kb / 1024
    # ">=" so that exactly 1 KB / 1 MB is reported in the larger unit
    if size_mb >= 1:
        return f"{size_mb:.2f} MB"
    if size_kb >= 1:
        return f"{size_kb:.2f} KB"
    return f"{size} bytes"


def main():
    """Prints name, size and modification time of the newest files in ~/Downloads."""
    # Number of files to retrieve
    num_files = 5

    # Get the user's Downloads directory. Joining onto the expanded home
    # directory (instead of expanding "~\\Downloads") lets the platform pick
    # the path separator, so this is not tied to Windows.
    downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

    recent_files = get_latest_downloaded_files(downloads_dir, num_files=num_files)

    if recent_files:
        # Report how many files were actually found, which can be fewer
        # than the number requested.
        print(f"{len(recent_files)} Most Recent Files in Downloads:")
        for filename, size, last_modified_time in recent_files:
            # Format file size for readability
            file_size = _format_file_size(size)

            # Format last modified time (a timestamp) as local date and time
            formatted_time = datetime.datetime.fromtimestamp(last_modified_time).strftime('%Y-%m-%d %H:%M:%S')

            print(f"  - Name: {filename}")
            print(f"    Size: {file_size}")
            print(f"    Last Modified: {formatted_time}")
    else:
        print("No files found in the Downloads directory.")

if __name__ == "__main__":
    main()

5 Most Recent Files in Downloads:
  - Name: adj_close_prices.csv
    Size: 324.68 KB
    Last Modified: 2025-03-02 17:07:01
  - Name: CursorUserSetup-x64-0.46.8.exe
    Size: 106.36 MB
    Last Modified: 2025-03-02 08:23:57
  - Name: _df.pkl
    Size: 34.09 MB
    Last Modified: 2025-02-28 16:58:12
  - Name: OHLCV.pkl
    Size: 46.35 MB
    Last Modified: 2025-02-28 16:48:26
  - Name: download_stocks_ETFs_OHLCV_v3.ipynb
    Size: 361.86 KB
    Last Modified: 2025-02-28 16:45:16


In [2]:
# Name of the pickle file holding the DataFrame with the symbols' OHLCV data;
# its full path is resolved from this name in the cells below
filename = "OHLCV.pkl"

In [3]:
import os

def get_download_path(filename):
  """
  Builds the full path of a file inside the Windows Downloads folder.

  The user's home directory is taken from the 'USERPROFILE' environment
  variable; when that is not set, 'HOMEDRIVE' and 'HOMEPATH' are combined
  instead.

  Args:
    filename: The name of the file.

  Returns:
    The full path to the file as a string, or None when not running on
    Windows or when the required environment variables are missing.
  """
  # Guard clause: anything other than Windows is not supported
  if os.name != 'nt':
    print("This function is designed for Windows systems.")
    return None

  env = os.environ
  if 'USERPROFILE' in env:
    # Preferred source of the home directory
    home_parts = (env['USERPROFILE'],)
  elif 'HOMEDRIVE' in env and 'HOMEPATH' in env:
    # Fallback: drive and path are stored in two separate variables
    home_parts = (env['HOMEDRIVE'], env['HOMEPATH'])
  else:
    print("Error: Unable to find the Downloads directory using environment variables.")
    return None

  return os.path.join(*home_parts, 'Downloads', filename)



In [4]:
# Example usage: resolve the location of the pickle file and report the outcome
full_path = get_download_path(filename)

if not full_path:
  print(f"Could not determine the full path to '{filename}'.")
else:
  print(f"The full path to '{filename}' is: {full_path}")

The full path to 'OHLCV.pkl' is: C:\Users\ping\Downloads\OHLCV.pkl


In [5]:
import pandas as pd

# Load the DataFrame from the pickle file
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source
df = pd.read_pickle(full_path)

# Print a summary of the DataFrame (index, columns, non-null counts, dtypes)
# to verify that it loaded as expected
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 630966 entries, ('AAPL', Timestamp('2025-02-28 00:00:00')) to ('IBTE', Timestamp('2024-03-01 00:00:00'))
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Open       630966 non-null  float64
 1   High       630966 non-null  float64
 2   Low        630966 non-null  float64
 3   Close      630966 non-null  float64
 4   Adj Close  630966 non-null  float64
 5   Volume     630764 non-null  Int64  
 6   Adj Open   630966 non-null  float64
 7   Adj High   630966 non-null  float64
 8   Adj Low    630966 non-null  float64
dtypes: Int64(1), float64(8)
memory usage: 46.4+ MB


In [6]:
# Display the DataFrame; the notebook output below is a truncated preview
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Adj Open,Adj High,Adj Low
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,2025-02-28,236.95,242.09,234.51,241.84,241.84,55162416,236.950000,242.090000,234.510000
AAPL,2025-02-27,239.41,242.46,237.06,237.30,237.30,41153600,239.410000,242.460000,237.060000
AAPL,2025-02-26,244.33,244.98,239.13,240.36,240.36,44433600,244.330000,244.980000,239.130000
AAPL,2025-02-25,248.00,250.00,244.91,247.04,247.04,48013300,248.000000,250.000000,244.910000
AAPL,2025-02-24,244.93,248.86,244.42,247.10,247.10,51326400,244.930000,248.860000,244.420000
...,...,...,...,...,...,...,...,...,...,...
IBTE,2024-03-07,23.89,23.89,23.87,23.88,22.98,348642,22.989623,22.989623,22.970377
IBTE,2024-03-06,23.86,23.87,23.86,23.87,22.97,399535,22.960377,22.970000,22.960377
IBTE,2024-03-05,23.87,23.87,23.86,23.87,22.97,572369,22.970000,22.970000,22.960377
IBTE,2024-03-04,23.86,23.87,23.86,23.87,22.97,316415,22.960377,22.970000,22.960377


In [7]:
def read_tickers_from_file(filename):
  """
  Reads a list of stock tickers from a text file.

  Leading and trailing whitespace is stripped from every line, and lines
  that are blank after stripping are skipped, so the result never contains
  empty ticker strings.

  Args:
    filename (str): The path to the text file containing the tickers, one ticker per line.

  Returns:
    list: A list of strings, where each string is a ticker symbol.
        Returns an empty list if the file does not exist, cannot be read,
        or contains no tickers.
  """
  try:
    with open(filename, 'r') as f:
      stripped_lines = (line.strip() for line in f)
      # Drop blank lines (e.g. trailing empty lines at the end of the file);
      # an empty string is not a valid symbol to look up later.
      tickers = [ticker for ticker in stripped_lines if ticker]
    return tickers
  except FileNotFoundError:
    print(f"Error: File '{filename}' not found.")
    return []
  except Exception as e:
    print(f"An error occurred: {e}")
    return []

# Example usage: load the ticker symbols and show a short preview
tickers_filename = "tickers.txt"  # Replace with the actual filename if different
tickers = read_tickers_from_file(tickers_filename)

if not tickers:
  print("No tickers were read from the file.")
else:
  ticker_count = len(tickers)
  preview = tickers[:10]  # Only the first 10 tickers are printed
  print(f"Read {ticker_count} tickers from '{tickers_filename}'.")
  print(f"First 10 tickers: {preview}")

Read 200 tickers from 'tickers.txt'.
First 10 tickers: ['AAPL', 'ABBV', 'ABNB', 'ABT', 'ACN', 'ADBE', 'ADI', 'ADP', 'ADSK', 'AJG']


In [8]:
# Take the 'Adj Close' column for the selected tickers (all dates), then move
# index level 0 (the symbol) into the columns: one row per date, one column
# per symbol
df_adj_close = df.loc[(tickers, slice(None)), 'Adj Close'].unstack(level=0)

In [10]:
# Display the adjusted-close table; the notebook output below is a truncated preview
df_adj_close

Symbol,AAPL,ABBV,ABNB,ABT,ACN,ADBE,ADI,ADP,ADSK,AJG,...,USB,V,VZ,WDAY,WELL,WFC,WM,WMB,WMT,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-03-01,178.82,172.62,159.72,116.25,376.33,570.93,192.89,244.33,264.74,241.99,...,39.59,281.06,37.66,291.92,90.77,53.75,202.88,34.84,58.06,188.81
2024-03-04,174.28,170.83,158.09,117.64,379.02,567.94,192.31,240.91,260.70,240.97,...,40.81,278.45,37.76,273.02,91.43,54.49,203.88,34.81,58.60,185.88
2024-03-05,169.32,172.86,159.33,116.11,372.27,544.84,187.51,238.76,247.29,243.79,...,41.34,277.31,37.80,266.74,89.62,55.26,203.95,35.12,59.33,183.28
2024-03-06,168.33,174.70,163.87,116.96,376.17,543.09,189.39,238.52,251.10,247.03,...,41.18,278.35,37.40,266.93,90.13,55.75,204.98,34.96,59.85,181.59
2024-03-07,168.21,174.22,163.54,118.50,382.18,556.04,195.85,240.38,253.84,248.32,...,41.42,276.20,37.03,264.74,90.10,55.65,204.33,34.83,59.65,179.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-24,247.10,204.08,144.82,134.95,363.91,444.42,236.99,312.20,285.26,328.49,...,45.61,349.86,43.32,261.81,149.97,77.22,229.68,57.33,93.67,165.31
2025-02-25,247.04,204.14,141.55,135.82,362.95,443.41,235.30,313.37,283.72,331.83,...,45.91,352.09,43.71,255.22,151.74,76.00,230.75,56.18,97.69,168.53
2025-02-26,240.36,203.01,144.03,135.96,358.35,441.50,233.73,311.38,285.67,328.33,...,45.75,350.63,43.18,271.09,150.65,76.45,228.69,56.90,96.20,164.99
2025-02-27,237.30,205.02,139.45,135.87,356.87,437.19,223.85,311.30,282.35,333.17,...,46.28,355.74,43.27,260.57,151.52,76.62,229.61,56.26,96.79,164.98


In [11]:
import seaborn as sns

def _plot_correlation_heatmap(correlation_matrix):
  """Draws an annotated heatmap of a correlation matrix on a new figure and shows it."""
  import matplotlib.pyplot as plt

  # Explicit figure/axes instead of the implicit pyplot "current axes"
  fig, ax = plt.subplots(figsize=(12, 10))
  sns.heatmap(correlation_matrix,
      annot=True,  # Show correlation values
      cmap="Spectral_r",  # Color scheme
      vmin=-1, vmax=1,  # Value range
      center=0,  # Center the colormap at 0
      fmt='.2f',  # Format for correlation values
      xticklabels=True,  # Show x-axis labels
      yticklabels=True,  # Show y-axis labels
      cbar_kws={'orientation': 'horizontal'},  # Horizontal colorbar
      annot_kws={'size': 8},  # Adjust annotation text size
      ax=ax)

  # Rotate the tick labels (labelrotation is the documented keyword)
  ax.tick_params(axis='x', labelrotation=45)
  ax.tick_params(axis='y', labelrotation=0)

  # Move x-axis ticks and labels to the top
  ax.xaxis.set_ticks_position('top')
  ax.xaxis.set_label_position('top')

  ax.set_title('Stock Price Daily Returns Correlation Heatmap', pad=20)
  fig.tight_layout()
  plt.show()


def create_correlation_matrix_heatmap(df_adj_close, symbols, plot_heatmap=False):
  """
  Creates a correlation matrix of daily returns for the specified symbols
  and optionally plots it as a heatmap.

  The prices are sorted by index (oldest first), converted to
  period-over-period percentage changes, and the pairwise correlation of
  those changes is computed.

  Args:
    df_adj_close (pd.DataFrame): DataFrame containing adjusted close prices,
      one column per symbol
    symbols (list): List of symbols (column labels) to analyze
    plot_heatmap (bool): Whether to plot the heatmap (default: False)

  Returns:
    pd.DataFrame: Correlation matrix of daily returns for the specified
      symbols, or None if the matrix could not be computed (an error message
      is printed). A failure while plotting is reported but does not discard
      the matrix.
  """
  try:
    # Sort by index, older dates first, so each change is relative to the
    # previous date
    prices = df_adj_close[symbols].sort_index(ascending=True)

    # Calculate daily returns
    daily_returns = prices.pct_change()

    # Calculate correlation matrix of daily returns
    correlation_matrix = daily_returns.corr()
  except Exception as e:
    print(f"Error creating correlation matrix: {e}")
    return None

  if plot_heatmap:
    # Plotting is best-effort: a missing or misbehaving plotting library must
    # not throw away the matrix that was already computed successfully.
    try:
      _plot_correlation_heatmap(correlation_matrix)
    except Exception as e:
      print(f"Error plotting correlation heatmap: {e}")

  return correlation_matrix

Select the symbols to include in the correlation matrix

In [None]:
# Choose a subset of tickers to analyze, otherwise the heatmap will be too large
symbols = df_adj_close.columns[:10]  # Only the first 10 columns (symbols)
# symbols = df_adj_close.columns  # Alternative: analyze every symbol

In [None]:
# Compute the correlation matrix for the chosen symbols without drawing the heatmap
correlation_matrix = create_correlation_matrix_heatmap(df_adj_close, symbols, plot_heatmap=False)

# None signals that the computation failed (the function has already printed why)
if correlation_matrix is not None:
  print(
    f'correlation_matrix.shape:\n{correlation_matrix.shape}',
    "\nCorrelation Matrix:",
    correlation_matrix,
    sep="\n",
  )

correlation_matrix.shape:
(10, 10)

Correlation Matrix:
Symbol      AAPL      ABBV      ABNB       ABT       ACN      ADBE       ADI  \
Symbol                                                                         
AAPL    1.000000 -0.002380  0.246946 -0.100050  0.146706  0.186429  0.311385   
ABBV   -0.002380  1.000000 -0.066304  0.305343  0.009229  0.073083  0.022998   
ABNB    0.246946 -0.066304  1.000000 -0.081069  0.148769  0.234533  0.364558   
ABT    -0.100050  0.305343 -0.081069  1.000000  0.196655  0.124469  0.097328   
ACN     0.146706  0.009229  0.148769  0.196655  1.000000  0.293278  0.215907   
ADBE    0.186429  0.073083  0.234533  0.124469  0.293278  1.000000  0.198925   
ADI     0.311385  0.022998  0.364558  0.097328  0.215907  0.198925  1.000000   
ADP     0.065798  0.283726  0.063339  0.341858  0.312154  0.248532  0.283935   
ADSK    0.284896 -0.002762  0.277288  0.161622  0.290390  0.323703  0.374405   
AJG     0.003710  0.304281 -0.088303  0.291845  0.206658  0.0784

In [16]:
# Convert correlation matrix to list of dictionaries
correlation_list = []
for symbol1 in correlation_matrix.index:
  row_dict = {}
  for symbol2 in correlation_matrix.columns:
    row_dict[symbol2] = correlation_matrix.loc[symbol1, symbol2]
  correlation_list.append({'Symbol': symbol1, 'Correlations': row_dict})

# Format as string representation
correlation_str = str(correlation_list)
print(f'len(correlation_list): {len(correlation_list)}')

# Print the string representation of the list of dictionaries (correlation_str)
print(f'\ncorrelation_str:\n{correlation_str}')

len(correlation_list): 10

correlation_str:
[{'Symbol': 'AAPL', 'Correlations': {'AAPL': 1.0, 'ABBV': -0.002380449693042012, 'ABNB': 0.2469455806979006, 'ABT': -0.1000501832946136, 'ACN': 0.14670574465291802, 'ADBE': 0.18642902770019099, 'ADI': 0.31138484947925504, 'ADP': 0.06579807177129417, 'ADSK': 0.2848961904196655, 'AJG': 0.003709783408893}}, {'Symbol': 'ABBV', 'Correlations': {'AAPL': -0.002380449693042012, 'ABBV': 1.0, 'ABNB': -0.06630440459798524, 'ABT': 0.3053432333358742, 'ACN': 0.009229271589594438, 'ADBE': 0.07308304619892582, 'ADI': 0.02299802342657832, 'ADP': 0.28372551310009436, 'ADSK': -0.00276177340018698, 'AJG': 0.3042813346694634}}, {'Symbol': 'ABNB', 'Correlations': {'AAPL': 0.2469455806979006, 'ABBV': -0.06630440459798524, 'ABNB': 1.0, 'ABT': -0.08106859543808247, 'ACN': 0.1487690482605738, 'ADBE': 0.23453348298486199, 'ADI': 0.36455785082551145, 'ADP': 0.06333872371363446, 'ADSK': 0.2772884183051858, 'AJG': -0.0883025404424041}}, {'Symbol': 'ABT', 'Correlations': 