In [None]:
# %%
# filename: data_load_and_explore.ipynb

# --- 1. Imports and Setup ---
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# --- Plotting Configuration ---
# IPython line magic: render matplotlib figures inline in the notebook output.
# This line is only valid inside an IPython/Jupyter kernel, not in a plain
# Python script.
%matplotlib inline
# Apply the dark-grid style sheet to every figure created after this point.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in newer
# matplotlib releases (3.6+) — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Visible confirmation that the setup cell ran to completion.
print("Libraries imported and plotting configured.")

In [None]:
# %%
# --- 2. Configuration ---
# The input Parquet file is not downloaded by this notebook: it has to be
# copied by hand into this project's 'data/parquet' directory beforehand.

# Name of the file to analyze inside that directory.
# << COPY THE PARQUET FILE HERE AND UPDATE THE FILENAME >>
DATA_FILENAME = "EURUSD_M15_2024-09-14_to_2025-09-14.parquet"

# The project root is taken to be the parent of the current working
# directory; the data directory and the full file path are derived from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(project_root, "data", "parquet")
file_path = os.path.join(data_dir, DATA_FILENAME)

print(f"Attempting to load data from local path: {file_path}")

In [None]:
# %%
# --- 3. Data Loading & Initial Inspection ---
# Read the Parquet file at `file_path` into `df`, then print a quick overview.
#
# Only the read itself is wrapped in try/except. On failure a hint is printed
# and the exception is re-raised, so the notebook stops at this cell instead
# of carrying on with `df` undefined (or left over from an earlier run).
try:
    df = pd.read_parquet(file_path)
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'.")
    print("Please ensure the 'py-mt5-trader' project has been run and the filename is correct.")
    raise
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    raise

print("Parquet file loaded successfully!")

print("\n--- DataFrame Info ---")
# .info() prints the columns, their dtypes and the non-null counts
df.info()

print(f"\nShape of the data: {df.shape[0]} rows, {df.shape[1]} columns")

print("\n--- First 5 Rows (Head) ---")
print(df.head())

In [None]:
# %%
# --- 4. Data Preprocessing ---
# For time series analysis, the 'Time' column should be the index.
#
# This cell is safe to re-run: the re-indexing is skipped when 'Time' is
# already the index, and `df` is rebound rather than mutated in place.

print("Preprocessing data...")
if 'Time' in df.columns:
    # Set the 'Time' column as the DataFrame index
    df = df.set_index('Time')
elif df.index.name != 'Time':
    raise KeyError("Expected a 'Time' column (or an index already named 'Time') in the loaded data.")

# Verify that the index is a DatetimeIndex
print(f"Index type: {type(df.index)}")
if not isinstance(df.index, pd.DatetimeIndex):
    print("Warning: the index is not a DatetimeIndex. Consider converting it with pd.to_datetime.")

# Only datetime-like indexes expose `.tz`; fall back to None for anything else
# so this report never fails with an AttributeError.
index_tz = getattr(df.index, 'tz', None)
print(f"Is the index timezone-aware? {'Yes' if index_tz is not None else 'No'}")

# Check for duplicate timestamps in the index
duplicate_count = df.index.duplicated().sum()
if duplicate_count > 0:
    print(f"\nWarning: Found {duplicate_count} duplicate timestamps in the index. Consider handling them.")
else:
    print("\nNo duplicate timestamps found in the index. Good.")

print("\nPreprocessing complete. DataFrame is ready for analysis.")

In [None]:
# %%
# --- 5. Exploratory Data Analysis (EDA) ---
# Plots the 'Close' series, then the distribution of its bar-to-bar
# percentage changes, and prints the mean and standard deviation of those
# changes.

# 1. Plot the closing price to get a feel for the overall trend
print("Plotting closing price...")
df['Close'].plot(figsize=(15, 7), title=f'{DATA_FILENAME} - Closing Price')
plt.ylabel('Price')
plt.show()

# 2. Calculate and plot the distribution of returns
print("\nAnalyzing price returns...")
# Percentage change between consecutive bars.
# fill_method=None disables the deprecated implicit forward-fill: a missing
# close would otherwise be padded with the previous value and show up as an
# artificial 0% return. Returns touching a missing close become NaN instead
# and are removed by dropna().
returns = df['Close'].pct_change(fill_method=None).dropna()

# Plot a histogram of the returns with a kernel density estimate overlaid
plt.figure(figsize=(12, 6))
sns.histplot(returns, bins=100, kde=True)
plt.title('Distribution of Price Returns')
plt.xlabel('Return')
plt.ylabel('Frequency')
plt.show()

print(f"Average Return: {returns.mean():.6f}")
print(f"Standard Deviation (Volatility): {returns.std():.6f}")