# OHLC Data Exploration Notebook

This notebook demonstrates how to explore market data and understand the features for OHLC price prediction.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project's source packages importable. The path is relative, so it
# is resolved against the kernel's current working directory.
# NOTE(review): presumably the notebook is launched from its own folder, one
# level below the repository root -- confirm.
sys.path.append('../src')

# Project-local modules (expected to live under ../src).
from data import DataFetcher, DataPreprocessor
from features import FeatureEngineer, TechnicalIndicators
from utils import config

# Global plotting defaults for every figure below: a matplotlib style sheet
# plus seaborn's 'husl' colour palette.
# NOTE(review): the 'seaborn-v0_8' style name looks like it needs a recent
# matplotlib release -- confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Data Fetching

Let's start by fetching some market data for analysis.

In [None]:
# Ticker to explore; later cells reuse `symbol` in plot titles.
symbol = 'AAPL'

# Columns that the analysis cells below index directly on `raw_data`.
REQUIRED_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Volume']

# Initialize data fetcher
data_fetcher = DataFetcher()

# Fetch data for the symbol.
# NOTE(review): period='2y' presumably means two years of history and
# save_to_file=True presumably persists a copy -- confirm in DataFetcher.
raw_data = data_fetcher.fetch_symbol_data(symbol, period='2y', save_to_file=True)

# Fail fast with a readable message instead of letting a failed or partial
# download surface later as an obscure KeyError inside a plotting cell.
if raw_data is None or raw_data.empty:
    raise ValueError(f"No data returned for {symbol!r}; check the symbol and the data source.")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in raw_data.columns]
if missing_columns:
    raise ValueError(f"Fetched data for {symbol!r} is missing columns: {missing_columns}")

print(f"Data shape: {raw_data.shape}")
print(f"Date range: {raw_data.index.min()} to {raw_data.index.max()}")
raw_data.head()

## 2. Basic Data Analysis

In [None]:
# Three quick health checks on the raw frame, printed one after another:
# descriptive statistics, per-column null counts, and column dtypes.
summary_sections = (
    ("Basic Statistics:", raw_data.describe()),
    ("\nMissing Values:", raw_data.isnull().sum()),
    ("\nData Types:", raw_data.dtypes),
)

for heading, table in summary_sections:
    print(heading)
    print(table)

In [None]:
# 2x2 overview of the raw data: open/close prices, high-low range,
# traded volume, and the distribution of daily returns.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(price_ax, range_ax), (volume_ax, returns_ax) = axes

dates = raw_data.index
close = raw_data['Close']

# Top-left: open and close prices over time
price_ax.plot(dates, close, label='Close', alpha=0.8)
price_ax.plot(dates, raw_data['Open'], label='Open', alpha=0.6)
price_ax.set_title(f'{symbol} - Open vs Close Prices')
price_ax.legend()

# Top-right: shaded band between Low and High with the close on top
range_ax.fill_between(dates, raw_data['Low'], raw_data['High'], alpha=0.3, label='High-Low Range')
range_ax.plot(dates, close, color='red', label='Close', alpha=0.8)
range_ax.set_title(f'{symbol} - Price Range')
range_ax.legend()

# Bottom-left: traded volume, one bar per row of the index
volume_ax.bar(dates, raw_data['Volume'], alpha=0.6, width=1)
volume_ax.set_title(f'{symbol} - Trading Volume')

# Bottom-right: histogram of close-to-close percentage changes
returns = close.pct_change().dropna()
returns_ax.hist(returns, bins=50, alpha=0.7, edgecolor='black')
returns_ax.set_title(f'{symbol} - Daily Returns Distribution')
returns_ax.set_xlabel('Daily Return')
returns_ax.set_ylabel('Frequency')

# Same faint grid on every panel
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

Let's create technical indicators and other features.

In [None]:
# Project helpers used for this stage (cleaning, feature engineering, indicators)
preprocessor = DataPreprocessor()
feature_engineer = FeatureEngineer()
technical_indicators = TechnicalIndicators()

# raw_data -> cleaned_data -> featured_data; each stage gets its own name so
# the cell can be re-run without clobbering its inputs.
cleaned_data = preprocessor.clean_data(raw_data)
featured_data = feature_engineer.engineer_features(cleaned_data)

# Column counts before and after feature engineering
n_original = raw_data.shape[1]
n_engineered = featured_data.shape[1]

print(f"Original features: {n_original}")
print(f"Engineered features: {n_engineered}")
print(f"New features created: {n_engineered - n_original}")

In [None]:
# 3x2 dashboard of the engineered indicators. Each panel is drawn from a
# short spec so the per-panel code stays uniform; legend and grid are applied
# to all panels in one pass at the end.
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
(ma_ax, rsi_ax), (macd_ax, bb_ax), (volatility_ax, volume_ax) = axes

feature_dates = featured_data.index

# Close price with its 20- and 50-period simple moving averages
for column, label, alpha in (
    ('Close', 'Close', 0.8),
    ('SMA_20', 'SMA 20', 0.7),
    ('SMA_50', 'SMA 50', 0.7),
):
    ma_ax.plot(feature_dates, featured_data[column], label=label, alpha=alpha)
ma_ax.set_title('Price with Moving Averages')

# RSI with reference lines at 70 and 30
rsi_ax.plot(feature_dates, featured_data['RSI'], color='purple')
for level, color, label in ((70, 'r', 'Overbought'), (30, 'g', 'Oversold')):
    rsi_ax.axhline(y=level, color=color, linestyle='--', alpha=0.7, label=label)
rsi_ax.set_title('RSI (Relative Strength Index)')
rsi_ax.set_ylim(0, 100)

# MACD line, signal line, and histogram bars
for column, label in (('MACD', 'MACD'), ('MACD_Signal', 'Signal')):
    macd_ax.plot(feature_dates, featured_data[column], label=label, alpha=0.8)
macd_ax.bar(feature_dates, featured_data['MACD_Histogram'], alpha=0.3, label='Histogram')
macd_ax.set_title('MACD')

# Close price inside its Bollinger band envelope
for column, label, alpha in (
    ('Close', 'Close', 0.8),
    ('BB_Upper', 'BB Upper', 0.7),
    ('BB_Lower', 'BB Lower', 0.7),
):
    bb_ax.plot(feature_dates, featured_data[column], label=label, alpha=alpha)
bb_ax.fill_between(feature_dates, featured_data['BB_Lower'], featured_data['BB_Upper'], alpha=0.1)
bb_ax.set_title('Bollinger Bands')

# 20-day volatility feature
volatility_ax.plot(feature_dates, featured_data['Volatility_20d'], label='20-day Volatility')
volatility_ax.set_title('Price Volatility')

# Raw volume against its 20-period moving average
volume_ax.plot(feature_dates, featured_data['Volume'], alpha=0.6, label='Volume')
volume_ax.plot(feature_dates, featured_data['Volume_SMA_20'], label='Volume SMA 20')
volume_ax.set_title('Volume Analysis')

# Shared cosmetics: every panel gets a legend and a faint grid
for panel in axes.flat:
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Feature Correlation Analysis

In [None]:
# Select numeric features for correlation analysis; rows with any NaN are
# dropped so every pairwise correlation is computed on the same observations.
numeric_features = featured_data.select_dtypes(include=[np.number]).columns
correlation_data = featured_data[numeric_features].dropna()

# Absolute correlation of every numeric column with the target (Close price),
# strongest first. `target_corr` deliberately still contains 'Close' itself
# (correlation 1.0) because the heatmap cell derives its column list from it.
target_corr = correlation_data.corr()['Close'].abs().sort_values(ascending=False)

# For the printed ranking, leave out the target's trivial self-correlation so
# all 20 slots show actual candidate features.
print("Top 20 features correlated with Close price:")
print(target_corr.drop('Close').head(20))

In [None]:
# Correlation heatmap restricted to the 15 entries that rank highest in
# `target_corr`.
top_features = target_corr.head(15).index
corr_matrix = correlation_data[top_features].corr()

heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'fmt': '.2f',
    'cbar_kws': {'shrink': 0.8},
}

# Explicit figure/axes pair rather than relying on pyplot's "current axes"
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix - Top Features')
plt.tight_layout()
plt.show()

## 5. Data Preparation for Modeling

In [None]:
# Prepare data for modeling: the target is the Close price, and every other
# column except the 'Symbol' label is treated as a candidate feature.
target_columns = ['Close']
feature_columns = [col for col in featured_data.columns
                   if col not in ['Symbol'] + target_columns]

# Remove rows containing any NaN value
model_data = featured_data.dropna()

print(f"Model data shape: {model_data.shape}")
print(f"Features: {len(feature_columns)}")
print(f"Targets: {len(target_columns)}")

# --- Feature distributions (sample) ---------------------------------------
# Only numeric columns can be histogrammed. The sample is seeded so a
# Restart & Run All shows the same columns, and its size is capped so the
# cell still works when fewer than SAMPLE_SIZE numeric features exist.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42

numeric_feature_columns = (
    model_data[feature_columns].select_dtypes(include=[np.number]).columns
)
sample_features = model_data[numeric_feature_columns].sample(
    n=min(SAMPLE_SIZE, len(numeric_feature_columns)),
    axis=1,
    random_state=SAMPLE_SEED,
)

# DataFrame.hist draws one subplot per column. Handing it a single existing
# Axes makes pandas clear that Axes' whole figure (with a UserWarning), which
# would wipe out the target histogram too, so the feature histograms get a
# figure of their own.
if sample_features.shape[1] > 0:
    sample_features.hist(bins=30, alpha=0.7, figsize=(12, 8))
    plt.suptitle('Sample Feature Distributions')
    plt.tight_layout()
    plt.show()

# --- Target distribution ---------------------------------------------------
fig, ax = plt.subplots(figsize=(6, 5))
ax.hist(model_data['Close'], bins=50, alpha=0.7, edgecolor='black')
ax.set_title('Target (Close Price) Distribution')
ax.set_xlabel('Close Price')
ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 6. Time Series Analysis

In [None]:
# Analyze time series properties
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Test for stationarity
def check_stationarity(timeseries, title, significance_level=0.05):
    """Run an Augmented Dickey-Fuller test on a series and print a short report.

    Missing values are dropped before the test. The report shows the ADF
    statistic, the p-value, the critical values returned by ``adfuller``, and
    a one-line verdict.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to test; NaNs are removed via ``dropna()`` first.
    title : str
        Heading printed above the report.
    significance_level : float, default 0.05
        The series is reported as stationary when the p-value is less than
        or equal to this threshold.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    result = adfuller(timeseries.dropna())
    # adfuller returns a tuple; only the statistic, the p-value and the
    # critical-value mapping (positions 0, 1 and 4) are reported here.
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print(f'\n{title}:')
    print(f'ADF Statistic: {adf_statistic:.6f}')
    print(f'p-value: {p_value:.6f}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'\t{key}: {value:.3f}')

    if p_value <= significance_level:
        print("Result: Series is stationary")
    else:
        print("Result: Series is non-stationary")

# Run the ADF report twice: once on the close price level and once on its
# period-over-period percentage change.
close_series = model_data['Close']
for series, label in (
    (close_series, 'Close Price'),
    (close_series.pct_change(), 'Price Returns'),
):
    check_stationarity(series, label)

In [None]:
# Correlograms of the return series: ACF on the left, PACF on the right,
# both out to 40 lags.
returns = model_data['Close'].pct_change().dropna()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
acf_ax, pacf_ax = axes

correlograms = (
    (plot_acf, acf_ax, 'Autocorrelation Function - Returns'),
    (plot_pacf, pacf_ax, 'Partial Autocorrelation Function - Returns'),
)
for draw, panel, panel_title in correlograms:
    draw(returns, ax=panel, lags=40)
    # Set the title after drawing so it replaces the plotter's default title
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## 7. Summary and Next Steps

This notebook has provided a comprehensive exploration of the OHLC data including:

1. **Data Quality**: Checked for missing values and inspected summary statistics and data types
2. **Feature Engineering**: Created technical indicators and custom features
3. **Correlation Analysis**: Identified features most correlated with target
4. **Time Series Properties**: Analyzed stationarity and autocorrelation

### Key Findings:
- The data shows typical financial time series characteristics
- Technical indicators provide valuable signals
- The ADF test typically finds the price series non-stationary and the returns stationary; confirm this against the Section 6 output for the data you fetched
- Strong correlations exist between price-based features

### Next Steps:
1. Use this analysis to inform model selection
2. Consider feature selection based on correlation analysis
3. Apply appropriate preprocessing for time series modeling
4. Train and evaluate different model types (LSTM, Random Forest, XGBoost)