### Deep Analysis of the Processed Dataset

In [5]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "C:/Users/huzey/Desktop/stock_prediction/data/processed/AAPL_model_ready_final.csv"
df = pd.read_csv(file_path)

# Print basic information about the dataset
print("=" * 50)
print("1. Dataset Overview")
print("=" * 50)
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 5 rows:\n{df.head()}")
print(f"\nLast 5 rows:\n{df.tail()}")

# Check for missing values
print("\n" + "=" * 50)
print("2. Missing Values Analysis")
print("=" * 50)
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0] if missing_values.any() else "No missing values found.")

# Analyze the target variable
if 'target_direction' in df.columns:
    print("\n" + "=" * 50)
    print("3. Target Variable Analysis")
    print("=" * 50)
    print(f"Target variable distribution:\n{df['target_direction'].value_counts()}")
    print(f"Percentage of 'up' days: {df['target_direction'].mean() * 100:.2f}%")

# Analyze feature distributions
print("\n" + "=" * 50)
print("4. Feature Distributions")
print("=" * 50)
print("Descriptive statistics for numerical features:")
print(df.describe())

# Analyze time series properties
if 'date' in df.columns:
    print("\n" + "=" * 50)
    print("5. Time Series Properties")
    print("=" * 50)
    df['date'] = pd.to_datetime(df['date'])
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"Total number of days: {(df['date'].max() - df['date'].min()).days}")
    print(f"Number of unique days: {df['date'].nunique()}")
    print(f"Missing dates: {pd.date_range(start=df['date'].min(), end=df['date'].max()).difference(df['date'])}")

# Analyze correlation between features and target
if 'target_direction' in df.columns:
    print("\n" + "=" * 50)
    print("6. Correlation Analysis")
    print("=" * 50)
    correlation_with_target = df.corr()['target_direction'].sort_values(ascending=False)
    print("Correlation of features with target variable:")
    print(correlation_with_target)

# Analyze stationarity (ADF test)
from statsmodels.tsa.stattools import adfuller

if 'close' in df.columns:
    print("\n" + "=" * 50)
    print("7. Stationarity Analysis (ADF Test)")
    print("=" * 50)
    adf_result = adfuller(df['close'].dropna())
    print(f"ADF Statistic: {adf_result[0]}")
    print(f"p-value: {adf_result[1]}")
    print(f"Critical Values: {adf_result[4]}")
    if adf_result[1] <= 0.05:
        print("The time series is stationary (reject the null hypothesis).")
    else:
        print("The time series is non-stationary (fail to reject the null hypothesis).")

# Analyze autocorrelation
if 'close' in df.columns:
    print("\n" + "=" * 50)
    print("8. Autocorrelation Analysis")
    print("=" * 50)
    autocorrelation = df['close'].autocorr()
    print(f"Autocorrelation of 'close' prices (lag=1): {autocorrelation:.4f}")

# Analyze feature importance (using correlation as a proxy)
if 'target_direction' in df.columns:
    print("\n" + "=" * 50)
    print("9. Feature Importance (Correlation with Target)")
    print("=" * 50)
    correlation_matrix = df.corr()
    target_correlation = correlation_matrix['target_direction'].sort_values(ascending=False)
    print("Features most correlated with target variable:")
    print(target_correlation)

# Analyze multicollinearity
print("\n" + "=" * 50)
print("10. Multicollinearity Analysis")
print("=" * 50)
correlation_matrix = df.corr()
high_correlation_pairs = correlation_matrix.abs().unstack().sort_values(ascending=False)
high_correlation_pairs = high_correlation_pairs[high_correlation_pairs < 1].drop_duplicates()
print("Top feature pairs with high correlation:")
print(high_correlation_pairs.head(10))

# Analyze outliers
print("\n" + "=" * 50)
print("11. Outlier Analysis")
print("=" * 50)
numerical_features = df.select_dtypes(include=[np.number]).columns
for feature in numerical_features:
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    print(f"Outliers in '{feature}': {len(outliers)}")

# Analyze time-based trends
if 'date' in df.columns and 'close' in df.columns:
    print("\n" + "=" * 50)
    print("12. Time-Based Trends")
    print("=" * 50)
    monthly_avg_close = df.set_index('date')['close'].resample('M').mean()
    print("Monthly average closing prices:")
    print(monthly_avg_close)

# Analyze seasonality
if 'date' in df.columns and 'close' in df.columns:
    print("\n" + "=" * 50)
    print("13. Seasonality Analysis")
    print("=" * 50)
    df['month'] = df['date'].dt.month
    monthly_avg_close = df.groupby('month')['close'].mean()
    print("Average closing prices by month:")
    print(monthly_avg_close)

# Print final summary
print("\n" + "=" * 50)
print("14. Summary of Findings")
print("=" * 50)
print("1. The dataset contains time series data with features relevant to stock price prediction.")
print("2. Missing values, if any, should be handled before modeling.")
print("3. The target variable should be analyzed for class balance and distribution.")
print("4. Feature distributions and correlations should be examined to identify important predictors.")
print("5. Time series properties (e.g., stationarity, autocorrelation) should be analyzed to ensure the data is suitable for time series models.")
print("6. Outliers and multicollinearity should be addressed to improve model performance.")

1. Dataset Overview
Number of rows: 1740
Number of columns: 16
Columns: ['date', 'open', 'close', 'volume', 'dividends', 'stock splits', 'daily_return', 'intraday_return', 'daily_range', 'gap_up', 'day_of_week', 'rsi', 'bb_width', 'macd', 'volume_ratio', 'target_return']

First 5 rows:
                  date      open     close    volume  dividends  stock splits  \
0  2018-01-31 05:00:00 -0.837704 -0.841380  0.739003        0.0           0.0   
1  2018-02-01 05:00:00 -0.837073 -0.840643  1.745696        0.0           0.0   
2  2018-02-02 05:00:00 -0.839532 -0.855969  4.431889        0.0           0.0   
3  2018-02-05 05:00:00 -0.854032 -0.864410  3.486382        0.0           0.0   
4  2018-02-06 05:00:00 -0.863005 -0.850643  3.179656        0.0           0.0   

   daily_return  intraday_return  daily_range  gap_up  day_of_week       rsi  \
0      0.066536         0.143108    -0.529443    -1.0          0.0 -0.914458   
1      0.032681         0.161250    -0.569306    -1.0          0.5

  monthly_avg_close = df.set_index('date')['close'].resample('M').mean()
