# Exploratory Data Analysis (EDA)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import statsmodels as sm

In [None]:
# decomposition
from statsmodels.tsa.seasonal import seasonal_decompose, STL
# stationary
from statsmodels.tsa.stattools import adfuller, kpss
from scipy.stats import ks_2samp
# autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# for interactive plot
import ipywidgets as widgets
from ipywidgets import interact

In [None]:
# Display ipykernel's version_info as the cell output.
import ipykernel
ipykernel.version_info

## Data Load and Feature Computation

In [None]:
# Download MSFT price history starting 2025-01-01.
time_df = yf.download('MSFT', start="2025-01-01", auto_adjust=True)
# The download may come back with MultiIndex columns (tuples whose first
# element is the field name) or with already-flat string labels, depending on
# the yfinance version/options. Only flatten in the MultiIndex case: taking
# `col[0]` of a plain string label would keep just its first character.
if isinstance(time_df.columns, pd.MultiIndex):
    time_df.columns = list(time_df.columns.get_level_values(0))
time_df.head()

In [None]:
# Simple and log returns of the closing price (first row is NaN for both).
close_px = time_df['Close']
time_df['Return'] = close_px.pct_change()
time_df['LogReturn'] = np.log(close_px / close_px.shift(1))
time_df.describe()

## General Data Exploration

### Diagnose date gaps

In [None]:
# Spacing between consecutive index entries, tallied by frequency.
index_gaps = time_df.index.diff()
index_gaps.value_counts().head()

In [None]:
# Frame relating the calendar gap since the previous record to the absolute
# change in Close; rows with any missing value are dropped.
analysis_df = (
    time_df
    .assign(
        GapDays=time_df.index.to_series().diff().dt.days,
        AbsJumpInClose=time_df['Close'].diff().abs(),
    )
    .dropna()
)

In [None]:
# Box plot of absolute Close change, grouped by days since the last record.
gap_ax = sns.boxplot(x='GapDays', y='AbsJumpInClose', data=analysis_df)
gap_ax.set_title('Jumps vs Gaps Relation')
gap_ax.set_xlabel('Days since Last Record')
gap_ax.set_ylabel('Absolute Shift in Stock Closing Value')
gap_ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Summary statistics of the absolute Close jump for each gap length.
jumps_by_gap = analysis_df.groupby('GapDays')['AbsJumpInClose']
jumps_by_gap.describe()

* Longer gaps (more than 1 day) don't affect the size of the jumps, suggesting no abnormality in the stock data.

* Consecutive days have higher volatility and are more concerning.

### Distribution

In [None]:
# Histogram of simple returns.
return_ax = time_df["Return"].hist(bins=50)
return_ax.set_title("Return Distribution")
plt.show()

In [None]:
# Histogram of log returns.
logreturn_ax = time_df["LogReturn"].hist(bins=50)
logreturn_ax.set_title("LogReturn Distribution")
plt.show()

In [None]:
# Histogram of traded volume.
volume_ax = time_df['Volume'].hist(bins=50)
volume_ax.set_title('Volume Distribution')
plt.show()

In [None]:
# Histogram of the closing price level.
close_ax = time_df['Close'].hist(bins=50)
close_ax.set_title('Day Closing Rate Distribution')
plt.show()

In [None]:
# Histogram of the daily High price level. The title names the column that is
# actually plotted so it cannot be confused with the Close histogram.
time_df['High'].hist(bins=50)
plt.title('Day High Rate Distribution')
plt.show()

* Stock open and close prices show a multimodal distribution.
  * Tree-based models (XGBoost, Random Forest) and Hidden Markov Models might work better than ARIMA or LSTM, which work on sequential data.
  * Feature engineering may benefit sequence-based models.
* Return and log return show a normal distribution.
* Volume shows a positively skewed distribution.

## Exploration Relevant to Forecast

### Decomposition


In [None]:
# Classical additive decomposition of Close with a 21-observation period.
decomp = seasonal_decompose(time_df['Close'], model='additive', period=21)
additive_fig = decomp.plot()
additive_fig.suptitle('Seasonal Additive Decomposition')
additive_fig.tight_layout()
plt.show()

In [None]:
# Classical multiplicative decomposition of Close with a 21-observation period.
# Note: this rebinds `decomp`, replacing the additive result.
decomp = seasonal_decompose(time_df['Close'], model='multiplicative', period=21)
multiplicative_fig = decomp.plot()
multiplicative_fig.suptitle('Seasonal Multiplicative Decomposition')
multiplicative_fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the residual of the most recent `decomp` result.
decomp_resid = decomp.resid
decomp_resid.describe()

* Additive decomposition has uncaptured information (fluctuation of ±20).
* Multiplicative decomposition has a small seasonal component (~1 unit on values of 350-500 units), which may be weak/unhelpful.
Further seasonality validation is required; it will be done with ACF/PACF.

In [None]:
# STL is an additive decomposition; it offers freedom to the seasonal component.
# Parameters are named so the figure title can report them.
stl_period = 21
stl_seasonal = 101
stl = STL(time_df['Close'], period=stl_period, seasonal=stl_seasonal, robust=True)
result = stl.fit()

# Plot the decomposition: one panel per component
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 6))
ax1.plot(result.trend, label='Trend', color='red')
ax1.set_title('Trend Component')
ax2.plot(result.seasonal, label='Seasonal', color='blue')
ax2.set_title('Seasonal Component')
ax3.plot(result.resid, label='Residual', color='green')
ax3.set_title('Residual Component')
plt.suptitle(f'STL Decomposition of Close: Period: {stl_period}, Seasonal: {stl_seasonal}')
plt.tight_layout()
plt.show()

In [None]:
def plot_stl(period, seasonal):
    """Fit a robust STL on ``time_df['Close']`` and plot its three components.

    Parameters
    ----------
    period : int
        Forwarded to ``STL`` as ``period``.
    seasonal : int
        Forwarded to ``STL`` as ``seasonal``.
        NOTE(review): statsmodels documents this as an odd integer; confirm
        that every caller (e.g. the slider) only supplies odd values.

    Returns
    -------
    None
        The figure (trend, seasonal, residual panels) is shown via ``plt.show()``.
    """
    result = STL(time_df['Close'], period=period, seasonal=seasonal, robust=True).fit()

    # (series, line colour, panel title) for each of the three stacked panels.
    components = (
        (result.trend, 'red', 'Trend Component'),
        (result.seasonal, 'blue', 'Seasonal Component'),
        (result.resid, 'green', 'Residual Component'),
    )

    fig, axes = plt.subplots(3, 1, figsize=(8, 6))
    for ax, (series, color, title) in zip(axes, components):
        ax.plot(series, color=color)
        ax.set_title(title)

    fig.suptitle(f'STL Decomposition on Close: Period: {period}, Seasonal: {seasonal}')
    fig.tight_layout()
    plt.show()

# Sliders: period from 3 to 71 (step 1), seasonal from 7 to 151 (step 2).
stl_period_slider = widgets.IntSlider(min=3, max=71, step=1, value=21)
stl_seasonal_slider = widgets.IntSlider(min=7, max=151, step=2, value=21)
interact(plot_stl, period=stl_period_slider, seasonal=stl_seasonal_slider)

In [None]:
# Multiplicative STL decomposition:
# the trend is curved, so a multiplicative decomposition might fit better.
# It is obtained by running the (additive) STL on log(Close).
def plot_mstl(period, seasonal):
    """Fit a robust STL on ``log(time_df['Close'])`` and plot its three components.

    Parameters
    ----------
    period : int
        Forwarded to ``STL`` as ``period``.
    seasonal : int
        Forwarded to ``STL`` as ``seasonal``.
        NOTE(review): statsmodels documents this as an odd integer; confirm
        that every caller (e.g. the slider) only supplies odd values.

    Returns
    -------
    None
        The figure (trend, seasonal, residual panels, all on the log scale)
        is shown via ``plt.show()``.
    """
    log_close = np.log(time_df['Close'])
    result = STL(log_close, period=period, seasonal=seasonal, robust=True).fit()

    # (series, line colour, panel title) for each of the three stacked panels.
    components = (
        (result.trend, 'red', 'Trend Component'),
        (result.seasonal, 'blue', 'Seasonal Component'),
        (result.resid, 'green', 'Residual Component'),
    )

    fig, axes = plt.subplots(3, 1, figsize=(8, 6))
    for ax, (series, color, title) in zip(axes, components):
        ax.plot(series, color=color)
        ax.set_title(title)

    fig.suptitle(f'STL Decomposition on Log(Close): Period: {period}, Seasonal: {seasonal}')
    fig.tight_layout()
    plt.show()

# Sliders: period from 3 to 71 (step 1), seasonal from 3 to 151 (step 2).
mstl_period_slider = widgets.IntSlider(min=3, max=71, step=1, value=21)
mstl_seasonal_slider = widgets.IntSlider(min=3, max=151, step=2, value=21)
interact(plot_mstl, period=mstl_period_slider, seasonal=mstl_seasonal_slider)

Let yₜ be the forecast of the stock's closing price.

Tₜ, Sₜ, Rₜ are the components from the STL decomposition of the log data.

log(yₜ)=Tₜ+Sₜ+Rₜ

yₜ = exp(Tₜ+Sₜ+Rₜ)

yₜ = exp(Tₜ) . exp(Sₜ) . exp(Rₜ)

* The STL decomposition of the series and of the log-series shows much weaker seasonality than the classical decomposition, which indicates a weak seasonal component in the series.
* The log decomposition didn't improve the residual or the seasonality, suggesting the series behaves almost additively, with a **negligible multiplicative effect**.
* The residual component **dominates the decomposition** and exhibits spikes and **volatility**; it is not white noise, which implies that irregular fluctuation drives the series more than any stable periodic pattern.
* ACF and PACF are also needed here to check for further dependence and patterns.

[domain-info] Some noise in stock data is normal; it comes from daily news and events.

### Stationarity Check
many statistical models assume stationarity.

raw Close price is already non-stationary (has trend).

In [None]:
# weak stationary test
## augmented dickey fuller - diff based
## null hypothesis: non-stationary (series has a unit root)
# Both results must be computed in this cell so it runs on a fresh kernel.
adf_close = adfuller(time_df['Close'].dropna())
adf_rtn = adfuller(time_df['Return'].dropna())

# adfuller result layout used below: [0] test statistic, [1] p-value,
# [4] dict of critical values.
print("Close p-value:", adf_close[1])
print("Return p-value:", adf_rtn[1])

print("\nClose adf stat:", adf_close[0])
print("Return adf stat:", adf_rtn[0])

print('\nCritical Values for Close:')
for key, value in adf_close[4].items():
  print(f'{key}: {round(value, 3)}', end="\t")

print('\nCritical Values for Return:')
for key, value in adf_rtn[4].items():
  print(f'{key}: {round(value, 3)}', end="\t")

In [None]:
# weak stationary test
## kpss test
## null hypothesis: stationary series
## (regression='ct': per the statsmodels docs, stationary around a trend)
kpss_close = kpss(time_df['Close'], regression='ct')
kpss_rtn = kpss(time_df['Return'].dropna(), regression='ct')

# kpss result layout used below: [0] test statistic, [1] p-value,
# [3] dict of critical values.
print("Close p-value:", kpss_close[1])
print("Return p-value:", kpss_rtn[1])

# These are KPSS statistics, so they are labelled as such.
print("\nClose KPSS stat:", kpss_close[0])
print("Return KPSS stat:", kpss_rtn[0])

print('\nCritical Values for Close:')
for key, value in kpss_close[3].items():
  print(f'{key}: {round(value, 3)}', end="\t")

print('\nCritical Values for Return:')
for key, value in kpss_rtn[3].items():
  print(f'{key}: {round(value, 3)}', end="\t")

* Close is non-stationary according to both the ADF (p > 0.05) and KPSS (p < 0.05) tests.
* Return may be almost stationary: ADF suggests stationarity (p-value and test statistic are below the limits), but KPSS rejects its null hypothesis (p < 0.05, with the test statistic lying between the critical values).

In [None]:
# strict stationary test on return
## ks test (two-sample Kolmogorov-Smirnov on the first vs. second half)
## null hypothesis: both halves come from the same distribution
returns = time_df['Return'].dropna()
split = len(returns) // 2
# Second half runs to the end of the series (`split:`), so the final
# observation is included; `.iloc` makes the positional slicing explicit.
stat, pvalue = ks_2samp(returns.iloc[:split], returns.iloc[split:])
print(round(pvalue, 3))

In [None]:
# Same two-sample KS check on Close: first half vs. second half.
close_prices = time_df['Close'].dropna()
split = len(close_prices) // 2
# Second half runs to the end of the series (`split:`), so the final
# observation is included; `.iloc` makes the positional slicing explicit.
stat, pvalue = ks_2samp(close_prices.iloc[:split], close_prices.iloc[split:])
print(round(pvalue, 3))

* Return is strictly stationary, and Close is not stationary.

### Autocorrelation
correlation of the feature with its own past values

In [None]:
# Autocorrelation of the Close level up to lag 30.
close_levels = time_df["Close"].dropna()
close_acf_fig = plot_acf(close_levels, lags=30)
close_acf_fig.axes[0].set_title("ACF: Do Close show seasonal spikes?")
plt.show()

In [None]:
# Partial autocorrelation of the Close level up to lag 21.
# The title says PACF to match the plot, and the cell contains no stray
# attribute access, so it runs cleanly on a fresh kernel.
plot_pacf(time_df["Close"].dropna(), lags=21)
plt.title("PACF: Do Close show seasonal spikes?")
plt.show()

In [None]:
# Autocorrelation of simple returns up to lag 30.
return_series = time_df["Return"].dropna()
return_acf_fig = plot_acf(return_series, lags=30)
return_acf_fig.axes[0].set_title("ACF: Do Returns Contain Memory?")
plt.show()

In [None]:
# Partial autocorrelation of simple returns up to lag 30.
return_series = time_df["Return"].dropna()
return_pacf_fig = plot_pacf(return_series, lags=30)
return_pacf_fig.axes[0].set_title("PACF: Do Returns Contain Memory?")
plt.show()

* The ACF of Close shows gradual decay, typical of a series with trend, and no seasonal spikes.
* The ACF of returns shows no significant autocorrelation, indicating the absence of trend, seasonality, or linear memory in the mean.