----
# Data Splitting
----

This notebook performs a chronological train/validation split on both the raw cleaned data and the stationary data.

Baseline forecasting uses the raw data, while more complex forecasting methods such as ARIMA require stationary data.

## Set Up
----

In [1]:
import numpy as np
import pandas as pd

# plotting
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns

# stats
from statsmodels.api import tsa # time series analysis
import statsmodels.api as sm

## Raw Daily Data
---

In [3]:
# Load the cleaned daily table; the first CSV column becomes the row index.
raw_csv_path = '../../data/daily_data_clean.csv'
raw_data = pd.read_csv(raw_csv_path, index_col=0)

In [5]:
# Print the index range, per-column non-null counts and dtypes of the raw frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1828 entries, 2019-07-29 to 2024-07-29
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1828 non-null   float64
 1   High       1828 non-null   float64
 2   Low        1828 non-null   float64
 3   Close      1828 non-null   float64
 4   Adj Close  1828 non-null   float64
 5   Volume     1828 non-null   float64
dtypes: float64(6)
memory usage: 100.0+ KB


In [6]:
# Chronological hold-out split: about 4 years for training, the final year
# for validation.  The CSV is read without date parsing, so the index holds
# ISO-formatted date strings and plain string comparison orders rows by date.
#
# One boolean mask drives both halves so the two sets are exact complements.
# The earlier bounds (<= "2023-07-29" for training, > "2023-07-30" for
# validation) left the 2023-07-30 row in neither set.
raw_train_mask = raw_data.index <= "2023-07-29"
train_raw_data = raw_data.loc[raw_train_mask]
val_raw_data = raw_data.loc[~raw_train_mask]

# Every row must land in exactly one of the two sets.
assert len(train_raw_data) + len(val_raw_data) == len(raw_data)

In [8]:
# Overlay both splits on one chart so the train/validation boundary is visible.
fig = go.Figure()
for split_frame, split_label in (
    (train_raw_data, "Training Set"),
    (val_raw_data, "Validation Set"),
):
    fig.add_trace(
        go.Scatter(
            x=split_frame.index,
            y=split_frame['Adj Close'],
            mode='lines',
            name=split_label,
        )
    )
fig.update_layout(
    title="Microsoft Stock Price Trends: Adjusted Close Prices",
    xaxis_title="Date",
    yaxis_title="Adj Close",
)
fig.show()

In [7]:
# Write each raw split to its own CSV file in the data directory.
for split_frame, csv_path in (
    (train_raw_data, '../../data/train_raw.csv'),
    (val_raw_data, '../../data/val_raw.csv'),
):
    split_frame.to_csv(csv_path)

## Stationary Daily Data
---

In [9]:
# Load the stationary daily table; the first CSV column becomes the row index.
stationary_csv_path = '../../data/daily_data_stationary.csv'
stationary_data = pd.read_csv(stationary_csv_path, index_col=0)

In [10]:
# Print the index range, per-column non-null counts and dtypes of the stationary frame.
stationary_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1818 entries, 2019-08-05 to 2024-07-26
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Open                 1818 non-null   float64
 1   High                 1818 non-null   float64
 2   Low                  1818 non-null   float64
 3   Close                1818 non-null   float64
 4   Adj Close            1818 non-null   float64
 5   Volume               1818 non-null   float64
 6   Trend                1818 non-null   float64
 7   Seasonal             1818 non-null   float64
 8   Residual             1818 non-null   float64
 9   seasonal_difference  1818 non-null   float64
dtypes: float64(10)
memory usage: 156.2+ KB


In [12]:
# Chronological hold-out split of the `seasonal_difference` column: about
# 4 years for training, the final year for validation.  The CSV is read
# without date parsing, so the index holds ISO-formatted date strings and
# plain string comparison orders rows by date.
#
# One boolean mask drives both halves so the two sets are exact complements.
# The earlier bounds (<= "2023-07-29" for training, > "2023-07-30" for
# validation) left the 2023-07-30 row in neither set.
stat_train_mask = stationary_data.index <= "2023-07-29"
train_stat_data = stationary_data.loc[stat_train_mask, "seasonal_difference"]
val_stat_data = stationary_data.loc[~stat_train_mask, "seasonal_difference"]

# Every row must land in exactly one of the two sets.
assert len(train_stat_data) + len(val_stat_data) == len(stationary_data)

In [14]:
# Overlay both splits of the seasonal-difference series on one chart.
# The title previously read "Adjusted Close Prices" (copied from the raw-data
# chart), although the plotted values and the y-axis are the seasonal
# difference; it now names the series actually shown.
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_stat_data.index, y=train_stat_data, mode='lines', name="Training Set"))
fig.add_trace(go.Scatter(x=val_stat_data.index, y=val_stat_data, mode='lines', name="Validation Set"))
fig.update_layout(
    yaxis_title="Seasonal Difference",
    xaxis_title="Date",
    title="Microsoft Stock Price Trends: Seasonal Difference"
)
fig.show()

In [15]:
# Write each stationary split to its own CSV file in the data directory.
for split_series, csv_path in (
    (train_stat_data, '../../data/train_stat.csv'),
    (val_stat_data, '../../data/val_stat.csv'),
):
    split_series.to_csv(csv_path)

**TODOs:**

- Write the introduction and conclusion
- Add comments
