In [6]:
import torch
import torch.nn as nn
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as pt
import plotly.graph_objs as go
from plotly.offline import iplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import typing
from datetime import datetime

device = "cpu"

In [7]:
df = pd.read_csv("data.csv")
df.columns = ["Date", "Course"]
df = df.set_index('Date')
df.index = pd.to_datetime(df.index)
if not df.index.is_monotonic_increasing:
    df = df.sort_index()
df.dropna(axis=0, how="any", inplace=True)
print(df)

             Course
Date               
2015-01-01  56.2376
2015-01-13  62.7363
2015-01-14  64.8425
2015-01-15  66.0983
2015-01-16  64.8337
...             ...
2024-02-06  91.2434
2024-02-07  90.6842
2024-02-08  91.1514
2024-02-09  91.2561
2024-02-10  90.8901

[2237 rows x 1 columns]


In [9]:
def lags(df: DataFrame, n_lags: int) -> DataFrame:
    dfc = df.copy()
    for i in range(1, n_lags + 1):
        dfc[f"lag{i}"] = dfc["Course"].shift(i)
    dfc = dfc.iloc[n_lags:]
    return dfc


input_dim = 100

df_lags = lags(df, input_dim)
print(df_lags)

             Course     lag1     lag2     lag3     lag4     lag5     lag6  \
Date                                                                        
2015-06-06  56.2463  54.9908  53.0590  53.4413  52.8213  52.9716  52.2907   
2015-06-09  56.0435  56.2463  54.9908  53.0590  53.4413  52.8213  52.9716   
2015-06-10  55.9100  56.0435  56.2463  54.9908  53.0590  53.4413  52.8213   
2015-06-11  54.8219  55.9100  56.0435  56.2463  54.9908  53.0590  53.4413   
2015-06-12  54.5285  54.8219  55.9100  56.0435  56.2463  54.9908  53.0590   
...             ...      ...      ...      ...      ...      ...      ...   
2024-02-06  91.2434  90.6626  90.2299  89.6678  89.2887  89.6090  89.5159   
2024-02-07  90.6842  91.2434  90.6626  90.2299  89.6678  89.2887  89.6090   
2024-02-08  91.1514  90.6842  91.2434  90.6626  90.2299  89.6678  89.2887   
2024-02-09  91.2561  91.1514  90.6842  91.2434  90.6626  90.2299  89.6678   
2024-02-10  90.8901  91.2561  91.1514  90.6842  91.2434  90.6626  90.2299   


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [11]:
def feature_label_split(df: DataFrame, target_col: str) -> tuple[DataFrame]:
    """
        Separation of validation and training sets
    """
    y = df[[target_col]]
    X = df.drop(columns=[target_col])
    return X, y


def train_val_test_split(df: pd.DataFrame, target_col: str, test_ratio: float) -> tuple[pd.DataFrame]:
    """
        Separation of validation and training sets
    """
    val_ratio = test_ratio / (1 - test_ratio)
    X, y = feature_label_split(df, target_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_ratio, shuffle=False)
    return X_train, X_val, X_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    df_lags, 'Course', 0.2)
print(len(X_train))
print(len(X_test))
print(len(X_val))

1281
428
428
