In [1]:
import os
from pathlib import Path

from hydra import initialize_config_dir, compose

# Hydra config directory: the first entry of the executable search path, two
# levels up, then into 'config'.
# NOTE(review): this depends on how PATH is ordered when the kernel starts
# (presumably an activated virtualenv's bin directory comes first) -- confirm.
_exec_dir = Path(os.get_exec_path()[0])
_config_dir = _exec_dir.parent.parent / 'config'

with initialize_config_dir(version_base=None, config_dir=str(_config_dir)):

    # Compose the configuration with the '+models=model_1' override and keep
    # only its 'models' node; every setting below is read from that node.
    cfg = compose(overrides=["+models=model_1"])['models']

    load_cfg = cfg['load']
    transform_cfg = cfg['transform']

    # Raw input location.
    LOAD_DATA_PATH = load_cfg['LOAD_DATA_PATH']
    LOAD_DATA_FILE = load_cfg['LOAD_DATA_FILE']

    # Output directory and file names for the train/test splits.
    TRANSFORM_DATA_PATH = transform_cfg['TRANSFORM_DATA_PATH']

    X_TRAIN_FILE = transform_cfg['TRANSFORM_DATA_FILE_TRAIN_X']
    Y_TRAIN_FILE = transform_cfg['TRANSFORM_DATA_FILE_TRAIN_Y']
    X_TEST_FILE = transform_cfg['TRANSFORM_DATA_FILE_TEST_X']
    Y_TEST_FILE = transform_cfg['TRANSFORM_DATA_FILE_TEST_Y']

In [2]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split

import pyarrow as pa
import pyarrow.parquet as pq

# Full path of the raw parquet file, built from the loaded configuration.
RAW_DATA = str(Path(LOAD_DATA_PATH) / LOAD_DATA_FILE)

# Read the table, copy the index into a 'Date' column, replace the index with a
# fresh positional one, then derive calendar features from 'Date'.
# The `.dt` accessor requires 'Date' (i.e. the original index) to be datetime-like.
data = pq.read_table(RAW_DATA).to_pandas()
data = (
    data
    .assign(Date=data.index)
    .reset_index(drop=True)
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
        Day=lambda frame: frame['Date'].dt.day,
    )
)

# Model inputs and the column to predict.
features = ['Open', 'High', 'Low', 'Volume', 'Year', 'Month', 'Day']
target = 'Close'

X, y = data[features], data[target]

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): train_test_split shuffles rows by default; if a chronological
# hold-out is intended for this date-indexed data, confirm this is desired.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
import pyarrow as pa
import pyarrow.parquet as pq

# Make sure the output directory exists before writing, so the writes below do
# not fail when the directory is missing (e.g. on a fresh checkout).
output_dir = Path(TRANSFORM_DATA_PATH)
output_dir.mkdir(parents=True, exist_ok=True)

# Each split paired with its configured file name. The targets are Series, so
# they are wrapped in one-column frames before conversion to Arrow tables.
splits = (
    (X_train, X_TRAIN_FILE),
    (y_train.to_frame(), Y_TRAIN_FILE),
    (X_test, X_TEST_FILE),
    (y_test.to_frame(), Y_TEST_FILE),
)

# Convert every split to an Arrow table and persist it as parquet.
for frame, file_name in splits:
    p_data = pa.Table.from_pandas(frame)
    pq.write_table(p_data, output_dir / file_name)