In [1]:
import pandas as pd

import src.paths as paths
from src.utils import clean_col_names

# Load the raw journal workbook; dtype=str keeps every cell as text so the
# type conversions below are explicit rather than inferred by the reader.
journal = pd.read_excel(io=paths.JOURNAL_RAW_FILE_PATH, dtype=str)

# Replace the headers with the names returned by the project helper.
journal.columns = clean_col_names(journal)

# Cast both amount columns from text to float in a single call.
journal = journal.astype({"DEBIT": float, "CREDIT": float})

# Parse the day/month/year text of both date columns into datetimes.
journal = journal.assign(
    **{
        name: pd.to_datetime(journal[name], format="%d/%m/%Y")
        for name in ("DATE", "ECHEANCE")
    }
)

# Derive the "YYYYMM" period key from the parsed "DATE" column.
journal = journal.assign(AAAAMM=journal["DATE"].dt.strftime("%Y%m"))

# Persist the structured journal as parquet, leaving the index out of the file.
journal.to_parquet(paths.JOURNAL_STRUCTURED_FILE_PATH, index=False)

In [10]:
def load_data():
    """Read the cube parquet file and return its contents.

    Returns:
        The object produced by ``pd.read_parquet`` for the file at
        ``paths.CUBE_FILE_PATH``. Nothing is cached here, so the file is
        read again on every call.
    """
    return pd.read_parquet(paths.CUBE_FILE_PATH)

In [15]:
# Show the distinct values found in the cube's "DATE_STR" column.
date_str_column = load_data()["DATE_STR"]
date_str_column.unique()

array(['2023-04-03', '2023-04-14', '2023-04-23', '2023-04-27',
       '2023-05-05', '2023-05-09', '2023-05-12', '2023-05-13',
       '2023-05-22', '2023-05-23', '2023-05-24', '2023-05-25',
       '2023-05-26', '2023-05-31', '2023-06-05', '2023-06-06',
       '2023-06-12', '2023-06-19', '2023-06-25', '2023-06-26',
       '2023-06-30', '2023-07-04', '2023-07-09', '2023-07-10',
       '2023-07-13', '2023-07-17', '2023-07-19', '2023-07-20',
       '2023-07-25', '2023-07-28', '2023-07-30', '2023-08-02',
       '2023-08-18', '2023-08-30', '2023-08-31', '2023-09-09',
       '2023-09-11', '2023-09-13', '2023-09-18', '2023-09-19',
       '2023-09-20', '2023-09-21', '2023-09-22', '2023-09-25',
       '2023-09-26', '2023-09-27', '2023-09-28', '2023-09-29',
       '2023-09-30', '2023-10-01', '2023-10-02', '2023-10-03',
       '2023-10-04', '2023-10-05', '2023-10-09', '2023-10-10',
       '2023-10-11', '2023-10-12', '2023-10-13', '2023-10-15',
       '2023-10-16', '2023-10-17', '2023-10-18', '2023-