# DATA5000 DGEA Deep Causal AI

## Data Engineering Economic Time Series Data

In [1]:
!pip install tsgm

Collecting tsgm
  Downloading tsgm-0.0.7.tar.gz (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting antropy==0.1.6 (from tsgm)
  Downloading antropy-0.1.6.tar.gz (17 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting yfinance==0.2.28 (from tsgm)
  Downloading yfinance-0.2.28-py2.py3-none-any.whl.metadata (11 kB)
Collecting dtaidistance>=2.3.10 (from tsgm)
  Downloading dtaidistance-2.3.12.tar.gz (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requireme

In [2]:
!pip install openpyxl



In [6]:
!pip install pandas numpy matplotlib keras



In [7]:
import numpy as np
import pandas as pd
import openpyxl

import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf

import tsgm

# List of all DGEA worksheets - in one Excel file
## We load all sheets at once and then retrieve each sheet from a dictionary.

In [8]:
# Worksheet names to read from the DGEA workbook; the order is unchanged.
# NOTE: 'PARTICPATION' is kept exactly as spelled, because these strings are used
# as lookup keys for the loaded sheets.
dgea_files = [
    # AUS_* sheets, real GDP and GNI
    'AUS_CPI_QUARTER', 'AUS_CPI_MONTHLY', 'AUS_MONEY', 'AUS_CASHRATE',
    'AUS_UNEMPLOY_PARTICPATION_RATE', 'AUS_EMPLOY_TOPOP',
    'REALGDP', 'AUS_GNI',
    # commodities
    'GOLDYAHOO', 'CRUDEOIL', 'IRONORE', 'SILVER', 'COPPER',
    # US / global debt, money and rates
    'USDEBT-MARKETVALUE', 'US-TREASURY-DEBT', 'USMONEYCIRCULATE',
    'USD_BROADMONEY_CIRCULATION', 'US_CPI_MONTHLY', 'FEDFUNDRATE',
    'GLOBAL-DEBT-GDP',
]

In [10]:
len(dgea_files)

20

## Load the entire DGEA Excel Workbook - all sheets

In [13]:
# load excel workbook containing multiple sheets
# sheet_name receives the list of worksheet names, and the result is looked up by
# sheet name in the cells below (e.g. dgea['AUS_CPI_QUARTER']).
# NOTE(review): the recorded run of this cell failed with "HTTP Error 404: Not Found";
# the URL must be confirmed/updated before any cell that uses `dgea` can run.
dgea = pd.read_excel('https://s3.ap-southeast-2.wasabisys.com/data5000/DGEA_ECONOMICS_DATASET2023v1.xlsx', sheet_name=dgea_files)

HTTPError: HTTP Error 404: Not Found

## Individual dataframes for each worksheet

# Data Engineering 101: control the format of Dates and Times

In [14]:
aus_cpi_quarter = dgea['AUS_CPI_QUARTER']
aus_cpi_quarter

NameError: name 'dgea' is not defined

In [15]:
# get info - note the datatypes
aus_cpi_quarter.info()

NameError: name 'aus_cpi_quarter' is not defined

In [16]:
# Promote 'Datetime' to the row index; the column itself is kept for now
aus_cpi_quarter = aus_cpi_quarter.set_index('Datetime', drop=False)
aus_cpi_quarter

NameError: name 'aus_cpi_quarter' is not defined

In [17]:
# 'Datetime' is now the index of every observation, so the duplicate column can be dropped
aus_cpi_quarter = aus_cpi_quarter.drop(columns='Datetime')
aus_cpi_quarter

NameError: name 'aus_cpi_quarter' is not defined

# The following examples show how to change the format of the datetimes

In [None]:
from datetime import datetime

# Source date, written year-first (YYYY-MM-DD)
input_date_str = "2023-10-24"

# Parse the text into a datetime object, then render it day-first (DD-MM-YYYY)
input_date = datetime.strptime(input_date_str, "%Y-%m-%d")
output_date_str = f"{input_date:%d-%m-%Y}"

print(f"Input Date (YYYY-MM-DD): {input_date_str}")
print(f"Output Date (DD-MM-YYYY): {output_date_str}")

# And if Datetime is an index to observations

In [None]:
import pandas as pd

# Sample data: a value column plus ISO-formatted date strings
data = {'value': [10, 20, 30, 40],
        'date': ['2023-01-15', '2023-02-20', '2023-03-25', '2023-04-30']}

# Parse the 'date' strings and make them the index in one chained expression
df = (
    pd.DataFrame(data)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Re-render the index labels day-first (DD-MM-YYYY)
df.index = df.index.strftime('%d-%m-%Y')

# Show the transformed DataFrame
df

In [None]:
# Transform the date format from YYYY-MM-DD to DD-MM-YYYY
# aus_cpi_quarter.index = aus_cpi_quarter.index.strftime('%d-%m-%Y')
# aus_cpi_quarter

In [None]:
aus_cpi_quarter.index

## Now that datetime is the index to all observed events (and their measures or values), we can create a window into the period of interest.

In [None]:
start_date = '2021-12-31'
end_date = '2024-03-31'

In [None]:
temp = aus_cpi_quarter.loc[start_date:end_date]
temp

In [None]:
temp2 = aus_cpi_quarter.loc[aus_cpi_quarter.index > '2000']
temp2

# Australian Inflation Rate (Monthly)

In [None]:
aus_cpi_monthly = dgea['AUS_CPI_MONTHLY']
aus_cpi_monthly

In [None]:
aus_cpi_monthly.info()

In [None]:
# Re-key the monthly CPI frame by its 'Datetime' column (the column moves into the index)
aus_cpi_monthly = aus_cpi_monthly.set_index('Datetime')
aus_cpi_monthly

In [None]:
aus_cpi_monthly[start_date:end_date]

In [None]:
aus_money = dgea['AUS_MONEY']
aus_money

In [None]:
aus_money.info()

In [None]:
# Move 'Datetime' out of the columns and into the row index
aus_money = aus_money.set_index('Datetime')
aus_money

In [None]:
aus_money[start_date:end_date]

# Cash Rate Target (Quarterly)

In [None]:
# Take the cash-rate sheet and index it by its 'Datetime' column
aus_cashrate = dgea['AUS_CASHRATE'].set_index('Datetime')
aus_cashrate

In [None]:
aus_cashrate[start_date:end_date]

# Australian Employment-to-Population Percentage

In [None]:
# Take the employment-to-population sheet and index it by its 'Datetime' column
aus_employ_topop = dgea['AUS_EMPLOY_TOPOP'].set_index('Datetime')
aus_employ_topop

In [None]:
aus_gni = dgea['AUS_GNI']
aus_gni

In [None]:
aus_gni.info()

## For Australian GNI - note that the Datetime is an integer (number) and not yet a datetime object.

In [None]:
aus_gni['Datetime'] = pd.to_datetime(aus_gni['Datetime'], format='%Y', dayfirst = False)
aus_gni['Datetime']

In [None]:
# The converted 'Datetime' column becomes the row index
aus_gni = aus_gni.set_index('Datetime')
aus_gni

In [None]:
aus_gni['2000':'2021-01-01']

In [None]:
# Sheet key keeps its original spelling; index the frame by 'Datetime'
aus_unemploy_participation_rate = dgea['AUS_UNEMPLOY_PARTICPATION_RATE'].set_index('Datetime')
aus_unemploy_participation_rate

In [None]:
aus_unemploy_participation_rate[start_date:end_date]

In [None]:
# This sheet's date column is named 'Date'; use it as the index, then show the window
copper = dgea['COPPER'].set_index('Date')
copper.loc[start_date:end_date]

In [None]:
crude = dgea['CRUDEOIL']
crude

In [None]:
crude.info()

In [None]:
# Reload the crude-oil sheet and index it by its 'Date' column
crude = dgea['CRUDEOIL'].set_index('Date')

In [None]:
crude[start_date:end_date]

In [None]:
real_gdp = dgea['REALGDP']
real_gdp

In [None]:
real_gdp.info()

In [None]:
real_gdp['Datetime'] = pd.to_datetime(real_gdp['Datetime'], format='%Y', dayfirst = False)
real_gdp['Datetime']

In [None]:
# The converted 'Datetime' column becomes the row index
real_gdp = real_gdp.set_index('Datetime')
real_gdp

In [None]:
silver = dgea['SILVER']
silver

In [None]:
# Index the silver frame by its 'Date' column
silver = silver.set_index('Date')
silver

In [None]:
gold = dgea['GOLDYAHOO']
gold

In [None]:
gold.info()

In [None]:
# Convert the 'date strings' column to datetime
gold['Date'] = pd.to_datetime(gold['Date'], format='%b %d, %Y')
gold['Date']

In [None]:
# The parsed 'Date' column becomes the row index
gold = gold.set_index('Date')
gold

In [None]:
gold.info()

In [None]:
gold.index

## The following are examples on how to merge dataframes together

In [None]:
import pandas as pd

# Two toy frames that share exactly the same four timestamps
index = pd.to_datetime(['2023-10-01', '2023-10-02', '2023-10-03', '2023-10-04'])
data1 = {'value1': [10, 20, 30, 40], 'value2': [100, 200, 300, 400]}
data2 = {'value3': [50, 60, 70, 80], 'value4': [500, 600, 700, 800]}
df1, df2 = (pd.DataFrame(columns_, index=index) for columns_ in (data1, data2))

# Outer join on the index: every date present in either frame is kept
merged_df = pd.merge(df1, df2, how='outer', left_index=True, right_index=True)

print(merged_df)

In [None]:
import pandas as pd

# Two toy frames whose date ranges are offset by one day
index1 = pd.DatetimeIndex(['2023-10-01', '2023-10-02', '2023-10-03', '2023-10-04'])
index2 = pd.DatetimeIndex(['2023-10-02', '2023-10-03', '2023-10-04', '2023-10-05'])
data1 = {'value1': [10, 20, 30, 40], 'value2': [100, 200, 300, 400]}
data2 = {'value3': [50, 60, 70, 80], 'value4': [500, 600, 700, 800]}

df1 = pd.DataFrame(data1, index=index1)
df2 = pd.DataFrame(data2, index=index2)

# Inner join on the index: only dates present in both frames survive
merged_df = df1.merge(
    right=df2,
    how='inner',
    left_index=True,
    right_index=True,
)

print(merged_df)

# Combine Dataframes

In [None]:
import pandas as pd

# Two toy frames whose date ranges overlap on three days and differ at each end
index1 = pd.DatetimeIndex(['2023-10-01', '2023-10-02', '2023-10-03', '2023-10-04'])
index2 = pd.DatetimeIndex(['2023-10-02', '2023-10-03', '2023-10-04', '2023-10-05'])
data1 = {'value1': [10, 20, 30, 40], 'value2': [100, 200, 300, 400]}
data2 = {'value3': [50, 60, 70, 80], 'value4': [500, 600, 700, 800]}

df1 = pd.DataFrame(data1, index=index1)
df2 = pd.DataFrame(data2, index=index2)

# combine_first aligns on the union of dates and columns; remaining gaps are set to 0
merged_df = (
    df1
    .combine_first(df2)
    .fillna(0)
)

print(merged_df)

## Let's continue with data transformation

In [None]:
# Take the Fed funds rate sheet and index it by its 'Datetime' column
fedsfund = dgea['FEDFUNDRATE'].set_index('Datetime')

In [None]:
fedsfund[start_date:end_date]

In [None]:
global_debt_gdp = dgea['GLOBAL-DEBT-GDP']
global_debt_gdp

In [None]:
global_debt_gdp.info()

## Global Debt to GDP - we will not use this; however, you may want to include one or two for Assessment #3.

In [None]:
# NOTE(review): this rebinds the name `datetime` to the module; an earlier cell bound it
# to the class via `from datetime import datetime`. Nothing in this cell uses the name.
import datetime

# Parse the 'Datetime' values with a year-only format ('%Y') and store the result
# back in the same column.
global_debt_gdp['Datetime'] = pd.to_datetime(global_debt_gdp['Datetime'], dayfirst=False, format='%Y')
global_debt_gdp

In [None]:
# The converted 'Datetime' column becomes the row index
global_debt_gdp = global_debt_gdp.set_index('Datetime')

In [None]:
global_debt_gdp

In [None]:
global_debt_gdp[start_date:end_date]

In [None]:
# Index the iron-ore sheet by its 'Date' column, then show the window of interest
ironore = dgea['IRONORE'].set_index('Date')
ironore.loc[start_date:end_date]

In [None]:
real_gdp[start_date:end_date]

In [None]:
us_treasury_debt = dgea['US-TREASURY-DEBT']
us_treasury_debt

In [None]:
us_treasury_debt.info()

In [None]:
# Index by 'Datetime', then show the window of interest
us_treasury_debt = us_treasury_debt.set_index('Datetime')
us_treasury_debt.loc[start_date:end_date]

In [None]:
us_cpi_monthly = dgea['US_CPI_MONTHLY']
us_cpi_monthly

In [None]:
us_cpi_monthly.info()

In [None]:
# This sheet's date column is named 'Month'; use it as the index, then show the window
us_cpi_monthly = us_cpi_monthly.set_index('Month')
us_cpi_monthly.loc[start_date:end_date]

In [None]:
us_broadmoney = dgea['USD_BROADMONEY_CIRCULATION']
us_broadmoney

In [None]:
us_broadmoney.info()

In [None]:
# Index by 'Datetime', then show the window of interest
us_broadmoney = us_broadmoney.set_index('Datetime')
us_broadmoney.loc[start_date:end_date]

In [None]:
us_debt = dgea['USDEBT-MARKETVALUE']
us_debt

In [None]:
us_debt.info()

In [None]:
# Index by 'Datetime', then show the window of interest
us_debt = us_debt.set_index('Datetime')
us_debt.loc[start_date:end_date]

# Merge dataframes

## Combine the dataframes. If a date appears in only one dataframe, add it as a new row; wherever a dataframe has no data for that date, fill the value with NaN.

## Later, we shall use Generative AI to Augment the missing "NaN" data.

In [None]:
aus_cpi_monthly[start_date:end_date]

In [None]:
aus_money[start_date:end_date]

# Combine time series

# At this point, we are starting to bring each time series within a time window and into alignment

In [None]:
import pandas as pd

# Align the monthly CPI and money frames on the union of their dates.
# Gaps are left as NaN (as the section text above describes) so the columns stay
# numeric; filling with the string 'NA' would turn them into object columns.
merged_df = aus_cpi_monthly.combine_first(aus_money)
merged_df

In [None]:
aus_cpi_monthly['AUS_CPI_MONTHLY']

In [None]:
merged_df[start_date:end_date]

In [None]:
aus_cashrate

In [None]:
# Add the cash-rate frame; values missing on either side stay NaN rather than 'NA' text
merged_df = merged_df.combine_first(aus_cashrate)
merged_df

In [None]:
merged_df[start_date:end_date]

In [None]:
aus_employ_topop

In [None]:
# Add the employment-to-population frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(aus_employ_topop)
merged_df

In [None]:
merged_df[start_date:end_date]

In [None]:
aus_gni

In [None]:
# Add the GNI frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(aus_gni)
merged_df

In [None]:
merged_df[start_date:end_date]

In [None]:
aus_unemploy_participation_rate

In [None]:
# Add the unemployment/participation frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(aus_unemploy_participation_rate)
merged_df

In [None]:
merged_df[start_date:end_date]

In [None]:
copper

In [None]:
# Add the copper frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(copper)
merged_df

In [None]:
crude

In [None]:
# Add the crude-oil frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(crude)
merged_df

In [None]:
fedsfund

In [None]:
# Add the Fed funds frame; gaps stay NaN so columns remain numeric
merged_df = merged_df.combine_first(fedsfund)
merged_df

In [None]:
ironore

In [None]:
# Add the iron-ore frame (gaps stay NaN), then show the window of interest
merged_df = merged_df.combine_first(ironore)
merged_df[start_date:end_date]

In [None]:
merged_df.columns.to_list()

In [None]:
merged_df[start_date:end_date]

In [None]:
gold

In [None]:
# Add the gold frame (gaps stay NaN), then show the window of interest
merged_df = merged_df.combine_first(gold)
merged_df[start_date:end_date]

In [None]:
merged_df.info()

In [None]:
silver

In [None]:
# Add the silver frame (gaps stay NaN), then show the window of interest
merged_df = merged_df.combine_first(silver)
merged_df[start_date:end_date]

In [None]:
merged_df.info()

In [None]:
# Persist the merged table with the index written under the 'Datetime' header.
# na_rep makes any NaN appear in the file as the explicit marker 'NA' instead of an
# empty field, so the missing-value encoding does not depend on upstream filling.
merged_df.to_csv('dgea_economic_timeseries_all_dates.csv', header=True, index=True, index_label='Datetime', na_rep='NA')

# Data Engineering - missing values strategies

## We need to adopt a strategy, and there can be many, on how to deal with missing values.

## One such strategy is to "forward-fill" i.e., use the current value as the next datetime observation if that observation has no value.

## Alternatively, "backward-fill" could be applied.

## Our approach: we apply GANs and VAEs

In [None]:
!pip install tsgm

In [None]:
!pip install pandas numpy matplotlib keras openpyxl

In [None]:
import numpy as np
import pandas as pd
import openpyxl
import random

import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf

import tsgm

In [None]:
dgea_timeseries_file_excel = 'https://s3.ap-southeast-2.wasabisys.com/data5000/dgea_economic_timeseries_all_dates.xlsx'

In [None]:
dgea_timeseries_file = 'https://s3.ap-southeast-2.wasabisys.com/data5000/dgea_economic_timeseries_all_dates.csv'

In [None]:
dgea = pd.read_csv(dgea_timeseries_file)
dgea

# Data Augmentation and Handling of Missing Timeseries Observations

## First - let's see some basic examples

## The library that we are using is called "TSGM", which is short for Time Series Generative Modelling (see: https://tsgm.readthedocs.io/en/latest/)

In [None]:
# generate a time series - sine waves
# parameters: 100 sine waves (time series), with each having a length of time of 64
# For each time series, there are two features.
# The maximum value, or the peak, of each sine wave is 20

X = tsgm.utils.gen_sine_dataset(100, 64, 2, max_value=20)
X

In [None]:
len(X)

In [None]:
X.shape

## Gaussian Noise

In [None]:
aug_model = tsgm.models.augmentations.GaussianNoise()
samples = aug_model.generate(X=X, n_samples=10, variance=0.2)
samples

## Helper functions to plot time series

In [None]:
# function to plot
def plot(timesteps, X, plot_id=1, title="Original Time Series"):
    """Draw the first feature of every series in ``X`` on one panel of a 1x2 grid.

    Args:
        timesteps: x-axis values shared by all series.
        X: array indexed as ``X[sample, time, feature]``; only feature 0 is drawn.
        plot_id: 1-based position of the panel inside the 1x2 subplot grid.
        title: text shown above the panel.
    """
    plt.subplot(1, 2, plot_id)

    # One line per sample, labelled by its position within X
    for sample_id, series in enumerate(X):
        plt.plot(timesteps, series[:, 0], label=f"Sample #{sample_id}")

    plt.title(title)
    plt.legend()


In [None]:
def plot_samples_aug(X, Xaug, n_samples=5, figsize=(20, 20)):
    """Plot random original series next to random augmented series.

    Args:
        X: original data, indexed as ``X[sample, time, feature]``.
        Xaug: augmented data with the same layout; its time axis is plotted
            against the timesteps derived from ``X``.
        n_samples: how many series to draw from each array. Capped at the
            number of series available so small arrays do not raise.
        figsize: size passed to ``plt.figure`` for the two-panel figure.
    """
    # The figure must exist before anything is drawn on it; creating it after
    # plotting would open an extra, empty figure instead of sizing this one.
    plt.figure(figsize=figsize)

    timesteps = np.arange(0, X.shape[1], 1)

    # random.sample raises if asked for more items than the population holds
    n_orig = min(n_samples, X.shape[0])
    n_aug = min(n_samples, Xaug.shape[0])
    sample_ids = random.sample(range(X.shape[0]), n_orig)
    sample_ids_aug = random.sample(range(Xaug.shape[0]), n_aug)

    plot(timesteps, X[sample_ids])
    plot(timesteps, Xaug[sample_ids_aug], title="Augmented Time Series", plot_id=2)

    plt.show()

In [None]:
plot_samples_aug(X, samples)

## Shuffling

In [None]:
aug_model = tsgm.models.augmentations.Shuffle()
shuffle_samples = aug_model.generate(X=X, n_samples=3)
shuffle_samples

## Magnitude Warping

In [None]:
aug_model = tsgm.models.augmentations.MagnitudeWarping()
warp_mag_samples = aug_model.generate(X=X, n_samples=10, sigma=1)
warp_mag_samples

In [None]:
plot_samples_aug(X, warp_mag_samples)

## Dynamic Time Warping (Barycentric Average)

In [None]:
# DTW barycentric-averaging augmenter
aug_model = tsgm.models.augmentations.DTWBarycentricAveraging()

# Pick 10 distinct series from X at random; they are handed to generate() as
# its `initial_timeseries` argument
initial_timeseries = X[random.sample(range(X.shape[0]), 10)]

time_warp_samples = aug_model.generate(
    X=X,
    n_samples=10,
    initial_timeseries=initial_timeseries,
)
time_warp_samples

In [None]:
plot_samples_aug(X, time_warp_samples)

## Generative Models - VAEs

In [None]:
# number of time series, length of time, number of features
n, n_ts, n_features = 1000, 24, 5

# create toy sine wave time series as before
data = tsgm.utils.gen_sine_dataset(n, n_ts, n_features)

# but scale the values
scaler = tsgm.utils.TSFeatureWiseScaler()

scaled_data = scaler.fit_transform(data)

# our toy sine data
scaled_data

In [None]:
scaled_data.shape

## Now we can use Generative Models to "FIT" our data
## Then we can generate samples

In [None]:
# as before we have 1000 sine waves, each of length 24, and each has 5 features
n, n_ts, n_features  = 1000, 24, 5


# Regenerate the toy dataset and rescale it with a freshly fitted scaler
data = tsgm.utils.gen_sine_dataset(n, n_ts, n_features)
scaler = tsgm.utils.TSFeatureWiseScaler()
scaled_data = scaler.fit_transform(data)

# specify generative architecture
# NOTE(review): the third argument (10) is presumably the latent dimension — confirm
# against the tsgm zoo documentation.
architecture = tsgm.models.zoo["vae_conv5"](n_ts, n_features, 10)

# encoder-decoder
encoder, decoder = architecture.encoder, architecture.decoder

# VAE
vae = tsgm.models.cvae.BetaVAE(encoder, decoder)

# optimiser
vae.compile(optimizer=keras.optimizers.Adam())

# fit on our scaled toy sine wave data
# NOTE(review): the scaler above is built without an explicit range, so the
# "[-1,1]" range previously stated here is not established by this cell — confirm.
vae.fit(scaled_data, epochs=10, batch_size=64)

# generate samples
vae_samples = vae.generate(10)

In [None]:
# get the first tensor
vae_samples[0]

In [None]:
len(vae_samples[0])

In [None]:
# this is a tensorflow tensor (works the same in Pytorch)
vae_samples

In [None]:
# extract the array that is inside the tensor
vae_array = vae_samples.numpy()
vae_array

In [None]:
# there are 10 time series, each of length 24, and has 5 features
vae_array.shape

In [None]:
# get the first time series
vae_array[0]

In [None]:
# time series has a length of 24 and 5 features
vae_array[0].shape

In [None]:
# function to plot
def plot_vae(timesteps, X, plot_id=1, title="Original Time Series"):
    """Plot feature 0 of each series in ``X`` on one panel of a 1x2 subplot grid.

    Args:
        timesteps: x-axis values shared by all series.
        X: array indexed as ``X[sample, time, feature]``.
        plot_id: 1-based panel position within the 1x2 grid.
        title: panel title.
    """
    ax = plt.subplot(1, 2, plot_id)

    n_series = X.shape[0]
    for sample_id in range(n_series):
        first_feature = X[sample_id, :, 0]
        ax.plot(timesteps, first_feature, label=f"Sample #{sample_id}")

    ax.set_title(title)
    ax.legend()

In [None]:
def plot_vae_aug(X, Xaug, n_samples=5):
    """Show ``n_samples`` random original series beside ``n_samples`` random VAE series.

    Args:
        X: original data, indexed as ``X[sample, time, feature]``.
        Xaug: generated data with the same layout.
        n_samples: number of series drawn (without replacement) from each array.
    """
    # Draw the original indices first, then the generated ones, so the random
    # module is consumed in the same order on every call.
    picked_orig = random.sample(range(X.shape[0]), n_samples)
    picked_aug = random.sample(range(Xaug.shape[0]), n_samples)

    timesteps = np.arange(X.shape[1])

    panels = (
        (X[picked_orig], {}),
        (Xaug[picked_aug], {"title": "VAE Time Series", "plot_id": 2}),
    )
    for series, panel_kwargs in panels:
        plot_vae(timesteps, series, **panel_kwargs)

    plt.show()

In [None]:
plot_vae_aug(scaled_data, vae_array)

# Generative Models on DGEA Dataset

In [None]:
dgea_sheets = ['AUS_CPI_QUARTER',
              'AUS_CPI_MONTHLY',
              'AUS_MONEY',
              'AUS_CASHRATE',
              'AUS_UNEMPLOY_PARTICPATION_RATE',
              'AUS_EMPLOY_TOPOP',
              'REALGDP',
              'AUS_GNI',
              'GOLDYAHOO',
              'CRUDEOIL',
              'IRONORE',
              'SILVER',
              'COPPER',
              'USDEBT-MARKETVALUE',
              'US-TREASURY-DEBT',
              'USMONEYCIRCULATE',
              'USD_BROADMONEY_CIRCULATION',
              'US_CPI_MONTHLY',
              'FEDFUNDRATE',
              'GLOBAL-DEBT-GDP']

In [None]:
# load excel workbook containing multiple sheets
dgea_original_timeseries = pd.read_excel('https://s3.ap-southeast-2.wasabisys.com/data5000/DGEA_ECONOMICS_DATASET2023v1.xlsx', sheet_name=dgea_sheets)

In [None]:
dgea_original_timeseries['AUS_CPI_MONTHLY']

# Apply Generative Model

In [None]:
max_time = len(dgea_original_timeseries['AUS_CPI_MONTHLY'])
max_time

In [None]:
# we create 1 time series with max time and only a single feature
n, n_ts, n_features  = 1, max_time, 1


data = tsgm.utils.gen_sine_dataset(n, n_ts, n_features)
data

In [None]:
data.shape

In [None]:
scaler = tsgm.utils.TSFeatureWiseScaler()
scaled_data = scaler.fit_transform(data)
scaled_data

In [None]:
scaled_data.shape

In [None]:
# specify generative architecture
architecture = tsgm.models.zoo["vae_conv5"](n_ts, n_features, 10)

# encoder-decoder
encoder, decoder = architecture.encoder, architecture.decoder

# VAE
vae = tsgm.models.cvae.BetaVAE(encoder, decoder)

# optimiser
vae.compile(optimizer=keras.optimizers.Adam())

In [None]:
# fit on our scaled toy sine wave data
# NOTE(review): the scaler for this data was built without an explicit range, so the
# "[-1,1]" range previously stated here is not established by the code — confirm.
vae.fit(scaled_data, epochs=10, batch_size=64)

In [None]:
# generate samples
vae_samples = vae.generate(10)
vae_samples

# Using VAE for Australian CPI (Monthly)

In [None]:
auscpi_month = dgea_original_timeseries['AUS_CPI_MONTHLY']
auscpi_month

In [None]:
auscpi_month['AUS_CPI_MONTHLY']

In [None]:
# we create 1 time series with max time and only a single feature
n, n_ts, n_features  = 1, max_time, 1

## How to reshape tensors and arrays

In [None]:
import pandas as pd
import numpy as np

# A five-element Series to experiment with
data = pd.Series([1, 2, 3, 4, 5])

# Pull out the underlying NumPy array
numpy_array = data.to_numpy()

# Add a leading and a trailing axis of length 1: shape (5,) -> (1, 5, 1)
numpy_array_reshaped = numpy_array[np.newaxis, :, np.newaxis]

print(numpy_array_reshaped)

In [None]:
cpi_array = auscpi_month['AUS_CPI_MONTHLY'].values
cpi_array

In [None]:
len(cpi_array)

In [None]:
# Reshape the NumPy array
cpi_array_reshaped = cpi_array.reshape(n, n_ts, n_features)
cpi_array_reshaped

In [None]:
cpi_scaler = tsgm.utils.TSFeatureWiseScaler((-1,1))

cpi_scaled_data = cpi_scaler.fit_transform(cpi_array_reshaped)

cpi_scaled_data

In [None]:
# specify generative architecture
architecture = tsgm.models.zoo["vae_conv5"](n_ts, n_features, 10)

# encoder-decoder
encoder, decoder = architecture.encoder, architecture.decoder

# VAE
vae = tsgm.models.cvae.BetaVAE(encoder, decoder)

# optimiser
vae.compile(optimizer=keras.optimizers.Adam())

In [None]:
# fit on our data
vae.fit(cpi_scaled_data, epochs=10, batch_size=64)

In [None]:
# generate samples
vae_samples = vae.generate(1)
vae_samples

# Missing Values Strategies

In [None]:
dgea_sheets = ['AUS_CPI_QUARTER',
              'AUS_CPI_MONTHLY',
              'AUS_MONEY',
              'AUS_CASHRATE',
              'AUS_UNEMPLOY_PARTICPATION_RATE',
              'AUS_EMPLOY_TOPOP',
              'REALGDP',
              'AUS_GNI',
              'GOLDYAHOO',
              'CRUDEOIL',
              'IRONORE',
              'SILVER',
              'COPPER',
              'USDEBT-MARKETVALUE',
              'US-TREASURY-DEBT',
              'USMONEYCIRCULATE',
              'USD_BROADMONEY_CIRCULATION',
              'US_CPI_MONTHLY',
              'FEDFUNDRATE',
              'GLOBAL-DEBT-GDP']

In [None]:
# load excel workbook containing multiple sheets
dgea_original_timeseries = pd.read_excel('https://s3.ap-southeast-2.wasabisys.com/data5000/DGEA_ECONOMICS_DATASET2023v1.xlsx', sheet_name=dgea_sheets)

In [None]:
dgea_timeseries_file = 'https://s3.ap-southeast-2.wasabisys.com/data5000/dgea_economic_timeseries_all_dates.csv'

In [None]:
# lets look at our merged dataset again
dgea = pd.read_csv(dgea_timeseries_file)
dgea

In [None]:
dgea.columns

In [None]:
# set datetime index
# The read_csv call above does no date parsing, so 'Datetime' is converted here
# explicitly; the index is then a real DatetimeIndex rather than plain text, which
# is what date-window slicing on this frame relies on.
dgea = dgea.assign(Datetime=pd.to_datetime(dgea['Datetime'])).set_index('Datetime')
dgea

# Remember what the original time series frequency was?

## For example, AUS_CPI_MONTHLY: we only have data going back to 2019-08-01. That means that we have only 49 observations.

## So what do we do?

In [None]:
# original
dgea_original_timeseries['AUS_CPI_MONTHLY']

## However, when we combine with all the other time series, we can see that the earliest datetime (for another time series) reaches all the way back to 1954! What do we do in this instance?

In [None]:
dgea['AUS_CPI_MONTHLY']