# Data Preprocessing For Nowcasting Thai GDP

This notebook details the steps involved in preprocessing the collected data. The objective of this stage is to transform the raw datasets obtained from the `dataset-collection-thai-gdp-nowcasting.ipynb` notebook into a clean, consistent format suitable for subsequent modeling. Full details of the preprocessing steps can be found in the data chapter of the accompanying thesis.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np

In [2]:
# Read the two input CSV files from the notebook's working directory
bot_set_df = pd.read_csv(filepath_or_buffer="./set_bot_dataset_final.csv")
gdp_df = pd.read_csv(filepath_or_buffer="./thai-gdp-quarterly-2011Q1-2019Q4.csv")

## Create Pseudo Real-Time Dataset

In [3]:
# Define helper function to simulate a real-time dataset
def get_real_time_dataset(df, cutoff_date="2020-01-31"):
    """
    Simulate a real-time dataset by removing values whose release date is after a given cutoff.

    Parameters
    ----------
    df (pd.DataFrame): Input dataframe with 'release_date', 'date' and 'value' columns.
        'release_date' and 'date' must be parseable by pd.to_datetime.
    cutoff_date (str or datetime): Last date on which a release still counts as known.
        Values released on the cutoff date itself are kept.

    Returns
    -------
    pd.DataFrame: Copy of the input with 'release_date' and 'date' converted to datetime
    and a new 'value_real_time' column that equals 'value' where the release date is on or
    before the cutoff and NaN where it is later. Rows whose release date is missing (NaT)
    do not compare as later than the cutoff and therefore keep their value.

    Raises
    ------
    KeyError: If one of the required columns is missing.
    """
    # Fail early with a readable message instead of a bare KeyError deep in the function
    required_columns = ('release_date', 'date', 'value')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input dataframe is missing required column(s): {missing_columns}")

    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    # Convert to datetime
    df['release_date'] = pd.to_datetime(df['release_date'])
    df['date'] = pd.to_datetime(df['date'])

    # Censor future data (simulate only knowing data available as of cutoff_date).
    # Series.mask returns a new, safely upcast series, so an integer 'value' column
    # is handled without assigning NaN into an integer column in place.
    released_after_cutoff = df['release_date'] > pd.to_datetime(cutoff_date)
    df['value_real_time'] = df['value'].mask(released_after_cutoff)

    return df

In [4]:
# Build the pseudo real-time snapshot: values released after the cutoff date become NaN
rt_df = get_real_time_dataset(bot_set_df, cutoff_date="2020-01-31")

In [5]:
# Verification: for every series, count the observations censored by the real-time
# cutoff, i.e. the NaNs in 'value_real_time' (not in the raw 'value' column)
for series in rt_df['series_name'].unique():
    df_series = rt_df[rt_df['series_name'] == series]
    missing_values_count = df_series['value_real_time'].isna().sum()
    print(f"Series: {series} | Missing Values in 'value_real_time': {missing_values_count}")

Series: Total Government Expenditure | Missing Values in 'value': 0
Series: Import Volume Index (exclude Gold) | Missing Values in 'value': 0
Series: Export Volume Index (exclude Gold) | Missing Values in 'value': 0
Series: Private Consumption Index (Seasonally Adjusted)  | Missing Values in 'value': 0
Series: Private Investment Index (PII) (Seasonally Adjusted) | Missing Values in 'value': 0
Series: Business Sentiment Index of Investment | Missing Values in 'value': 0
Series: Nominal Effective Exchange Rate (NEER) | Missing Values in 'value': 1
Series: Real Effective Exchange Rate (REER) | Missing Values in 'value': 2
Series: Export Value Index (THB) | Missing Values in 'value': 0
Series: Import Value Index (THB) | Missing Values in 'value': 0
Series: Retail Sales Index | Missing Values in 'value': 1
Series: Retail Sales Index Durable Goods | Missing Values in 'value': 1
Series: Wholesales Index Durable Goods | Missing Values in 'value': 1
Series: Other Business Sentiment Indices Expo

## Seasonal Adjustment

In [6]:
# Import X13 ARIMA adjustment method
from statsmodels.tsa.x13 import x13_arima_analysis
# Import regex
import re

# Define helper function to check seasonally adjusted label
def is_seasonally_adjusted(series_name):
    """
    Determines whether a series is already seasonally adjusted based on its name.

    The check looks at the end of the name, ignoring case, surrounding whitespace and
    trailing punctuation (spaces, '.', ')' and '-'). A name counts as adjusted when it
    ends with 'seasonally adjusted' or with the standalone token 'sa' (for example
    'Retail Sales, SA' or 'GDP (sa)'). Names that merely end in the letters 'sa'
    ('Visa', '... NSA') and explicit negations ('not seasonally adjusted',
    'non-seasonally adjusted') are reported as not adjusted.

    Parameters
    ----------
    series_name : str
        The name of the time series to check.

    Returns
    -------
    bool
        True if the series appears to be seasonally adjusted, False otherwise.
    """
    name = series_name.lower().strip()
    name = re.sub(r'[\s\.\)\-]+$', '', name)  # Remove trailing punctuation/spaces

    # An explicit negation also ends with 'seasonally adjusted', so rule it out first
    if re.search(r'\b(?:not|non)[\s\-]*seasonally adjusted$', name):
        return False

    if name.endswith('seasonally adjusted'):
        return True

    # 'sa' only counts when it is not the tail of a longer word or acronym:
    # the character before it must not be a letter or a digit
    return re.search(r'(?<![a-z0-9])sa$', name) is not None

# Define function to apply X-13 ARIMA seasonal adjustment
def apply_x13(series, x12path='/usr/local/bin/x13as', freq='M', outlier=True):
    """
    Apply the X-13 ARIMA seasonal adjustment to a time series, dropping NaNs temporarily.
    Returns the adjusted series aligned with the original index (including NaNs).
    
    Parameters
    ----------
    series (pd.Series): The time series data (should be in pandas Series format with DateTime index).
    x12path (str or None): Location of the X-13 binary, forwarded to x13_arima_analysis.
        Defaults to the path this notebook was developed with. Per the statsmodels
        documentation, passing None makes statsmodels look for the binary on the PATH
        and in the X13PATH / X12PATH environment variables instead.
    freq (str): Frequency argument forwarded to x13_arima_analysis (default: 'M').
    outlier (bool): Outlier-detection flag forwarded to x13_arima_analysis (default: True).
    
    Returns
    -------
    pd.Series: The seasonally adjusted time series with NaNs preserved.
    If X-13 fails, the original series is returned as a fallback.
    """
    try:
        # Save the original index
        original_index = series.index

        # Drop NaNs for adjustment
        series_clean = series.dropna()

        # Apply X-13 ARIMA seasonal adjustment
        res = x13_arima_analysis(series_clean, freq=freq, outlier=outlier,
                                 x12path=x12path)

        # Extract adjusted values
        seasadj = res.seasadj

        # Create full series with original index (NaNs preserved):
        # positions that were dropped above stay NaN in the result
        adjusted_series = pd.Series(index=original_index, dtype='float64')
        adjusted_series.loc[series_clean.index] = seasadj

        return adjusted_series

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back the unadjusted input
        print(f"Error applying X-13 ARIMA adjustment: {e}")
        return series  # Return unadjusted if adjustment fails

In [7]:
# Define function for seasonal adjustment
def seasonal_adjust(df, value_col='value_real_time', date_col='date', series_col='series_name'):
    """
    Seasonally adjust every series of a long-format DataFrame.

    Series whose name already marks them as seasonally adjusted (as judged by
    is_seasonally_adjusted) are passed through unchanged; all others are sent
    through apply_x13.

    Parameters
    -------
    df (pd.DataFrame): DataFrame with at least columns: [date_col, value_col, series_col]
    value_col (str): Column name with the values to adjust (default: 'value_real_time')
    date_col (str): Column name with the date (default: 'date')
    series_col (str): Column name with the series name (default: 'series_name')

    Returns
    -------
    pd.DataFrame: All rows of the input, sorted by date_col with a fresh integer index,
    plus a new 'sea_adj' column containing the adjusted (or passed-through) values.
    """
    per_series_frames = []

    for series_label in df[series_col].unique():
        # Independent copy of the rows belonging to this series
        subset = df.loc[df[series_col] == series_label].copy()

        if is_seasonally_adjusted(series_label):
            print(f"[SKIPPING] Already adjusted: {series_label}")
            # Nothing to do: the published values are used as they are
            subset['sea_adj'] = subset[value_col]
        else:
            print(f"[ADJUSTING] {series_label}")
            # apply_x13 works on a date-indexed series; its result is written back
            # positionally, so row order within the subset must not change in between
            dated_values = subset.set_index(date_col)[value_col]
            subset['sea_adj'] = apply_x13(dated_values).values

        per_series_frames.append(subset)

    combined = pd.concat(per_series_frames)
    return combined.sort_values(by=date_col).reset_index(drop=True)

In [8]:
# Seasonally adjust the real-time values of every series (result lands in 'sea_adj')
seasonally_adjusted_df = seasonal_adjust(
    rt_df,
    value_col='value_real_time',
    date_col='date',
    series_col='series_name',
)

[ADJUSTING] Total Government Expenditure
[ADJUSTING] Import Volume Index (exclude Gold)
[ADJUSTING] Export Volume Index (exclude Gold)
[SKIPPING] Already adjusted: Private Consumption Index (Seasonally Adjusted) 
[SKIPPING] Already adjusted: Private Investment Index (PII) (Seasonally Adjusted)
[ADJUSTING] Business Sentiment Index of Investment


          in the estimated spectrum of the regARIMA residuals.


[ADJUSTING] Nominal Effective Exchange Rate (NEER)
[ADJUSTING] Real Effective Exchange Rate (REER)
[ADJUSTING] Export Value Index (THB)


          been found in the estimated spectrum of the regARIMA residuals.
          been found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.
          been found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.


[ADJUSTING] Import Value Index (THB)


          found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.
          found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.


[ADJUSTING] Retail Sales Index
[ADJUSTING] Retail Sales Index Durable Goods


          found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.


[ADJUSTING] Wholesales Index Durable Goods


          found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.
          found in one or more of the estimated spectra.


[ADJUSTING] Other Business Sentiment Indices Export conditions
[ADJUSTING] Service Production Index Wholesale and retail trade
[ADJUSTING] Service Production Index Real estate, renting and business activities
[ADJUSTING] SET Index


          been found in the estimated spectrum of the regARIMA residuals.
  
          found in one or more of the estimated spectra.


## Transformation and Stationarity

In [9]:
# Import the Augmented Dickey-Fuller (ADF) test
from statsmodels.tsa.stattools import adfuller

# Define helper function for applying stationarity transformation
def apply_stationarity_transformation(df, series_column='series_name', value_column='sea_adj'):
    """
    Difference each series until the Augmented Dickey-Fuller (ADF) test accepts it.

    For every series the ADF test is run on the non-missing values. A p-value above
    0.05 is treated as non-stationary: the series is differenced once and tested
    again; if the p-value is still above 0.05 it is differenced a second time
    (the twice-differenced series is not tested again).

    Parameters
    ----------
    df (pd.DataFrame): Input DataFrame with time series data. Must also contain a
        'date' column, which is used to sort the result.
    series_column (str): The name of the column containing the series names.
    value_column (str): The name of the column containing the time series values to be tested.

    Returns
    -------
    pd.DataFrame: The input rows sorted by 'date', with two new columns:
    'stationarity_transform' ('none', 'diff1' or 'diff2') and 'stat_value'
    (the transformed values; NaN where the input was NaN or where differencing
    has no predecessor).
    """
    processed_frames = []

    for series in df[series_column].unique():
        # Independent copy of this series' rows
        series_frame = df.loc[df[series_column] == series].copy()

        # The ADF test cannot deal with gaps, so only observed values are used
        observed = series_frame[value_column].dropna()

        # Start from "already stationary" and escalate only when the test says otherwise
        transform = 'none'
        transformed = observed
        if adfuller(observed)[1] > 0.05:
            transform = 'diff1'
            transformed = observed.diff()
            if adfuller(transformed.dropna())[1] > 0.05:
                transform = 'diff2'
                transformed = transformed.diff()

        # Back onto the row index of this series: rows that had no observed value
        # reappear as NaN
        aligned = transformed.astype('float64').reindex(series_frame.index)

        series_frame['stationarity_transform'] = transform
        print(f"[ADF] {series}: Transformation applied = {transform}")
        series_frame['stat_value'] = aligned.values

        processed_frames.append(series_frame)

    # Stack all series and order the rows chronologically
    return pd.concat(processed_frames).sort_values(by='date')

In [10]:
# Test each seasonally adjusted series for stationarity and difference it where needed
stationary_df = apply_stationarity_transformation(
    seasonally_adjusted_df,
    series_column='series_name',
    value_column='sea_adj',
)

[ADF] Total Government Expenditure: Transformation applied = none
[ADF] Nominal Effective Exchange Rate (NEER): Transformation applied = diff1
[ADF] Retail Sales Index Durable Goods: Transformation applied = none
[ADF] Service Production Index Real estate, renting and business activities: Transformation applied = diff1
[ADF] Export Volume Index (exclude Gold): Transformation applied = none
[ADF] Retail Sales Index: Transformation applied = diff1
[ADF] Real Effective Exchange Rate (REER): Transformation applied = diff1
[ADF] Import Volume Index (exclude Gold): Transformation applied = none
[ADF] SET Index: Transformation applied = diff1
[ADF] Business Sentiment Index of Investment: Transformation applied = none
[ADF] Service Production Index Wholesale and retail trade: Transformation applied = diff1
[ADF] Private Consumption Index (Seasonally Adjusted) : Transformation applied = diff1
[ADF] Import Value Index (THB): Transformation applied = diff1
[ADF] Export Value Index (THB): Transfor

In [11]:
# Verification: for every series in stationary_df, count the NaNs in 'stat_value'
for series in stationary_df['series_name'].unique():
    belongs_to_series = stationary_df['series_name'] == series
    df_series = stationary_df.loc[belongs_to_series].copy()
    missing_values_count = df_series['stat_value'].isna().sum()
    print(f"Series: {series} | Missing Values in 'stat_value': {missing_values_count}")

Series: Total Government Expenditure | Missing Values in 'stat_value': 0
Series: Real Effective Exchange Rate (REER) | Missing Values in 'stat_value': 3
Series: Private Consumption Index (Seasonally Adjusted)  | Missing Values in 'stat_value': 1
Series: Other Business Sentiment Indices Export conditions | Missing Values in 'stat_value': 0
Series: Retail Sales Index Durable Goods | Missing Values in 'stat_value': 1
Series: Service Production Index Wholesale and retail trade | Missing Values in 'stat_value': 1
Series: Import Volume Index (exclude Gold) | Missing Values in 'stat_value': 0
Series: Nominal Effective Exchange Rate (NEER) | Missing Values in 'stat_value': 2
Series: Wholesales Index Durable Goods | Missing Values in 'stat_value': 2
Series: Retail Sales Index | Missing Values in 'stat_value': 2
Series: Private Investment Index (PII) (Seasonally Adjusted) | Missing Values in 'stat_value': 1
Series: Service Production Index Real estate, renting and business activities | Missing V

### Handle Missing Values

In [12]:
# Define helper function for imputing missing values with backward fill
def impute_leading_with_bfill(df, group_column='series_name', value_column='stat_value'):
    """
    Apply backward fill to leading NaNs in each time series group.

    Only the run of NaNs before the first observed value of a group is filled
    (with that first observed value). NaNs in the middle of a series and NaNs at
    the end are left untouched. A group with no observed value stays all-NaN.

    Parameters
    ----------
    df (pd.DataFrame): DataFrame with time series. Rows of each group are expected
        to be in chronological order, since "leading" is judged by row order.
    group_column (str): Column identifying different time series.
    value_column (str): Column with the values to be imputed.

    Returns
    -------
    pd.DataFrame: Copy of the input with a new 'imputed_val' column in which
    leading NaNs are backfilled.
    """
    def _fill_leading(values):
        # The running maximum of the "is observed" mask turns True at the first
        # observed value and stays True, so it is False exactly on the leading NaNs
        series_has_started = values.notna().cummax()
        return values.where(series_has_started, values.bfill())

    # Make a copy to avoid modifying the original DataFrame
    df = df.copy()
    # Fill leading missing values within each group and store in a new column
    df['imputed_val'] = df.groupby(group_column)[value_column].transform(_fill_leading)

    return df

In [13]:
# Backward-fill the missing value(s) at the start of each stationary series
bfilled_df = impute_leading_with_bfill(
    stationary_df,
    group_column='series_name',
    value_column='stat_value',
)

## Handle Outliers

In [14]:
# Define helper function for outlier detection
def detect_outliers(df, group_col='series_name', value_col='sea_adj'):
    """
    Flag outliers per series with a conservative 3*IQR fence.

    Within each group, a value is flagged when it lies below Q1 - 3*IQR or above
    Q3 + 3*IQR, where the quartiles are computed from the values in value_col of
    that group. Missing values are never flagged. Values are only flagged, not
    modified.

    Parameters
    ----------
    df (pd.DataFrame): DataFrame holding one or more time series in long format.
    group_col (str): Column indicating separate time series (e.g., 'series_name').
    value_col (str): Column with the values to check for outliers (default: 'sea_adj').

    Returns
    -------
    pd.DataFrame: The input rows, ordered group by group (in sorted group order)
    with a fresh integer index, plus a boolean 'outlier_flag' column.
    """
    flagged_parts = []

    for label, chunk in df.groupby(group_col):
        flagged = chunk.copy()
        values = flagged[value_col]

        # Fences sit three interquartile ranges outside the quartiles
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        too_low = values < first_quartile - 3 * spread
        too_high = values > third_quartile + 3 * spread
        is_outlier = too_low | too_high

        # Only series with at least one hit are reported
        hits = is_outlier.sum()
        if hits > 0:
            print(f"[OUTLIERS] {label}: {hits} outliers detected")

        flagged['outlier_flag'] = is_outlier
        flagged_parts.append(flagged)

    return pd.concat(flagged_parts).reset_index(drop=True)

In [15]:
# Flag outliers in the backfilled stationary values of each series
flagged_df = detect_outliers(bfilled_df, group_col='series_name', value_col='imputed_val')

[OUTLIERS] Export Volume Index (exclude Gold): 1 outliers detected
[OUTLIERS] Nominal Effective Exchange Rate (NEER): 2 outliers detected
[OUTLIERS] Private Consumption Index (Seasonally Adjusted) : 1 outliers detected
[OUTLIERS] Private Investment Index (PII) (Seasonally Adjusted): 2 outliers detected
[OUTLIERS] Real Effective Exchange Rate (REER): 2 outliers detected
[OUTLIERS] Retail Sales Index: 1 outliers detected
[OUTLIERS] Retail Sales Index Durable Goods: 2 outliers detected
[OUTLIERS] Service Production Index Real estate, renting and business activities: 6 outliers detected
[OUTLIERS] Service Production Index Wholesale and retail trade: 1 outliers detected


In [16]:
# Replace outliers with NaN for interpolation
outlier_handled_df = flagged_df.copy()
outlier_handled_df['cleaned_val'] = outlier_handled_df['imputed_val']\
.where(~outlier_handled_df['outlier_flag'], np.nan)

# Re-estimate the removed outliers per series:
#  - limit_area='inside' restricts linear interpolation to gaps that have observed
#    values on both sides;
#  - the following bfill covers a gap at the very start of a series with the first
#    observed value.
# NaNs at the END of a series are deliberately left untouched. They are the ragged
# edge produced by the real-time cutoff and are forecast by the Kalman step further
# down; interpolating with limit_direction='both' would overwrite them with the
# last observed value and leave that step nothing to do. A flagged outlier in the
# last observed position consequently also becomes part of that tail.
outlier_handled_df['outlier_corrected'] = (
    outlier_handled_df
    .groupby('series_name')['cleaned_val']
    .transform(lambda group: group.interpolate(method='linear', limit_area='inside').bfill())
)

## Handle Ragged Data

In [17]:
# Import Kalman filter
from pykalman import KalmanFilter

# Define helper function to forecast and fill trailing missing values using a Kalman Filter
def forecast_tail_with_kalman(series, series_name=None):
    """
    Replace the run of NaNs at the end of a series with Kalman-filter forecasts.

    The part of the series before the trailing NaNs is used to fit a KalmanFilter
    with 10 EM iterations. The last filtered state is then pushed through the
    filter's transition matrix once per missing period, and the resulting values
    fill the tail. NaNs that are not part of the trailing run are not touched.

    Parameters
    ----------
    series (pd.Series): Time series with potential missing values at the end.
    series_name (str, optional): Name of the time series, used for logging purposes.

    Returns
    -------
    pd.Series: Float series with tail-end NaNs replaced by forecasts. The series is
    returned unchanged (apart from the float cast) when there are no trailing NaNs
    or when fewer than two values precede them.
    """
    # The filter works on floating point data
    series = series.astype(float)

    # Read from the end, the cumulative product of the NaN mask is 1 only while we
    # are still inside the trailing run of NaNs, so its sum is the length of that run
    tail_nan_count = series[::-1].isna().cumprod().sum()
    if tail_nan_count == 0:
        return series

    label = "Unnamed series" if series_name is None else series_name
    print(f"[KALMAN] {label}: Forecasting {tail_nan_count} tail-end missing values...")

    # Everything before the trailing NaNs is the estimation sample
    observed = series[:-tail_nan_count]
    if len(observed) < 2:
        # Too little data to estimate anything: leave the tail missing
        return series

    # Fit with EM, then run the filter to obtain the state at the last observation
    kf = KalmanFilter(initial_state_mean=observed.iloc[0], n_dim_obs=1)
    kf = kf.em(observed.values, n_iter=10)
    current_state = kf.filter(observed.values)[0][-1]

    # Roll the state forward one period at a time
    forecasts = []
    for _ in range(tail_nan_count):
        current_state = kf.transition_matrices @ current_state
        forecasts.append(current_state[0])

    completed = series.copy()
    completed.iloc[-tail_nan_count:] = forecasts

    return completed

# Define  helper function for applying Kalman Filter
def apply_kalman_tail_forecast(df, group_column='series_name', 
                               input_column='outlier_corrected'):
    """
    Run forecast_tail_with_kalman on every time series group of a DataFrame.

    Parameters
    ----------
    df (pd.DataFrame): Long-format DataFrame holding the series to complete.
    group_column (str): Column identifying different time series.
    input_column (str): Column containing the values to apply Kalman forecasting on.

    Returns
    -------
    pd.DataFrame: Copy of the input with a new column 'final_val' that holds, for
    each group, the values of input_column with trailing NaNs forecast.
    """
    result = df.copy()

    # Start from an all-missing column and fill it one series at a time
    result['final_val'] = np.nan

    for group_name, group_rows in result.groupby(group_column):
        completed = forecast_tail_with_kalman(group_rows[input_column],
                                              series_name=group_name)
        # Written back by index label, so the row index must identify rows uniquely
        result.loc[group_rows.index, 'final_val'] = completed

    return result

In [18]:
# Forecast the trailing missing values of each outlier-corrected series (result in 'final_val')
imputed_df = apply_kalman_tail_forecast(
    outlier_handled_df,
    group_column='series_name',
    input_column='outlier_corrected',
)

In [19]:
# Subset imputed_df to the identifier columns plus the preprocessed values.
# 'final_val' is the end product of the pipeline above; it is exported under the
# name 'value', the same naming the quarterly dataset uses. (The raw 'value' column
# of imputed_df is the unprocessed input and must not be exported here.)
monthly_df_final = (
    imputed_df[['series_code', 'series_name', 'date', 'final_val']]
    .rename(columns={'final_val': 'value'})
)

# Export preprocessed monthly dataset 
# Uncomment if needed
#monthly_df_final.to_csv('preprocessed-dataset-nowcasting-thai-gdp-monthly-jan.csv', 
#                        index=False)

## Get Preprocessed Quarterly Data

In [20]:
# Work on a copy of imputed_df, tagged with the calendar quarter of every observation
monthly_df = imputed_df.copy()
monthly_df['quarter'] = monthly_df['date'].dt.to_period('Q')

# One output row per series and quarter
quarter_keys = ['series_code', 'series_name', 'quarter']
is_flow = monthly_df['data_type'] == 'Flow'
is_index = monthly_df['data_type'] == 'Index'

# Flow series: quarterly value is the sum of the monthly values
flows = (
    monthly_df.loc[is_flow]
    .groupby(quarter_keys, as_index=False)['final_val']
    .sum()
    .rename(columns={'final_val': 'value'})
)

# Index series: quarterly value is the mean of the monthly values
indices = (
    monthly_df.loc[is_index]
    .groupby(quarter_keys, as_index=False)['final_val']
    .mean()
    .rename(columns={'final_val': 'value'})
)

# Stack both parts; rows whose data_type is neither 'Flow' nor 'Index' are not included
quarterly_df_final = pd.concat([flows, indices], ignore_index=True)

In [21]:
# Export preprocessed dataset
# Uncomment if needed
#quarterly_df_final.to_csv('preprocessed-dataset-nowcasting-thai-gdp-quarterly-jan.csv', 
#                       index=False)