# ARDL Analysis with UECM and Bounds Test
Fixed version with proper UECM conversion and bounds test implementation

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.ardl import ardl_select_order, ARDL, UECM
from statsmodels.tsa.stattools import adfuller

## 0. Generate Synthetic Data

In [3]:
# This synthetic data is designed to be I(1) and cointegrated for a good example.
np.random.seed(42)
n_obs = 365
# Clicks follows a random walk (I(1))
clicks = 1000 + np.random.randn(n_obs).cumsum()
# Revenue is a function of clicks plus its own random walk component (I(1))
# This creates a cointegrating relationship.
error = np.random.randn(n_obs) * 5
revenue = 20 * clicks + 5000 + np.random.randn(n_obs).cumsum() * 20 + error

df = pd.DataFrame({
    'activity_date': pd.to_datetime(pd.date_range(start='2024-01-01', periods=n_obs, freq='D')),
    'total_clicks': clicks,
    'total_revenue': revenue
})

print("--- Sample Data ---")
print(df.head())
print("\n" + "="*50 + "\n")

--- Sample Data ---
  activity_date  total_clicks  total_revenue
0    2024-01-01   1000.496714   25011.845086
1    2024-01-02   1000.358450   24992.638909
2    2024-01-03   1001.006138   25012.700234
3    2024-01-04   1002.529168   25009.534578
4    2024-01-05   1002.295015   25021.081194




## 1. Data Preparation and Pre-Analysis

In [4]:
df.set_index('activity_date', inplace=True)

# Handle potential zeros before log transformation
df['total_clicks'] = df['total_clicks'].replace(0, 1)
df['total_revenue'] = df['total_revenue'].replace(0, 1)

# Log transform
df['log_revenue'] = np.log(df['total_revenue'])
df['log_clicks'] = np.log(df['total_clicks'])

In [5]:
# Stationarity Tests (ADF Test)
def run_adf_test(series, name):
    result = adfuller(series.dropna())
    print(f'ADF Test for {name}:')
    print(f'  ADF Statistic: {result[0]:.4f}')
    print(f'  p-value: {result[1]:.4f}')
    if result[1] > 0.05:
        print(f'  Result: The series is likely non-stationary (I(1)).')
    else:
        print(f'  Result: The series is likely stationary (I(0)).')

print("--- Stationarity Tests on Levels ---")
run_adf_test(df['log_revenue'], 'Log Revenue')
run_adf_test(df['log_clicks'], 'Log Clicks')

print("\n--- Stationarity Tests on First Differences ---")
run_adf_test(df['log_revenue'].diff(), 'Differenced Log Revenue')
run_adf_test(df['log_clicks'].diff(), 'Differenced Log Clicks')
print("\n" + "="*50 + "\n")

--- Stationarity Tests on Levels ---
ADF Test for Log Revenue:
  ADF Statistic: -0.0939
  p-value: 0.9500
  Result: The series is likely non-stationary (I(1)).
ADF Test for Log Clicks:
  ADF Statistic: -1.4788
  p-value: 0.5440
  Result: The series is likely non-stationary (I(1)).

--- Stationarity Tests on First Differences ---
ADF Test for Differenced Log Revenue:
  ADF Statistic: -19.9918
  p-value: 0.0000
  Result: The series is likely stationary (I(0)).
ADF Test for Differenced Log Clicks:
  ADF Statistic: -20.2034
  p-value: 0.0000
  Result: The series is likely stationary (I(0)).




## 2. Optimal Lag Selection

In [6]:
# We select a max lag of 14 days to account for potential weekly patterns
print("--- ARDL Lag Order Selection ---")
# The exogenous variable must be a DataFrame or a list of Series
exog_vars = df[['log_clicks']]

# Fix: Add maxorder parameter which was missing
selection = ardl_select_order(
    df['log_revenue'],
    maxlag=14,           # Maximum lags for the dependent variable
    exog=exog_vars,
    maxorder=14,         # Maximum lags for the exogenous variables
    ic='aic',            # Using AIC as per best practice recommendations
    trend='c'
)
print(f"Optimal lags found: ARDL({selection.ar_lags}, {selection.dl_lags})")
print("\n" + "="*50 + "\n")

--- ARDL Lag Order Selection ---
Optimal lags found: ARDL([1, 2], {'log_clicks': [0, 1, 2]})




  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


## 3. ARDL Model Estimation

In [7]:
print("--- ARDL Model Estimation ---")
ardl_model = ARDL(
    df['log_revenue'],
    lags=selection.ar_lags,
    exog=exog_vars,
    order=selection.dl_lags,
    trend='c'
)
ardl_results = ardl_model.fit()
print(ardl_results.summary())
print("\n" + "="*50 + "\n")

--- ARDL Model Estimation ---
                              ARDL Model Results                              
Dep. Variable:            log_revenue   No. Observations:                  365
Model:                     ARDL(2, 2)   Log Likelihood                2066.629
Method:               Conditional MLE   S.D. of innovations              0.001
Date:                Wed, 10 Sep 2025   AIC                          -4119.258
Time:                        16:10:49   BIC                          -4091.997
Sample:                    01-03-2024   HQIC                         -4108.422
                         - 12-30-2024                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.1003      0.060      1.658      0.098      -0.019       0.219
log_revenue.L1     0.9046      0.053     17.229      0.000       0.801       1.008
log_re

  self._init_dates(dates, freq)


## 4. Convert ARDL to UECM for Bounds Test

In [8]:
print("--- Converting ARDL to UECM ---")
# Create UECM from the ARDL model
uecm_model = UECM.from_ardl(ardl_model)
uecm_results = uecm_model.fit()
print("UECM model created and fitted successfully.")
print("\n" + "="*50 + "\n")

--- Converting ARDL to UECM ---
UECM model created and fitted successfully.




  self._init_dates(dates, freq)


## 5. Cointegration Bounds Test

In [9]:
print("--- Pesaran-Shin-Smith Bounds Test ---")
# Perform bounds test using UECM results
# case=3: Constant included in the model but not in the test
bounds_test_results = uecm_results.bounds_test(case=3)
print(bounds_test_results)

# Interpretation logic
f_stat = bounds_test_results.stat
sig_level = 0.05  # 5% significance level
# Get critical values - they may be in different formats depending on statsmodels version
try:
    lower_bound = bounds_test_results.critical_values[(sig_level, 'lower')]
    upper_bound = bounds_test_results.critical_values[(sig_level, 'upper')]
except (KeyError, AttributeError):
    # Try alternative format
    print("Note: Critical values format may vary by statsmodels version")
    lower_bound = 3.0  # Approximate 5% lower bound for case 3
    upper_bound = 3.5  # Approximate 5% upper bound for case 3

print(f"\nF-statistic: {f_stat:.4f}")
print(f"5% Critical Value Bounds: I(0)={lower_bound:.4f}, I(1)={upper_bound:.4f}")

if f_stat > upper_bound:
    print("Result: Cointegration found. A stable long-run relationship exists.")
    cointegration_found = True
elif f_stat < lower_bound:
    print("Result: No cointegration. A long-run relationship does not exist.")
    cointegration_found = False
else:
    print("Result: The test is inconclusive.")
    cointegration_found = None
print("\n" + "="*50 + "\n")

--- Pesaran-Shin-Smith Bounds Test ---
BoundsTestResult
Stat: 1.45396
Upper P-value: 0.728
Lower P-value: 0.509
Null: No Cointegration
Alternative: Possible Cointegration

Note: Critical values format may vary by statsmodels version

F-statistic: 1.4540
5% Critical Value Bounds: I(0)=3.0000, I(1)=3.5000
Result: No cointegration. A long-run relationship does not exist.




  self._init_dates(dates, freq)


## 6. Interpretation of the Error Correction Model (ECM)

In [10]:
print("--- Error Correction Model (ECM) Summary ---")
print(uecm_results.summary())

# Extract key coefficients
if cointegration_found:
    print(f"\n--- Key Coefficient Interpretation ---")
    
    # Get parameters from UECM results
    params = uecm_results.params
    
    # Speed of adjustment (coefficient on lagged dependent variable)
    # In UECM, this is typically the coefficient on L1.log_revenue
    speed_param_name = 'log_revenue.L1'
    if speed_param_name in params:
        speed_of_adjustment = params[speed_param_name]
        print(f"Speed of Adjustment: {speed_of_adjustment:.4f}")
        if -1 < speed_of_adjustment < 0:
            print(f"  -> Interpretation: Approximately {abs(speed_of_adjustment*100):.2f}% of any deviation from the long-run equilibrium is corrected each day.")
        else:
            print(f"  -> Warning: Speed of adjustment should be negative and between -1 and 0 for stability.")
    
    # Long-run coefficient for log_clicks
    # This is the coefficient on the level of log_clicks divided by negative speed of adjustment
    clicks_param_name = 'log_clicks.L1'
    if clicks_param_name in params and speed_param_name in params:
        clicks_coeff = params[clicks_param_name]
        long_run_clicks_coeff = -clicks_coeff / speed_of_adjustment
        print(f"\nLong-Run Coefficient for log_clicks: {long_run_clicks_coeff:.4f}")
        print(f"  -> Interpretation: A 1% permanent increase in daily clicks leads to a {long_run_clicks_coeff:.2f}% increase in daily revenue in the long-run.")

print("\n" + "="*50 + "\n")

--- Error Correction Model (ECM) Summary ---
                              UECM Model Results                              
Dep. Variable:          D.log_revenue   No. Observations:                  365
Model:                     UECM(2, 2)   Log Likelihood                2066.629
Method:               Conditional MLE   S.D. of innovations             10.141
Date:                Wed, 10 Sep 2025   AIC                          -4119.258
Time:                        16:10:55   BIC                          -4091.997
Sample:                    01-03-2024   HQIC                         -4108.422
                         - 12-30-2024                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.1003      0.060      1.658      0.098      -0.019       0.219
log_revenue.L1       0.0011      0.004      0.254      0.800      -0

## 7. Diagnostic Checks

In [11]:
print("--- Diagnostic Checks ---")

# Residual diagnostics (example: Ljung-Box test for serial correlation)
resid_diag = sm.stats.acorr_ljungbox(uecm_results.resid, lags=[7, 14], return_df=True)
print("Ljung-Box Test for Serial Correlation in Residuals:")
print(resid_diag)
print("  -> Interpretation: If p-values are > 0.05, we cannot reject the null of no serial correlation.")

print("\n" + "="*50 + "\n")
print("Analysis complete!")

--- Diagnostic Checks ---
Ljung-Box Test for Serial Correlation in Residuals:
        lb_stat  lb_pvalue
7   2441.728416        0.0
14  4707.972355        0.0
  -> Interpretation: If p-values are > 0.05, we cannot reject the null of no serial correlation.


Analysis complete!
