# Project LEGO - Module Test

In [None]:
# Import required modules
from Technic.data import DataManager
from Technic.feature import Interaction
from Technic.condition import create_conditional_var
import pandas as pd
import numpy as np

# Load your data manager (assuming it's already initialized)
# dm = DataManager(...)


In [None]:
# 1. Simple Multiplication Interaction
gdp_unemp = Interaction(
    vars=['GDP', 'UNRATE'],
    interaction_type='multiply'  # default type
)

# 2. Ratio Interaction with Lag
gdp_lag_unemp = Interaction(
    vars=['GDP', 'UNRATE'],
    interaction_type='ratio',  # uses safe division (adds small constant to denominator)
    lag=1  # lags UNRATE by 1 period
)

# 3. Polynomial Interaction (GDP² * UNRATE)
gdp_sq_unemp = Interaction(
    vars=['GDP', 'UNRATE'],
    interaction_type='polynomial',
    powers=[2, 1]  # GDP squared, UNRATE linear
)

# 4. Multiple Variable Interaction
gdp_unemp_cpi = Interaction(
    vars=['GDP', 'UNRATE', 'CPI'],
    interaction_type='multiply'
)

# Use these in feature building
features = dm.build_features([
    'GDP',
    'UNRATE',
    gdp_unemp,          # GDP * UNRATE
    gdp_lag_unemp,      # GDP / UNRATE(t-1)
    gdp_sq_unemp,       # GDP² * UNRATE
    gdp_unemp_cpi       # GDP * UNRATE * CPI
])


In [None]:
# 1. High Unemployment Regime (UNRATE > 10%)
def high_unemp(series):
    """When unemployment > 10%, use the rate; otherwise 0"""
    return series.where(series > 0.10, other=0)

create_conditional_var(
    dm,
    main_var='UNRATE',
    condition=high_unemp,
    alias='UNEMP_REGIME'
)

# 2. GDP in Recession (negative growth)
def recession_gdp(series):
    """When GDP growth is negative, use GDP; otherwise 0"""
    return series.where(series.pct_change() < 0, other=0)

create_conditional_var(
    dm,
    main_var='GDP',
    condition=recession_gdp,
    alias='GDP_RECESSION'
)

# 3. Conditional on Another Variable
def gdp_in_high_unemp(series, unrate):
    """GDP values during high unemployment periods"""
    return series.where(unrate > 0.10, other=0)

create_conditional_var(
    dm,
    main_var='GDP',
    # Use lambda to access other variables
    condition=lambda x: gdp_in_high_unemp(x, dm.model_mev['UNRATE']),
    alias='GDP_HIGH_UNEMP'
)

# 4. Multiple Conditions
def stagflation(series, gdp, cpi):
    """
    Unemployment during stagflation periods:
    - GDP growth < 1%
    - Inflation (CPI) > 5%
    """
    gdp_growth = gdp.pct_change()
    cpi_growth = cpi.pct_change()
    stagflation_mask = (gdp_growth < 0.01) & (cpi_growth > 0.05)
    return series.where(stagflation_mask, other=0)

create_conditional_var(
    dm,
    main_var='UNRATE',
    condition=lambda x: stagflation(
        x, 
        dm.model_mev['GDP'],
        dm.model_mev['CPI']
    ),
    alias='UNEMP_STAGFLATION'
)

# 5. Return Series without Adding to Data
def extreme_values(series):
    """Identify extreme values (>2 std from mean)"""
    mean = series.mean()
    std = series.std()
    return series.where((series > mean + 2*std) | (series < mean - 2*std), other=0)

extreme_gdp = create_conditional_var(
    dm,
    main_var='GDP',
    condition=extreme_values,
    alias='GDP_EXTREME',
    add_to_data=False  # Don't add to datasets, just return the series
)


In [None]:
# Check interaction terms
print("=== Interaction Terms ===")
print("\nGDP * UNRATE:")
print(features[['GDP', 'UNRATE', 'GDP_UNRATE_MUL']].head())

print("\nGDP / UNRATE(t-1):")
print(features[['GDP', 'UNRATE', 'GDP_UNRATE_RATIO_LAG1']].head())

print("\nGDP² * UNRATE:")
print(features[['GDP', 'UNRATE', 'GDP_UNRATE_POW2,1']].head())

# Check conditional variables
print("\n=== Conditional Variables ===")
print("\nHigh Unemployment Regime:")
print(dm.model_mev[['UNRATE', 'UNEMP_REGIME']].head())

print("\nGDP during High Unemployment:")
print(dm.model_mev[['GDP', 'UNRATE', 'GDP_HIGH_UNEMP']].head())

print("\nUnemployment during Stagflation:")
print(dm.model_mev[['UNRATE', 'GDP', 'CPI', 'UNEMP_STAGFLATION']].head())


### Import Package and Libs

In [1]:
import Technic as tc
import pandas as pd

### Load Data

In [2]:
# create DataLoader
data_ldr = tc.PPNRInternalLoader()

# Load data with proper parameters
data_ldr.load(
    source='fake_internal.xlsx',
    date_col='Date'  # Specify the date column name from your Excel
)

# Print first few rows to verify
print("\nFirst few rows:")
print(data_ldr.internal_data.head())


First few rows:
            Fixed_balance  Fixed_price  Redeemable_balance  Redeemable_price  \
Date                                                                           
2018-01-31    1525.766256     0.054260          808.925168          0.018648   
2018-02-28    1540.515474     0.046348          835.771506          0.034777   
2018-03-31    1566.467979     0.026517          863.370334          0.017302   
2018-04-30    1563.129764     0.022823          896.188563          0.036981   
2018-05-31    1583.883081     0.045697          908.819056          0.032661   

             VR_balance  VR_price  
Date                               
2018-01-31  1022.483571  0.045600  
2018-02-28  1041.792249  0.023520  
2018-03-31  1065.030692  0.025023  
2018-04-30  1092.645841  0.013759  
2018-05-31  1111.475074  0.033131  


### Test Different Features of PPNRInternalLoader


#### 1. Test Sample Splitting with Time Cutoff


In [3]:
# Create loader with in-sample period specified
data_ldr_split = tc.PPNRInternalLoader(
    in_sample_start="2018-01-01",
    in_sample_end="2019-06-30"
)

# Load the same data
data_ldr_split.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Get in-sample and out-of-sample data
in_sample_data = data_ldr_split.internal_data.loc[data_ldr_split.in_sample_idx]
out_sample_data = data_ldr_split.internal_data.loc[data_ldr_split.out_sample_idx]

print("In-sample period data:")
print(f"Start: {in_sample_data.index.min()}")
print(f"End: {in_sample_data.index.max()}")
print(f"Number of observations: {len(in_sample_data)}\n")

print("Out-of-sample period data:")
print(f"Start: {out_sample_data.index.min()}")
print(f"End: {out_sample_data.index.max()}")
print(f"Number of observations: {len(out_sample_data)}")


In-sample period data:
Start: 2018-01-31 00:00:00
End: 2019-06-30 00:00:00
Number of observations: 18

Out-of-sample period data:
Start: 2019-07-31 00:00:00
End: 2023-11-30 00:00:00
Number of observations: 53


#### 2. Test Different Frequency Handling


In [4]:
# Create loader with quarterly frequency
data_ldr_q = tc.PPNRInternalLoader(freq='Q')

# Load the same data - it will be automatically converted to quarter-end dates
data_ldr_q.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Compare the first few rows of monthly vs quarterly data
print("Monthly frequency data (original):")
print(data_ldr.internal_data.head(), "\n")

print("Quarterly frequency data (converted):")
print(data_ldr_q.internal_data.head())


Monthly frequency data (original):
            Fixed_balance  Fixed_price  Redeemable_balance  Redeemable_price  \
Date                                                                           
2018-01-31    1525.766256     0.054260          808.925168          0.018648   
2018-02-28    1540.515474     0.046348          835.771506          0.034777   
2018-03-31    1566.467979     0.026517          863.370334          0.017302   
2018-04-30    1563.129764     0.022823          896.188563          0.036981   
2018-05-31    1583.883081     0.045697          908.819056          0.032661   

             VR_balance  VR_price  
Date                               
2018-01-31  1022.483571  0.045600  
2018-02-28  1041.792249  0.023520  
2018-03-31  1065.030692  0.025023  
2018-04-30  1092.645841  0.013759  
2018-05-31  1111.475074  0.033131   

Quarterly frequency data (converted):
            Fixed_balance  Fixed_price  Redeemable_balance  Redeemable_price  \
Date                            

#### 3. Test Full Sample Period Constraints

In [5]:
# Create loader with full sample period constraints
data_ldr_constrained = tc.PPNRInternalLoader(
    in_sample_start="2018-06-01",
    in_sample_end="2019-06-30",
    full_sample_start="2018-03-01",  # Start a bit later
    full_sample_end="2019-12-31"     # End a bit earlier
)

# Load the data
data_ldr_constrained.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Show the data ranges
full_data = data_ldr_constrained.internal_data
in_sample = full_data.loc[data_ldr_constrained.in_sample_idx]
out_sample = full_data.loc[data_ldr_constrained.out_sample_idx]

print("Full data range with constraints:")
print(f"Start: {full_data.index.min()}")
print(f"End: {full_data.index.max()}\n")

print("In-sample period:")
print(f"Start: {in_sample.index.min()}")
print(f"End: {in_sample.index.max()}\n")

print("Out-of-sample period:")
print(f"Start: {out_sample.index.min()}")
print(f"End: {out_sample.index.max()}")


Full data range with constraints:
Start: 2018-01-31 00:00:00
End: 2023-11-30 00:00:00

In-sample period:
Start: 2018-06-30 00:00:00
End: 2019-06-30 00:00:00

Out-of-sample period:
Start: 2019-07-31 00:00:00
End: 2019-12-31 00:00:00


#### 4. Test Loading from DataFrame


In [6]:
# Create a sample DataFrame
import pandas as pd
import numpy as np

dates = pd.date_range(start='2018-01-01', end='2019-12-31', freq='M')
sample_df = pd.DataFrame({
    'Date': dates,
    'Value': np.random.randn(len(dates)) * 100 + 1000
})

# Create loader and load from DataFrame
data_ldr_df = tc.PPNRInternalLoader()
data_ldr_df.load(
    source=sample_df,
    date_col='Date'
)

print("Data loaded from DataFrame:")
print(data_ldr_df.internal_data.head())

# Verify the index is properly set
print("\nIndex properties:")
print(f"Index type: {type(data_ldr_df.internal_data.index)}")
print(f"Frequency: {data_ldr_df.internal_data.index.freqstr}")


Data loaded from DataFrame:
                  Value
Date                   
2018-01-31  1112.698711
2018-02-28  1079.304995
2018-03-31   926.166426
2018-04-30   812.336866
2018-05-31   857.927135

Index properties:
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Frequency: M


#### 5. Test Loading Without date_col Parameter


In [7]:
# Test Case 1: DataFrame with DatetimeIndex
print("Test Case 1: DataFrame with DatetimeIndex")
print("-" * 40)

# Create a DataFrame with DatetimeIndex
dates_index = pd.date_range(start='2018-01-01', end='2019-12-31', freq='M')
df_with_index = pd.DataFrame({
    'Value1': np.random.randn(len(dates_index)) * 100 + 1000,
    'Value2': np.random.randn(len(dates_index)) * 50 + 500
}, index=dates_index)

print("Original DataFrame with DatetimeIndex:")
print(df_with_index.head(), "\n")

# Load without specifying date_col
data_ldr_indexed = tc.PPNRInternalLoader()
data_ldr_indexed.load(source=df_with_index)

print("Loaded data (should preserve the index):")
print(data_ldr_indexed.internal_data.head(), "\n")
print(f"Index type: {type(data_ldr_indexed.internal_data.index)}")
print(f"Frequency: {data_ldr_indexed.internal_data.index.freqstr}\n\n")


Test Case 1: DataFrame with DatetimeIndex
----------------------------------------
Original DataFrame with DatetimeIndex:
                 Value1      Value2
2018-01-31  1183.977935  512.167354
2018-02-28   858.603155  563.516392
2018-03-31  1082.256032  560.291478
2018-04-30  1302.497407  578.676383
2018-05-31  1137.632037  456.047272 

Loaded data (should preserve the index):
                 Value1      Value2
2018-01-31  1183.977935  512.167354
2018-02-28   858.603155  563.516392
2018-03-31  1082.256032  560.291478
2018-04-30  1302.497407  578.676383
2018-05-31  1137.632037  456.047272 

Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Frequency: M




In [8]:
# Test Case 2: DataFrame without DatetimeIndex
print("Test Case 2: DataFrame without DatetimeIndex")
print("-" * 40)

# Create a DataFrame without DatetimeIndex (regular numeric index)
df_without_index = pd.DataFrame({
    'Date': dates_index,  # Date as a column instead of index
    'Value1': np.random.randn(len(dates_index)) * 100 + 1000,
    'Value2': np.random.randn(len(dates_index)) * 50 + 500
})

print("Original DataFrame without DatetimeIndex:")
print(df_without_index.head(), "\n")

# Try to load without specifying date_col (should raise an error)
data_ldr_no_index = tc.PPNRInternalLoader()
try:
    data_ldr_no_index.load(source=df_without_index)
    print("Loaded data (shouldn't reach here):")
    print(data_ldr_no_index.internal_data.head())
except ValueError as e:
    print("Expected error raised:")
    print(e)


Test Case 2: DataFrame without DatetimeIndex
----------------------------------------
Original DataFrame without DatetimeIndex:
        Date       Value1      Value2
0 2018-01-31   997.955475  498.654784
1 2018-02-28   999.515419  571.159186
2 2018-03-31   937.649897  399.392914
3 2018-04-30  1071.443102  495.720292
4 2018-05-31  1132.465135  491.003424 

Expected error raised:
date_col required when DataFrame does not have datetime index


### Test MEVLoader Class


In [9]:
# Create MEVLoader instance
mev_ldr = tc.MEVLoader()

# Load base MEVs from the 'base' sheet
print("1. Loading base MEVs...")
mev_ldr.load(
    source='fake_scens.xlsx',
    sheet='base'
)

print("\nQuarterly base MEVs:")
print(mev_ldr.model_mev_qtr.head())

# Check MEV codes and their types
print("\nMEV codes and their types:")
for code in mev_ldr.mev_codes:
    info = mev_ldr.get_mev_info(code)
    print(f"{code}: {info['type']} - {info['description']}")


1. Loading base MEVs...

Quarterly base MEVs:
nan           NGDP       PSR      PDI UNRATE   CPI
2000-03-31  100.16  10032.57  8047.46   3.64  3.08
2000-06-30   99.74   9996.18  8044.02   4.42  3.96
2000-09-30   99.77   9940.38  8035.14   4.82   9.3
2000-12-31   99.54  10063.83  7958.35   6.23  9.12
2001-03-31    99.8   9925.75  8018.19   5.85  7.18

MEV codes and their types:
CPI: rate - Consumer Price Index
NGDP: level - Nominal GDP
PDI: level - Personal Disposable Income
PSR: rate - Personal Savings Rate
UNRATE: rate - Unemployment Rate


In [10]:
# Load scenario MEVs from 'adv' and 'sev' sheets
print("2. Loading scenario MEVs...")
mev_ldr.load_scens(
    source='fake_scens.xlsx',
    scens={
        'Base': 'base',
        'Adverse': 'adv',
        'Severe': 'sev'
    },
    set_name='EWST2024'
)

# Print available scenario sets
print("\nAvailable scenario sets:", list(mev_ldr.scen_mev_qtr.keys()))

# Print scenarios in EWST2024
print("\nScenarios in EWST2024:", list(mev_ldr.scen_mev_qtr['EWST2024'].keys()))

# Compare scenarios for a specific MEV (e.g., NGDP)
print("\nComparing NGDP across scenarios:")
for scen_name, scen_data in mev_ldr.scen_mev_qtr['EWST2024'].items():
    print(f"\n{scen_name} scenario:")
    print(scen_data['NGDP'].head())


2. Loading scenario MEVs...

Available scenario sets: ['EWST2024']

Scenarios in EWST2024: ['Base', 'Adverse', 'Severe']

Comparing NGDP across scenarios:

Base scenario:
2000-03-31    100.16
2000-06-30     99.74
2000-09-30     99.77
2000-12-31     99.54
2001-03-31      99.8
Freq: Q-DEC, Name: NGDP, dtype: object

Adverse scenario:
2000-03-31    110.17
2000-06-30    109.71
2000-09-30    109.74
2000-12-31     109.5
2001-03-31    109.78
Freq: Q-DEC, Name: NGDP, dtype: object

Severe scenario:
2000-03-31    120.19
2000-06-30    119.69
2000-09-30    119.72
2000-12-31    119.45
2001-03-31    119.76
Freq: Q-DEC, Name: NGDP, dtype: object


In [11]:
# Test updating scenario MEVs
print("3. Testing scenario updates...")

# Create a sample update for NGDP in Base scenario
import numpy as np

# Get existing dates from Base scenario
base_dates = mev_ldr.scen_mev_qtr['EWST2024']['Base'].index[:5]
updated_values = np.array([101.0, 102.0, 103.0, 104.0, 105.0])

update_df = pd.DataFrame({
    'NGDP': updated_values
}, index=base_dates)

# Update the Base scenario
mev_ldr.update_scen_mevs(
    {'Base': update_df},
    set_name='EWST2024'
)

print("\nNGDP in Base scenario after update:")
print(mev_ldr.scen_mev_qtr['EWST2024']['Base']['NGDP'].head())


3. Testing scenario updates...

NGDP in Base scenario after update:
2000-03-31    101.0
2000-06-30    102.0
2000-09-30    103.0
2000-12-31    104.0
2001-03-31    105.0
Freq: Q-DEC, Name: NGDP, dtype: float64


In [12]:
# Test cleaning operations
print("4. Testing cleaning operations...")

# Clean specific scenario set
print("\nCleaning EWST2024 scenario set...")
mev_ldr.clean_scen_mevs(set_name='EWST2024')
print("Scenario sets after cleaning EWST2024:", list(mev_ldr.scen_mev_qtr.keys()))

# Clean model MEVs
print("\nCleaning model MEVs...")
mev_ldr.clean_model_mevs()
print("Model MEV shape after cleaning:", mev_ldr.model_mev_qtr.shape)

# Clean everything
print("\nCleaning all data...")
mev_ldr.clean_all()
print("MEV codes after cleaning:", mev_ldr.mev_codes)
print("Scenario sets after cleaning:", list(mev_ldr.scen_mev_qtr.keys()))


4. Testing cleaning operations...

Cleaning EWST2024 scenario set...
Scenario sets after cleaning EWST2024: []

Cleaning model MEVs...
Model MEV shape after cleaning: (0, 0)

Cleaning all data...
MEV codes after cleaning: []
Scenario sets after cleaning: []


In [13]:
# Test transform mappings
print("5. Testing transform mappings...")

# Print available transforms for each MEV type
print("\nAvailable transforms by MEV type:")
for mev_type, transforms in mev_ldr.tsfm_map.items():
    print(f"{mev_type}: {transforms}")

# Print transform types for specific MEVs
print("\nTransform types for specific MEVs:")
test_mevs = ['NGDP', 'UNRATE', 'CPI']
for mev in test_mevs:
    mev_type = mev_ldr.get_mev_info(mev)['type']
    transforms = mev_ldr.tsfm_map.get(mev_type, [])
    print(f"{mev} ({mev_type}): {transforms}")


5. Testing transform mappings...

Available transforms by MEV type:
level: ['LV', 'GR']
rate: ['LV', 'DF', 'GR']

Transform types for specific MEVs:
NGDP (level): ['LV', 'GR']
UNRATE (rate): ['LV', 'DF', 'GR']
CPI (rate): ['LV', 'DF', 'GR']


### Test DataManager Class


#### 1. Test DataManager Initialization and Basic Data Access


In [14]:
# Create and load internal data loader
internal_loader = tc.PPNRInternalLoader(
    in_sample_start="2018-01-01",
    in_sample_end="2019-06-30"
)
internal_loader.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Create and load MEV loader
mev_loader = tc.MEVLoader()
mev_loader.load('fake_scens.xlsx', sheet='base')  # Load model MEVs
mev_loader.load_scens(            # Load scenario MEVs
    'fake_scens.xlsx',
    scens={"Base": "base", "Adverse": "adv", "Severe": "sev"}
)

# Create DataManager
dm = tc.DataManager(internal_loader, mev_loader)

# Test basic data access
print("Internal Data Shape:", dm.internal_data.shape)
print("Internal Data Columns:", dm.internal_data.columns.tolist())
print("\nIn-Sample Data Range:")
print(f"Start: {dm.internal_in.index.min()}")
print(f"End: {dm.internal_in.index.max()}")
print("\nOut-Sample Data Range:")
print(f"Start: {dm.internal_out.index.min()}")
print(f"End: {dm.internal_out.index.max()}")


Internal Data Shape: (71, 6)
Internal Data Columns: ['Fixed_balance', 'Fixed_price', 'Redeemable_balance', 'Redeemable_price', 'VR_balance', 'VR_price']

In-Sample Data Range:
Start: 2018-01-31 00:00:00
End: 2019-06-30 00:00:00

Out-Sample Data Range:
Start: 2019-07-31 00:00:00
End: 2023-11-30 00:00:00


#### 2. Test MEV Data Interpolation and Caching


In [15]:
# Get model MEV data
model_mev = dm.model_mev

# Check interpolation results
print("Model MEV Data:")
print("Shape:", model_mev.shape)
print("Columns:", model_mev.columns.tolist())
print("\nFirst few rows:")
print(model_mev.head())

# Test caching by accessing again
model_mev2 = dm.model_mev
print("\nVerify cache is working (should be instant):")
print("Same object:", model_mev2 is model_mev)  # Should be True if caching works

# Check time indicators
print("\nTime indicators:")
print(model_mev[['M', 'Q']].head())


Model MEV Data:
Shape: (370, 7)
Columns: ['NGDP', 'PSR', 'PDI', 'UNRATE', 'CPI', 'M', 'Q']

First few rows:
nan               NGDP           PSR          PDI    UNRATE       CPI  M  Q
2000-03-31  100.160000  10032.570000  8047.460000  3.640000  3.080000  3  1
2000-04-30   99.896506  10049.097386  8036.334431  4.098958  2.067059  4  2
2000-05-31   99.768751  10031.340067  8037.305959  4.332339  2.532845  5  2
2000-06-30   99.740000   9996.180000  8044.020000  4.420000  3.960000  6  2
2000-07-31   99.761353   9957.712737  8050.351074  4.466939  5.934583  7  3

Verify cache is working (should be instant):
Same object: True

Time indicators:
nan         M  Q
2000-03-31  3  1
2000-04-30  4  2
2000-05-31  5  2
2000-06-30  6  2
2000-07-31  7  3


#### 3. Test Scenario MEV Handling


In [16]:
# Get scenario MEVs
scen_mevs = dm.scen_mevs

# Print structure and content
print("Scenario Sets:", list(scen_mevs.keys()))
for set_name, scenarios in scen_mevs.items():
    print(f"\nScenario Set: {set_name}")
    print("Scenarios:", list(scenarios.keys()))
    
    # Print details of first scenario
    first_scen = next(iter(scenarios.values()))
    print(f"First scenario shape: {first_scen.shape}")
    print(f"First scenario columns: {first_scen.columns.tolist()}")
    print("\nFirst few rows:")
    print(first_scen.head())


Scenario Sets: ['fake_scens']

Scenario Set: fake_scens
Scenarios: ['Severe', 'Adverse', 'Base']
First scenario shape: (370, 7)
First scenario columns: ['NGDP', 'PSR', 'PDI', 'UNRATE', 'CPI', 'M', 'Q']

First few rows:
nan               NGDP          PSR          PDI    UNRATE       CPI  M  Q
2000-03-31  120.190000  9029.310000  9656.950000  3.270000  3.690000  3  1
2000-04-30  119.878426  9044.184086  9643.600764  3.691235  2.475176  4  2
2000-05-31  119.726034  9028.203043  9644.765534  3.903280  3.035652  5  2
2000-06-30  119.690000  8996.560000  9652.820000  3.980000  4.750000  6  2
2000-07-31  119.712911  8961.940402  9660.414604  4.017966  7.121091  7  3


#### 4. Test Data Modification Methods


In [17]:
# Test apply_to_internal
def add_internal_features(df):
    # Add a new column in-place
    df['Total_Balance'] = df['Fixed_balance'] + df['Redeemable_balance'] + df['VR_balance']
    
    # Return a new feature
    return pd.Series(
        df['Total_Balance'].pct_change(),
        name='Balance_Growth'
    )

dm.apply_to_internal(add_internal_features)

# Verify changes
print("Internal Data after modifications:")
print("New columns:", dm.internal_data.columns.tolist())
print("\nFirst few rows of new features:")
print(dm.internal_data[['Total_Balance', 'Balance_Growth']].head())

# Test apply_to_mevs
def add_mev_features(mev_df, internal_df):
    # Add a ratio of two MEVs
    mev_df['NGDP_per_UNRATE'] = mev_df['NGDP'] / mev_df['UNRATE']
    return mev_df

dm.apply_to_mevs(add_mev_features)

# Verify changes in both model and scenario MEVs
print("\nModel MEV after modifications:")
print("New columns:", dm.model_mev.columns.tolist())
print("\nFirst few rows of new feature:")
print(dm.model_mev['NGDP_per_UNRATE'].head())

print("\nScenario MEVs after modifications:")
for set_name, scenarios in dm.scen_mevs.items():
    print(f"\nScenario Set: {set_name}")
    for scen_name, df in scenarios.items():
        print(f"Scenario {scen_name} columns:", df.columns.tolist())


Internal Data after modifications:
New columns: ['Fixed_balance', 'Fixed_price', 'Redeemable_balance', 'Redeemable_price', 'VR_balance', 'VR_price', 'Total_Balance', 'Balance_Growth']

First few rows of new features:
            Total_Balance  Balance_Growth
Date                                     
2018-01-31    3357.174994             NaN
2018-02-28    3418.079229        0.018142
2018-03-31    3494.869005        0.022466
2018-04-30    3551.964168        0.016337
2018-05-31    3604.177211        0.014700

Model MEV after modifications:
New columns: ['NGDP', 'PSR', 'PDI', 'UNRATE', 'CPI', 'NGDP_per_UNRATE', 'M', 'Q']

First few rows of new feature:
2000-03-31    27.516484
2000-04-30    24.920392
2000-05-31    23.360011
2000-06-30    22.565611
2000-07-31    22.097281
Freq: M, Name: NGDP_per_UNRATE, dtype: float64

Scenario MEVs after modifications:

Scenario Set: fake_scens
Scenario Severe columns: ['NGDP', 'PSR', 'PDI', 'UNRATE', 'CPI', 'NGDP_per_UNRATE', 'M', 'Q']
Scenario Adverse col

### Test Enhanced MEV Handling in DataManager


In [18]:
# Create sample monthly and quarterly MEV data
import pandas as pd
import numpy as np

# Create quarterly MEV data
dates_qtr = pd.date_range('2018-01-01', '2023-12-31', freq='Q')
mev_qtr = pd.DataFrame({
    'GDP': np.random.normal(100, 10, len(dates_qtr)),
    'UNRATE': np.random.normal(5, 1, len(dates_qtr)),
    'CPI': np.random.normal(200, 20, len(dates_qtr))
}, index=dates_qtr)

# Create monthly MEV data with some overlapping variables
dates_mth = pd.date_range('2018-01-01', '2023-12-31', freq='M')
mev_mth = pd.DataFrame({
    'GDP': np.random.normal(100, 10, len(dates_mth)),  # Overlapping with quarterly
    'HOUSING': np.random.normal(150, 15, len(dates_mth)),  # Monthly only
    'CPI': np.random.normal(200, 20, len(dates_mth))  # Overlapping with quarterly
}, index=dates_mth)

# Initialize MEV loader with sample MEV map
mev_map = {
    'GDP': {'type': 'level', 'description': 'Gross Domestic Product'},
    'UNRATE': {'type': 'rate', 'description': 'Unemployment Rate'},
    'CPI': {'type': 'level', 'description': 'Consumer Price Index'},
    'HOUSING': {'type': 'level', 'description': 'Housing Index'}
}

# Create and load MEV loader
mev_loader = tc.MEVLoader(mev_map=mev_map)
mev_loader._model_mev_qtr = mev_qtr
mev_loader._model_mev_mth = mev_mth


#### Test 1: Monthly Frequency Data Handling


In [19]:
# Create DataManager with monthly internal data
data_ldr_mth = tc.PPNRInternalLoader(freq='M')
data_ldr_mth.load(source='fake_internal.xlsx', date_col='Date')

# Create DataManager
dm_mth = tc.DataManager(data_ldr_mth, mev_loader)

# Get combined MEV data
mev_combined = dm_mth.model_mev

# Check the results
print("Combined MEV Data Structure:")
print(f"Total columns: {len(mev_combined.columns)}")
print("\nColumn names:")
for col in sorted(mev_combined.columns):
    print(f"- {col}")

print("\nVerify MEV handling:")
print("1. Monthly-only variables (should be as-is):")
print(f"- HOUSING exists: {'HOUSING' in mev_combined.columns}")

print("\n2. Overlapping variables:")
print("- Original monthly data:")
print(f"- GDP exists: {'GDP' in mev_combined.columns}")
print("- Interpolated quarterly data with '_Q' suffix:")
print(f"- GDP_Q exists: {'GDP_Q' in mev_combined.columns}")

print("\n3. Quarterly-only variables:")
print(f"- UNRATE exists with '_Q' suffix: {'UNRATE_Q' in mev_combined.columns}")

# Check if the MEV map was updated
print("\nMEV Map Updates:")
mev_map = dm_mth.mev_map
print("Derived MEV descriptions:")
for code in ['GDP_Q', 'UNRATE_Q', 'CPI_Q']:
    if code in mev_map:
        print(f"- {code}: {mev_map[code]['description']}")


Combined MEV Data Structure:
Total columns: 8

Column names:
- CPI
- CPI_Q
- GDP
- GDP_Q
- HOUSING
- M
- Q
- UNRATE

Verify MEV handling:
1. Monthly-only variables (should be as-is):
- HOUSING exists: True

2. Overlapping variables:
- Original monthly data:
- GDP exists: True
- Interpolated quarterly data with '_Q' suffix:
- GDP_Q exists: True

3. Quarterly-only variables:
- UNRATE exists with '_Q' suffix: False

MEV Map Updates:
Derived MEV descriptions:
- GDP_Q: Gross Domestic Product (Interpolated from quarterly)
- CPI_Q: Consumer Price Index (Interpolated from quarterly)




#### Test 2: Quarterly Frequency Data Handling


In [20]:
# Create DataManager with quarterly internal data
data_ldr_qtr = tc.PPNRInternalLoader(freq='Q')
data_ldr_qtr.load(source='fake_internal.xlsx', date_col='Date')

# Create DataManager
dm_qtr = tc.DataManager(data_ldr_qtr, mev_loader)

# Get combined MEV data
mev_combined = dm_qtr.model_mev

# Check the results
print("Combined MEV Data Structure:")
print(f"Total columns: {len(mev_combined.columns)}")
print("\nColumn names:")
for col in sorted(mev_combined.columns):
    print(f"- {col}")

print("\nVerify MEV handling:")
print("1. Quarterly-only variables (should be as-is):")
print(f"- UNRATE exists: {'UNRATE' in mev_combined.columns}")

print("\n2. Overlapping variables:")
print("- Original quarterly data:")
print(f"- GDP exists: {'GDP' in mev_combined.columns}")
print("- Monthly-derived data with '_M' suffix:")
print(f"- GDP_M exists: {'GDP_M' in mev_combined.columns}")

print("\n3. Monthly-only variables:")
print(f"- HOUSING exists with '_M' suffix: {'HOUSING_M' in mev_combined.columns}")

# Check if the MEV map was updated
print("\nMEV Map Updates:")
mev_map = dm_qtr.mev_map
print("Derived MEV descriptions:")
for code in ['GDP_M', 'HOUSING_M', 'CPI_M']:
    if code in mev_map:
        print(f"- {code}: {mev_map[code]['description']}")

# Verify quarterly averaging
print("\nQuarterly Averaging Check:")
print("Checking if monthly data was properly averaged to quarters...")

# Get a sample overlapping variable (GDP)
if 'GDP' in mev_combined.columns and 'GDP_M' in mev_combined.columns:
    sample_quarter = mev_combined.index[0]
    monthly_data = mev_mth.loc[f"{sample_quarter.year}-{sample_quarter.quarter}"]
    if len(monthly_data) == 3:  # Only check if we have all 3 months
        expected_avg = monthly_data['GDP'].mean()
        actual_avg = mev_combined.loc[sample_quarter, 'GDP_M']
        print(f"\nFor quarter {sample_quarter}:")
        print(f"Monthly values: {monthly_data['GDP'].values}")
        print(f"Calculated quarterly average: {expected_avg:.4f}")
        print(f"Value in combined data: {actual_avg:.4f}")
        print(f"Averages match: {abs(expected_avg - actual_avg) < 1e-10}")


Combined MEV Data Structure:
Total columns: 8

Column names:
- CPI
- CPI_M
- GDP
- GDP_M
- HOUSING
- M
- Q
- UNRATE

Verify MEV handling:
1. Quarterly-only variables (should be as-is):
- UNRATE exists: True

2. Overlapping variables:
- Original quarterly data:
- GDP exists: True
- Monthly-derived data with '_M' suffix:
- GDP_M exists: True

3. Monthly-only variables:
- HOUSING exists with '_M' suffix: False

MEV Map Updates:
Derived MEV descriptions:
- GDP_M: Gross Domestic Product (Averaged from monthly)
- CPI_M: Consumer Price Index (Averaged from monthly)

Quarterly Averaging Check:
Checking if monthly data was properly averaged to quarters...




#### Test 3: Refresh and Cache Behavior


In [21]:
# Test cache behavior and refresh functionality
dm = tc.DataManager(data_ldr_mth, mev_loader)

# First access to trigger cache creation
print("Initial access:")
mev1 = dm.model_mev
print(f"Number of columns: {len(mev1.columns)}")

# Second access should use cache
print("\nSecond access (should use cache):")
mev2 = dm.model_mev
print(f"Same object: {mev1 is mev2}")  # Should be True due to caching

# Modify MEV data
print("\nModifying MEV data and refreshing:")
new_gdp = np.random.normal(100, 10, len(mev_qtr))
mev_loader._model_mev_qtr['GDP'] = new_gdp

# Access without refresh (should still use cache)
mev3 = dm.model_mev
print(f"Still using cache: {mev1 is mev3}")  # Should be True

# Refresh and access again
dm.refresh()
mev4 = dm.model_mev
print(f"New data after refresh: {mev1 is mev4}")  # Should be False

# Verify data was actually updated
print("\nVerifying data update:")
if 'GDP_Q' in mev4.columns:
    print("GDP_Q values changed:", not (mev1['GDP_Q'] == mev4['GDP_Q']).all())


Initial access:
Number of columns: 8

Second access (should use cache):
Same object: True

Modifying MEV data and refreshing:
Still using cache: False
New data after refresh: False

Verifying data update:
GDP_Q values changed: True




#### Test 4: Suffix Addition Logic


In [22]:
# Test suffix addition logic for overlapping vs non-overlapping variables

# Create test data with specific overlap patterns
dates_qtr = pd.date_range('2018-01-01', '2023-12-31', freq='Q')
dates_mth = pd.date_range('2018-01-01', '2023-12-31', freq='M')

# Quarterly data:
# - GDP: overlaps with monthly
# - UNRATE: quarterly only
# - CPI: overlaps with monthly
mev_qtr_test = pd.DataFrame({
    'GDP': np.random.normal(100, 10, len(dates_qtr)),
    'UNRATE': np.random.normal(5, 1, len(dates_qtr)),
    'CPI': np.random.normal(200, 20, len(dates_qtr))
}, index=dates_qtr)

# Monthly data:
# - GDP: overlaps with quarterly
# - HOUSING: monthly only
# - CPI: overlaps with quarterly
mev_mth_test = pd.DataFrame({
    'GDP': np.random.normal(100, 10, len(dates_mth)),
    'HOUSING': np.random.normal(150, 15, len(dates_mth)),
    'CPI': np.random.normal(200, 20, len(dates_mth))
}, index=dates_mth)

# Update MEV loader with test data
mev_loader._model_mev_qtr = mev_qtr_test
mev_loader._model_mev_mth = mev_mth_test

print("Test 1: Monthly Frequency Data")
print("-" * 50)
dm_mth = tc.DataManager(data_ldr_mth, mev_loader)
mev_monthly = dm_mth.model_mev

print("Column names in combined data:")
for col in sorted(mev_monthly.columns):
    print(f"- {col}")

print("\nVerifying suffix logic:")
print("1. Overlapping variables (should have both original and _Q):")
print(f"- GDP exists: {'GDP' in mev_monthly.columns}")
print(f"- GDP_Q exists: {'GDP_Q' in mev_monthly.columns}")
print(f"- CPI exists: {'CPI' in mev_monthly.columns}")
print(f"- CPI_Q exists: {'CPI_Q' in mev_monthly.columns}")

print("\n2. Non-overlapping variables (should NOT have suffix):")
print(f"- UNRATE exists without suffix: {'UNRATE' in mev_monthly.columns}")
print(f"- UNRATE_Q does NOT exist: {'UNRATE_Q' not in mev_monthly.columns}")
print(f"- HOUSING exists without suffix: {'HOUSING' in mev_monthly.columns}")

print("\nTest 2: Quarterly Frequency Data")
print("-" * 50)
dm_qtr = tc.DataManager(data_ldr_qtr, mev_loader)
mev_quarterly = dm_qtr.model_mev

print("Column names in combined data:")
for col in sorted(mev_quarterly.columns):
    print(f"- {col}")

print("\nVerifying suffix logic:")
print("1. Overlapping variables (should have both original and _M):")
print(f"- GDP exists: {'GDP' in mev_quarterly.columns}")
print(f"- GDP_M exists: {'GDP_M' in mev_quarterly.columns}")
print(f"- CPI exists: {'CPI' in mev_quarterly.columns}")
print(f"- CPI_M exists: {'CPI_M' in mev_quarterly.columns}")

print("\n2. Non-overlapping variables (should NOT have suffix):")
print(f"- UNRATE exists without suffix: {'UNRATE' in mev_quarterly.columns}")
print(f"- HOUSING exists without suffix: {'HOUSING' in mev_quarterly.columns}")
print(f"- HOUSING_M does NOT exist: {'HOUSING_M' not in mev_quarterly.columns}")

# Verify MEV map updates
print("\nMEV Map Updates")
print("-" * 50)
mev_map = dm_mth.mev_map
print("1. Checking derived MEVs in map:")
print("Overlapping variables (should have derived entries):")
for code in ['GDP_Q', 'CPI_Q', 'GDP_M', 'CPI_M']:
    if code in mev_map:
        print(f"- {code}: {mev_map[code]['description']}")

print("\nNon-overlapping variables (should NOT have derived entries):")
for code in ['UNRATE_Q', 'HOUSING_M']:
    print(f"- {code} exists in map: {code in mev_map}")




Test 1: Monthly Frequency Data
--------------------------------------------------


Column names in combined data:
- CPI
- CPI_Q
- GDP
- GDP_Q
- HOUSING
- M
- Q
- UNRATE

Verifying suffix logic:
1. Overlapping variables (should have both original and _Q):
- GDP exists: True
- GDP_Q exists: True
- CPI exists: True
- CPI_Q exists: True

2. Non-overlapping variables (should NOT have suffix):
- UNRATE exists without suffix: True
- UNRATE_Q does NOT exist: True
- HOUSING exists without suffix: True

Test 2: Quarterly Frequency Data
--------------------------------------------------
Column names in combined data:
- CPI
- CPI_M
- GDP
- GDP_M
- HOUSING
- M
- Q
- UNRATE

Verifying suffix logic:
1. Overlapping variables (should have both original and _M):
- GDP exists: True
- GDP_M exists: True
- CPI exists: True
- CPI_M exists: True

2. Non-overlapping variables (should NOT have suffix):
- UNRATE exists without suffix: True
- HOUSING exists without suffix: True
- HOUSING_M does NOT exist: True

MEV Map Updates
--------------------------------------------------
1. Checking deri



#### Test 5: Transform Specification Building


In [23]:
# Test transform specification building with different variable types
from Technic.transform import TSFM

# Create test variables with different types
test_vars = ['GDP', 'UNRATE', 'HOUSING', 'NonExistentVar']

# Build transform specifications
print("Building transform specifications...")
specs = dm_mth.build_tsfm_specs(test_vars, max_lag=1, max_periods=2)

print("\nResults for each variable:")
for var, transforms in specs.items():
    print(f"\n{var}:")
    if isinstance(transforms[0], str):
        print("- Raw only (no type mapping)")
    else:
        print("Transforms:")
        for t in transforms:
            print(f"- {t}")

# Test build_search_vars
print("\nTesting build_search_vars...")
var_dfs = dm_mth.build_search_vars(test_vars, max_lag=1, max_periods=2)

print("\nGenerated features for each variable:")
for var, df in var_dfs.items():
    print(f"\n{var}:")
    print("Features:", df.columns.tolist())




Building transform specifications...

Results for each variable:

GDP:
Transforms:
- TSFM:GDP_LV
- TSFM:GDP_LV_L1
- TSFM:GDP_GR
- TSFM:GDP_GR_L1
- TSFM:GDP_GR2
- TSFM:GDP_GR2_L1

UNRATE:
Transforms:
- TSFM:UNRATE_LV
- TSFM:UNRATE_LV_L1
- TSFM:UNRATE_DF
- TSFM:UNRATE_DF_L1
- TSFM:UNRATE_DF2
- TSFM:UNRATE_DF2_L1
- TSFM:UNRATE_GR
- TSFM:UNRATE_GR_L1
- TSFM:UNRATE_GR2
- TSFM:UNRATE_GR2_L1

HOUSING:
Transforms:
- TSFM:HOUSING_LV
- TSFM:HOUSING_LV_L1
- TSFM:HOUSING_GR
- TSFM:HOUSING_GR_L1
- TSFM:HOUSING_GR2
- TSFM:HOUSING_GR2_L1

NonExistentVar:
- Raw only (no type mapping)

Testing build_search_vars...


KeyError: "Feature 'NonExistentVar' not found in data sources."

## Testing Combined Quarterly and Monthly MEV Data

This section tests the new functionality in DataManager that handles both quarterly and monthly MEV data, including:
1. Combining quarterly and monthly model MEVs
2. Combining quarterly and monthly scenario MEVs
3. Handling overlapping variables with appropriate suffixes
4. Verifying data frequency matches internal data


In [24]:
# Create synthetic scenario data for testing
print("Creating synthetic scenario data...")
print("-" * 50)

# Create date ranges
dates_qtr = pd.date_range('2018-01-01', '2023-12-31', freq='Q')
dates_mth = pd.date_range('2018-01-01', '2023-12-31', freq='M')

# Create quarterly scenario data
scen_qtr = {
    'EWST2024': {
        'Base': pd.DataFrame({
            'GDP': np.random.normal(100, 5, len(dates_qtr)),
            'UNRATE': np.random.normal(5, 0.5, len(dates_qtr)),
            'CPI': np.random.normal(200, 10, len(dates_qtr))
        }, index=dates_qtr),
        'Adverse': pd.DataFrame({
            'GDP': np.random.normal(90, 5, len(dates_qtr)),
            'UNRATE': np.random.normal(7, 0.5, len(dates_qtr)),
            'CPI': np.random.normal(220, 10, len(dates_qtr))
        }, index=dates_qtr)
    }
}

# Create monthly scenario data
scen_mth = {
    'EWST2024': {
        'Base': pd.DataFrame({
            'GDP': np.random.normal(100, 2, len(dates_mth)),
            'HOUSING': np.random.normal(150, 5, len(dates_mth)),
            'CPI': np.random.normal(200, 5, len(dates_mth))
        }, index=dates_mth),
        'Adverse': pd.DataFrame({
            'GDP': np.random.normal(90, 2, len(dates_mth)),
            'HOUSING': np.random.normal(130, 5, len(dates_mth)),
            'CPI': np.random.normal(220, 5, len(dates_mth))
        }, index=dates_mth)
    }
}

# Update MEV loader with synthetic data
dm._mev_loader._scen_mev_qtr = scen_qtr
dm._mev_loader._scen_mev_mth = scen_mth

# Force refresh to clear caches
dm.refresh()

print("Created scenario data:")
print("\nQuarterly scenarios:")
for set_name, scenarios in scen_qtr.items():
    print(f"\nScenario set: {set_name}")
    for scen_name, df in scenarios.items():
        print(f"- {scen_name} shape: {df.shape}, variables: {list(df.columns)}")

print("\nMonthly scenarios:")
for set_name, scenarios in scen_mth.items():
    print(f"\nScenario set: {set_name}")
    for scen_name, df in scenarios.items():
        print(f"- {scen_name} shape: {df.shape}, variables: {list(df.columns)}")

# Now test the combined scenario data
print("\nTesting combined scenarios...")
print("-" * 50)


Creating synthetic scenario data...
--------------------------------------------------
Created scenario data:

Quarterly scenarios:

Scenario set: EWST2024
- Base shape: (24, 3), variables: ['GDP', 'UNRATE', 'CPI']
- Adverse shape: (24, 3), variables: ['GDP', 'UNRATE', 'CPI']

Monthly scenarios:

Scenario set: EWST2024
- Base shape: (72, 3), variables: ['GDP', 'HOUSING', 'CPI']
- Adverse shape: (72, 3), variables: ['GDP', 'HOUSING', 'CPI']

Testing combined scenarios...
--------------------------------------------------


In [25]:
# Get combined scenario data
scenarios = dm.scen_mevs

# Test 1: Check dictionary structure
print("\n1. Testing dictionary structure:")
print(f"Number of scenario sets: {len(scenarios)}")
for scen_set, scen_dict in scenarios.items():
    print(f"\nScenario set: {scen_set}")
    print(f"Number of scenarios: {len(scen_dict)}")
    print(f"Available scenarios: {list(scen_dict.keys())}")

# Test 2: Check data combination for each scenario
print("\n2. Testing data combination:")
for scen_set, scen_dict in scenarios.items():
    print(f"\nScenario set: {scen_set}")
    for scen_name, scen_df in scen_dict.items():
        print(f"\nScenario: {scen_name}")
        
        # Check if we have both quarterly and monthly data
        qtr_vars = [col for col in scen_df.columns if col.endswith('_Q')]
        mth_vars = [col for col in scen_df.columns if col.endswith('_M')]
        base_vars = [col for col in scen_df.columns 
                    if not col.endswith(('_Q', '_M')) 
                    and col not in ['M', 'Q']]
        
        print("Variable counts:")
        print(f"- Base variables: {len(base_vars)}")
        print(f"- Quarterly-derived: {len(qtr_vars)}")
        print(f"- Monthly-derived: {len(mth_vars)}")
        
        # Check frequency indicators
        print("\nFrequency indicators:")
        print(f"- Month column (M): {'M' in scen_df.columns}")
        print(f"- Quarter column (Q): {'Q' in scen_df.columns}")
        
        # Verify data frequency matches internal data
        internal_freq = pd.infer_freq(dm.internal_data.index)
        scen_freq = pd.infer_freq(scen_df.index)
        print(f"\nFrequency check:")
        print(f"- Internal data frequency: {internal_freq}")
        print(f"- Scenario data frequency: {scen_freq}")
        
        # Sample of overlapping variables
        overlap_vars = set(qtr_vars).intersection({f"{v}_Q" for v in base_vars})
        if overlap_vars:
            print("\nSample of overlapping variable:")
            var = next(iter(overlap_vars))
            base_var = var[:-2]  # Remove _Q suffix
            print(f"Variable: {base_var}")
            print(scen_df[[base_var, var]].head())



1. Testing dictionary structure:
Number of scenario sets: 1

Scenario set: EWST2024
Number of scenarios: 2
Available scenarios: ['Adverse', 'Base']

2. Testing data combination:

Scenario set: EWST2024

Scenario: Adverse
Variable counts:
- Base variables: 4
- Quarterly-derived: 2
- Monthly-derived: 0

Frequency indicators:
- Month column (M): True
- Quarter column (Q): True

Frequency check:
- Internal data frequency: M
- Scenario data frequency: M

Sample of overlapping variable:
Variable: CPI
                   CPI       CPI_Q
2018-01-31  219.492299         NaN
2018-02-28  221.698095         NaN
2018-03-31  230.696906  234.243527
2018-04-30  217.746060  216.907167
2018-05-31  228.080042  207.848883

Scenario: Base
Variable counts:
- Base variables: 4
- Quarterly-derived: 2
- Monthly-derived: 0

Frequency indicators:
- Month column (M): True
- Quarter column (Q): True

Frequency check:
- Internal data frequency: M
- Scenario data frequency: M

Sample of overlapping variable:
Variable

In [26]:
# Test 3: Check specific variable handling
print("\n3. Testing specific variable handling:")
first_scen = scenarios['EWST2024']['Base']
print("\nVariable categories in Base scenario:")
print("a) Monthly-only variables:")
print(f"- HOUSING exists: {'HOUSING' in first_scen.columns}")

print("\nb) Overlapping variables (GDP, CPI):")
print("GDP handling:")
print(f"- Monthly GDP exists: {'GDP' in first_scen.columns}")
print(f"- Quarterly GDP (_Q) exists: {'GDP_Q' in first_scen.columns}")
print("\nCPI handling:")
print(f"- Monthly CPI exists: {'CPI' in first_scen.columns}")
print(f"- Quarterly CPI (_Q) exists: {'CPI_Q' in first_scen.columns}")

print("\nc) Quarterly-only variables:")
print(f"- UNRATE exists: {'UNRATE' in first_scen.columns}")

# Test 4: Verify caching
print("\n4. Testing caching:")
# Get scenarios again and verify it's using cache
import time
start = time.time()
scenarios2 = dm.scen_mevs
end = time.time()
print(f"Time to retrieve from cache: {(end-start)*1000:.2f}ms")

# Force refresh and measure time
dm.refresh()
start = time.time()
scenarios3 = dm.scen_mevs
end = time.time()
print(f"Time to recompute after refresh: {(end-start)*1000:.2f}ms")



3. Testing specific variable handling:

Variable categories in Base scenario:
a) Monthly-only variables:
- HOUSING exists: True

b) Overlapping variables (GDP, CPI):
GDP handling:
- Monthly GDP exists: True
- Quarterly GDP (_Q) exists: True

CPI handling:
- Monthly CPI exists: True
- Quarterly CPI (_Q) exists: True

c) Quarterly-only variables:
- UNRATE exists: True

4. Testing caching:
Time to retrieve from cache: 10.97ms
Time to recompute after refresh: 27.92ms


## Testing Scenario MEVs with Quarterly Internal Data

Now we'll test how scenario MEVs are handled when the internal data is quarterly frequency. This should:
1. Convert monthly data to quarterly averages
2. Add '_M' suffix to monthly-derived variables
3. Keep quarterly-only variables as-is
4. Maintain proper frequency alignment


In [27]:
# Create a DataManager with quarterly frequency
data_ldr_qtr = tc.PPNRInternalLoader(freq='Q')
data_ldr_qtr.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Create new DataManager with quarterly internal data
dm_qtr = tc.DataManager(data_ldr_qtr, dm._mev_loader)

# Get combined scenario data
print("Testing quarterly frequency scenario data...")
print("-" * 50)
scenarios_qtr = dm_qtr.scen_mevs

# Test 1: Check data structure and frequency
print("\n1. Testing data structure and frequency:")
for scen_set, scen_dict in scenarios_qtr.items():
    print(f"\nScenario set: {scen_set}")
    for scen_name, scen_df in scen_dict.items():
        print(f"\nScenario: {scen_name}")
        
        # Verify quarterly frequency
        freq = pd.infer_freq(scen_df.index)
        print(f"Data frequency: {freq}")
        
        # Check variable types
        qtr_vars = [col for col in scen_df.columns if col.endswith('_Q')]
        mth_vars = [col for col in scen_df.columns if col.endswith('_M')]
        base_vars = [col for col in scen_df.columns 
                    if not col.endswith(('_Q', '_M')) 
                    and col not in ['M', 'Q']]
        
        print("\nVariable counts:")
        print(f"- Base variables: {len(base_vars)}")
        print(f"- Monthly-derived (_M): {len(mth_vars)}")  # Should have _M suffix
        print(f"- Quarterly-derived (_Q): {len(qtr_vars)}")  # Should be 0
        
        print("\nVariable categories:")
        print("a) Quarterly variables (no suffix):")
        print([v for v in base_vars if v in scen_qtr[scen_set][scen_name].columns])
        print("\nb) Monthly-derived variables (_M):")
        print(mth_vars)

# Test 2: Verify quarterly averaging of monthly data
print("\n2. Testing quarterly averaging of monthly data:")
base_scen = scenarios_qtr['EWST2024']['Base']
base_scen_mth = scen_mth['EWST2024']['Base']

# Take a sample quarter and verify averaging
sample_quarter = base_scen.index[0]
print(f"\nChecking averaging for quarter {sample_quarter}:")

# Get the corresponding monthly data
quarter_start = pd.Timestamp(f"{sample_quarter.year}-{sample_quarter.quarter*3-2}-01")
quarter_end = pd.Timestamp(f"{sample_quarter.year}-{sample_quarter.quarter*3}-{31 if sample_quarter.quarter in [1,3] else 30}")
monthly_data = base_scen_mth.loc[quarter_start:quarter_end]

# Check GDP averaging
if 'GDP_M' in base_scen.columns:
    monthly_gdp = monthly_data['GDP']
    quarterly_avg = monthly_gdp.mean()
    actual_value = base_scen.loc[sample_quarter, 'GDP_M']
    
    print("\nGDP averaging check:")
    print(f"Monthly values: {monthly_gdp.values}")
    print(f"Calculated average: {quarterly_avg:.4f}")
    print(f"Value in quarterly data: {actual_value:.4f}")
    print(f"Averages match: {abs(quarterly_avg - actual_value) < 1e-10}")

# Test 3: Check specific variable handling
print("\n3. Testing specific variable handling:")
print("\na) Monthly-only variables:")
print(f"- HOUSING exists with _M suffix: {'HOUSING_M' in base_scen.columns}")
print(f"- HOUSING exists without suffix: {'HOUSING' in base_scen.columns}")

print("\nb) Overlapping variables:")
print("GDP handling:")
print(f"- Quarterly GDP exists: {'GDP' in base_scen.columns}")
print(f"- Monthly-derived GDP exists: {'GDP_M' in base_scen.columns}")

print("\nc) Quarterly-only variables:")
print(f"- UNRATE exists without suffix: {'UNRATE' in base_scen.columns}")
print(f"- UNRATE does not have _M suffix: {'UNRATE_M' not in base_scen.columns}")

# Test 4: Verify caching for quarterly data
print("\n4. Testing caching with quarterly data:")
import time

# First access (already done)
start = time.time()
scenarios_qtr2 = dm_qtr.scen_mevs
end = time.time()
print(f"Time to retrieve from cache: {(end-start)*1000:.2f}ms")

# Force refresh and measure recomputation
dm_qtr.refresh()
start = time.time()
scenarios_qtr3 = dm_qtr.scen_mevs
end = time.time()
print(f"Time to recompute after refresh: {(end-start)*1000:.2f}ms")


Testing quarterly frequency scenario data...
--------------------------------------------------

1. Testing data structure and frequency:

Scenario set: EWST2024

Scenario: Adverse
Data frequency: Q-DEC

Variable counts:
- Base variables: 4
- Monthly-derived (_M): 2
- Quarterly-derived (_Q): 0

Variable categories:
a) Quarterly variables (no suffix):
['GDP', 'UNRATE', 'CPI']

b) Monthly-derived variables (_M):
['CPI_M', 'GDP_M']

Scenario: Base
Data frequency: Q-DEC

Variable counts:
- Base variables: 4
- Monthly-derived (_M): 2
- Quarterly-derived (_Q): 0

Variable categories:
a) Quarterly variables (no suffix):
['GDP', 'UNRATE', 'CPI']

b) Monthly-derived variables (_M):
['CPI_M', 'GDP_M']

2. Testing quarterly averaging of monthly data:

Checking averaging for quarter 2018-03-31 00:00:00:

GDP averaging check:
Monthly values: [101.53049673 101.19942478  99.98648015]
Calculated average: 100.9055
Value in quarterly data: 100.9055
Averages match: True

3. Testing specific variable hand



#### Test 3: Cache Behavior and Refresh


In [None]:
# Test cache behavior and refresh functionality
print("Testing Cache and Refresh Behavior")
print("-" * 50)

# First access to trigger cache creation
print("1. Initial access:")
scen_mevs1 = dm_mth.scen_mevs
base_scen1 = scen_mevs1['EWST2024']['Base']
print(f"Shape: {base_scen1.shape}")
print(f"Columns: {sorted(base_scen1.columns)}")

# Second access should use cache
print("\n2. Second access (should use cache):")
scen_mevs2 = dm_mth.scen_mevs
print(f"Same object: {scen_mevs1 is scen_mevs2}")  # Should be True due to caching

# Modify scenario data
print("\n3. Modifying scenario data:")
new_gdp = np.random.normal(110, 10, len(dates_qtr))
mev_loader._scen_mev_qtr['EWST2024']['Base']['GDP'] = new_gdp

# Access without refresh (should still use cache)
print("\nAccess without refresh:")
scen_mevs3 = dm_mth.scen_mevs
print(f"Still using cache: {scen_mevs1 is scen_mevs3}")  # Should be True

# Refresh and access again
print("\n4. After refresh:")
dm_mth.refresh()
scen_mevs4 = dm_mth.scen_mevs
print(f"New object after refresh: {scen_mevs1 is not scen_mevs4}")  # Should be True

# Verify data was actually updated
base_scen4 = scen_mevs4['EWST2024']['Base']
if 'GDP_Q' in base_scen4.columns:
    print("\n5. Verifying data update:")
    print("Original GDP_Q mean:", base_scen1['GDP_Q'].mean())
    print("Updated GDP_Q mean:", base_scen4['GDP_Q'].mean())
    print("Values changed:", not (base_scen1['GDP_Q'] == base_scen4['GDP_Q']).all())


Testing Cache and Refresh Behavior
--------------------------------------------------
1. Initial access:
Shape: (72, 8)
Columns: ['CPI', 'CPI_Q', 'GDP', 'GDP_Q', 'HOUSING', 'M', 'Q', 'UNRATE']

2. Second access (should use cache):
Same object: True

3. Modifying scenario data:

Access without refresh:
Still using cache: False

4. After refresh:
New object after refresh: True

5. Verifying data update:
Original GDP_Q mean: 113.1997281294566
Updated GDP_Q mean: 113.30053535825363
Values changed: True


#### Test 4: MEV Map Updates for Derived Variables


In [None]:
# Test MEV map updates for derived variables
print("Testing MEV Map Updates for Derived Variables")
print("-" * 50)

# Get MEV maps from both monthly and quarterly DataManagers
mev_map_mth = dm_mth.mev_map
mev_map_qtr = dm_qtr.mev_map

print("1. Monthly Frequency MEV Map:")
print("\nOriginal variables:")
for var in ['GDP', 'CPI', 'UNRATE', 'HOUSING']:
    if var in mev_map_mth:
        print(f"- {var}: {mev_map_mth[var]['description']}")

print("\nDerived variables (with _Q suffix):")
for var in ['GDP_Q', 'CPI_Q']:
    if var in mev_map_mth:
        print(f"- {var}: {mev_map_mth[var]['description']}")

print("\n2. Quarterly Frequency MEV Map:")
print("\nOriginal variables:")
for var in ['GDP', 'CPI', 'UNRATE', 'HOUSING']:
    if var in mev_map_qtr:
        print(f"- {var}: {mev_map_qtr[var]['description']}")

print("\nDerived variables (with _M suffix):")
for var in ['GDP_M', 'CPI_M']:
    if var in mev_map_qtr:
        print(f"- {var}: {mev_map_qtr[var]['description']}")

# Verify type preservation
print("\n3. Verifying type preservation:")
print("\nMonthly frequency:")
for base_var, derived_var in [('GDP', 'GDP_Q'), ('CPI', 'CPI_Q')]:
    if base_var in mev_map_mth and derived_var in mev_map_mth:
        print(f"- {base_var} type: {mev_map_mth[base_var]['type']}")
        print(f"- {derived_var} type: {mev_map_mth[derived_var]['type']}")

print("\nQuarterly frequency:")
for base_var, derived_var in [('GDP', 'GDP_M'), ('CPI', 'CPI_M')]:
    if base_var in mev_map_qtr and derived_var in mev_map_qtr:
        print(f"- {base_var} type: {mev_map_qtr[base_var]['type']}")
        print(f"- {derived_var} type: {mev_map_qtr[derived_var]['type']}")

# Verify non-overlapping variables don't have derived entries
print("\n4. Verifying non-overlapping variables:")
print("Monthly frequency:")
print(f"- UNRATE_Q exists: {'UNRATE_Q' in mev_map_mth}")
print(f"- HOUSING_Q exists: {'HOUSING_Q' in mev_map_mth}")

print("\nQuarterly frequency:")
print(f"- UNRATE_M exists: {'UNRATE_M' in mev_map_qtr}")
print(f"- HOUSING_M exists: {'HOUSING_M' in mev_map_qtr}")


Testing MEV Map Updates for Derived Variables
--------------------------------------------------
1. Monthly Frequency MEV Map:

Original variables:
- GDP: Gross Domestic Product
- CPI: Consumer Price Index
- UNRATE: Unemployment Rate
- HOUSING: Housing Index

Derived variables (with _Q suffix):
- GDP_Q: Gross Domestic Product (Interpolated from quarterly)
- CPI_Q: Consumer Price Index (Interpolated from quarterly)

2. Quarterly Frequency MEV Map:

Original variables:
- GDP: Gross Domestic Product
- CPI: Consumer Price Index
- UNRATE: Unemployment Rate
- HOUSING: Housing Index

Derived variables (with _M suffix):
- GDP_M: Gross Domestic Product (Averaged from monthly)
- CPI_M: Consumer Price Index (Averaged from monthly)

3. Verifying type preservation:

Monthly frequency:
- GDP type: level
- GDP_Q type: level
- CPI type: level
- CPI_Q type: level

Quarterly frequency:
- GDP type: level
- GDP_M type: level
- CPI type: level
- CPI_M type: level

4. Verifying non-overlapping variables:
Mo

### Test Scenario Handling Features


In [28]:
import numpy as np

# Create synthetic scenario data
dates = pd.date_range(start='2023-01-01', end='2025-12-31', freq='M')

# Function to create scenario data with some randomness
def create_scenario_data(base_values, scenario_factor):
    data = {}
    for col, base in base_values.items():
        # Add some random variation and trend based on scenario
        noise = np.random.normal(0, 0.1, len(dates))
        trend = np.linspace(0, scenario_factor, len(dates))
        values = base * (1 + noise + trend)
        data[col] = values
    return pd.DataFrame(data, index=dates)

# Base values for variables (ensure some overlap with internal data)
base_values = {
    'Fixed_balance': 2000,
    'VR_balance': 1500,
    'New_var1': 100,  # New variable not in internal data
    'New_var2': 200   # New variable not in internal data
}

# Create two scenario sets
scenario_sets = {
    'EWST2024': {
        'Base': create_scenario_data(base_values, 0.1),
        'Adverse': create_scenario_data(base_values, -0.2),
        'Severe': create_scenario_data(base_values, -0.4)
    },
    'EWST2025': {
        'Base': create_scenario_data(base_values, 0.15),
        'Adverse': create_scenario_data(base_values, -0.25)
    }
}

# Print sample of the scenario data
print("Sample of EWST2024 Base scenario:")
print(scenario_sets['EWST2024']['Base'].head())


Sample of EWST2024 Base scenario:
            Fixed_balance   VR_balance    New_var1    New_var2
2023-01-31    2366.987883  1801.524490   97.289476  190.043089
2023-02-28    2022.591697  1335.185328  104.060320  200.097264
2023-03-31    2679.154819  1410.885384  106.057025  234.706074
2023-04-30    1957.075594  1378.650889  113.856633  169.025615
2023-05-31    2134.052184  1492.980297  110.574169  209.999727


#### Test 1: Loading Scenarios from Dictionary


In [29]:
# Create a new loader instance
scen_loader = tc.PPNRInternalLoader(freq='M')

# Load the base internal data first
scen_loader.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Test 1.1: Load full scenario sets dictionary
print("Test 1.1: Loading multiple scenario sets")
scen_loader.load_scens(source=scenario_sets)

# Verify the structure
print("\nScenario sets loaded:", list(scen_loader.scen_internal_data.keys()))
for set_name, scenarios in scen_loader.scen_internal_data.items():
    print(f"\nSet '{set_name}' scenarios:", list(scenarios.keys()))
    print(f"Columns in {set_name}/Base:", list(scenarios['Base'].columns))

# Test 1.2: Load single scenario set
print("\nTest 1.2: Loading single scenario set")
scen_loader.clean_scens()  # Clear previous scenarios
scen_loader.load_scens(
    source=scenario_sets['EWST2024'],
    set_name='EWST2024'
)

print("\nScenario sets after single set load:", list(scen_loader.scen_internal_data.keys()))
print("Scenarios in EWST2024:", list(scen_loader.scen_internal_data['EWST2024'].keys()))


Test 1.1: Loading multiple scenario sets

Scenario sets loaded: ['EWST2024', 'EWST2025']

Set 'EWST2024' scenarios: ['Base', 'Adverse', 'Severe']
Columns in EWST2024/Base: ['Fixed_balance', 'VR_balance', 'New_var1', 'New_var2']

Set 'EWST2025' scenarios: ['Base', 'Adverse']
Columns in EWST2025/Base: ['Fixed_balance', 'VR_balance', 'New_var1', 'New_var2']

Test 1.2: Loading single scenario set

Scenario sets after single set load: ['EWST2024']
Scenarios in EWST2024: ['Base', 'Adverse', 'Severe']


#### Test 2: Loading Scenarios from Excel and Validation Tests


In [30]:
# First save scenario data to Excel for testing
import os

# Create a temporary Excel file with scenarios
excel_path = 'test_scenarios.xlsx'
with pd.ExcelWriter(excel_path) as writer:
    # Save EWST2024 scenarios
    for scen_name, df in scenario_sets['EWST2024'].items():
        df.to_excel(writer, sheet_name=scen_name)

# Test 2.1: Load scenarios from Excel
print("Test 2.1: Loading scenarios from Excel")
scen_loader.clean_scens()  # Clear previous scenarios
scen_loader.load_scens(
    source=excel_path,
    set_name='EWST2024',
    date_col='Date'
)

print("\nScenarios loaded from Excel:", list(scen_loader.scen_internal_data['EWST2024'].keys()))

# Test 2.2: Validation of common columns
print("\nTest 2.2: Checking common columns")
common_cols = set(scen_loader.internal_data.columns) & set(scen_loader.scen_internal_data['EWST2024']['Base'].columns)
print("Common columns between internal data and scenarios:", common_cols)

# Test 2.3: Test column consistency validation
print("\nTest 2.3: Testing column consistency validation")
try:
    # Create inconsistent scenario data
    inconsistent_scenarios = {
        'Base': scenario_sets['EWST2024']['Base'],
        'Adverse': scenario_sets['EWST2024']['Adverse'].drop(columns=['New_var1'])  # Remove one column
    }
    scen_loader.load_scens(source=inconsistent_scenarios, set_name='Test')
    print("Warning: Inconsistent columns were accepted!")
except ValueError as e:
    print("Successfully caught inconsistent columns error:", str(e))

# Clean up
os.remove(excel_path)
print("\nTest file cleaned up")


Test 2.1: Loading scenarios from Excel


ValueError: scens mapping required when loading from Excel

### Test Scenario Handling Features


In [31]:
import numpy as np

# Create synthetic scenario data
dates = pd.date_range(start='2023-01-01', end='2025-12-31', freq='M')

# Function to create scenario data with some randomness
def create_scenario_data(base_values, scenario_factor):
    data = {}
    for col, base in base_values.items():
        # Add some random variation and trend based on scenario
        noise = np.random.normal(0, 0.1, len(dates))
        trend = np.linspace(0, scenario_factor, len(dates))
        values = base * (1 + noise + trend)
        data[col] = values
    return pd.DataFrame(data, index=dates)

# Base values for variables (ensure some overlap with internal data)
base_values = {
    'Fixed_balance': 2000,
    'VR_balance': 1500,
    'New_var1': 100,  # New variable not in internal data
    'New_var2': 200   # New variable not in internal data
}

# Create two scenario sets
scenario_sets = {
    'EWST2024': {
        'Base': create_scenario_data(base_values, 0.1),
        'Adverse': create_scenario_data(base_values, -0.2),
        'Severe': create_scenario_data(base_values, -0.4)
    },
    'EWST2025': {
        'Base': create_scenario_data(base_values, 0.15),
        'Adverse': create_scenario_data(base_values, -0.25)
    }
}

# Print sample of the scenario data
print("Sample of EWST2024 Base scenario:")
print(scenario_sets['EWST2024']['Base'].head())


Sample of EWST2024 Base scenario:
            Fixed_balance   VR_balance    New_var1    New_var2
2023-01-31    2047.924279  1660.643292  114.507924  195.406529
2023-02-28    1833.421146  1475.749255  105.604250  198.906914
2023-03-31    2057.460869  1419.838195  109.708775  194.978962
2023-04-30    2089.354231  1609.263986  110.625729  180.223019
2023-05-31    2020.098910  1654.033750  105.403965  181.899136


In [35]:
# First save scenario data to Excel for testing
import os

# Create a temporary Excel file with scenarios
excel_path = 'test_scenarios.xlsx'
with pd.ExcelWriter(excel_path) as writer:
    # Save EWST2024 scenarios
    for scen_name, df in scenario_sets['EWST2024'].items():
        df.to_excel(writer, sheet_name=scen_name, index_label='Date')

# Test 2.1: Load scenarios from Excel
print("Test 2.1: Loading scenarios from Excel")
scen_loader = tc.PPNRInternalLoader(freq='M')

# Load the base internal data first
scen_loader.load(
    source='fake_internal.xlsx',
    date_col='Date'
)

# Define mapping between Excel sheet names and scenario names
scenario_mapping = {
    'Base': 'Base',
    'Adverse': 'Adverse',
    'Severe': 'Severe'
}

scen_loader.load_scens(
    source=excel_path,
    scens=scenario_mapping,  # Specify the mapping between sheet names and scenario names
    set_name='EWST2024',
    date_col='Date'
)

print("\nScenarios loaded from Excel:", list(scen_loader.scen_internal_data['EWST2024'].keys()))

# Test 2.2: Validation of common columns
print("\nTest 2.2: Checking common columns")
common_cols = set(scen_loader.internal_data.columns) & set(scen_loader.scen_internal_data['EWST2024']['Base'].columns)
print("Common columns between internal data and scenarios:", common_cols)

# Test 2.3: Test column consistency validation
print("\nTest 2.3: Testing column consistency validation")
try:
    # Create inconsistent scenario data
    inconsistent_scenarios = {
        'Base': scenario_sets['EWST2024']['Base'],
        'Adverse': scenario_sets['EWST2024']['Adverse'].drop(columns=['New_var1'])  # Remove one column
    }
    scen_loader.load_scens(source=inconsistent_scenarios, set_name='Test')
    print("Warning: Inconsistent columns were accepted!")
except ValueError as e:
    print("Successfully caught inconsistent columns error:", str(e))

# Clean up
os.remove(excel_path)
print("\nTest file cleaned up")


Test 2.1: Loading scenarios from Excel

Scenarios loaded from Excel: ['Base', 'Adverse', 'Severe']

Test 2.2: Checking common columns
Common columns between internal data and scenarios: {'Fixed_balance', 'VR_balance'}

Test 2.3: Testing column consistency validation
Successfully caught inconsistent columns error: All scenarios in set 'Test' must have the same columns

Test file cleaned up


#### Summary of Scenario Handling Tests

The tests above verify the following functionality:

1. Scenario Data Loading
   - Loading multiple scenario sets from dictionary
   - Loading single scenario set from dictionary
   - Loading scenarios from Excel file
   
2. Data Structure Validation
   - Consistent columns across scenarios in a set
   - Common columns between internal data and scenarios
   - Proper error handling for inconsistent data

3. Data Management
   - Proper scenario set organization
   - Scenario cleaning functionality
   - Excel file handling and cleanup
