<h1>Importing Libraries</h1>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.interpolate import interp1d, CubicSpline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

<h1>Loading Data</h1>

In [5]:
print("Loading data...")
# Load data
train = pd.read_parquet('../train.parquet')
test = pd.read_parquet('../test.parquet')
sample_sub = pd.read_csv('../sample_submission.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Available columns in train: {train.columns.tolist()}")

Loading data...
Train shape: (178340, 97)
Test shape: (12065, 96)
Available columns in train: ['timestamp', 'underlying', 'expiry', 'call_iv_23500', 'call_iv_23600', 'call_iv_23700', 'call_iv_23800', 'call_iv_23900', 'call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400', 'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900', 'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300', 'call_iv_25400', 'call_iv_25500', 'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900', 'call_iv_26000', 'put_iv_22500', 'put_iv_22600', 'put_iv_22700', 'put_iv_22800', 'put_iv_22900', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300', 'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700', 'put_iv_23800', 'put_iv_23900', 'put_iv_24000', 'put_iv_24100', 'put_iv_24200', 'put_iv_24300', 'put_iv_24400', 'put_iv_24500', 'put_iv_24600', 'put_iv_24700', 'put_iv_24800', 'put_iv_24900', 'put_iv_25000', 'X0', 'X1'

<h1>Precomputations</h1>

In [6]:
# Get all IV columns from TEST data
iv_columns = [col for col in test.columns if col.startswith(('call_iv_', 'put_iv_'))]
print(f"Found {len(iv_columns)} IV columns to predict")

# Get feature columns that exist in BOTH train and test
feature_columns = []
if 'underlying' in train.columns and 'underlying' in test.columns:
    feature_columns.append('underlying')

# Add X{0...41} features that exist in both datasets
x_features = [col for col in train.columns if col.startswith('X') and col[1:].isdigit() and col in test.columns]
feature_columns.extend(x_features)

# Note: expiry is only in train, not in test, so we can't use it as a feature for prediction
print(f"Found {len(feature_columns)} feature columns available in both datasets: {feature_columns[:10]}...")
if 'expiry' in train.columns and 'expiry' not in test.columns:
    print("Note: 'expiry' column found in train but not in test - will use for validation only")

# Create strike dictionary from TEST columns
strike_dict = {}
for col in iv_columns:
    strike = col.split('_')[-1]
    if strike not in strike_dict:
        strike_dict[strike] = {'call': None, 'put': None}
    
    if col.startswith('call_iv_'):
        strike_dict[strike]['call'] = col
    else:
        strike_dict[strike]['put'] = col

print(f"Found {len(strike_dict)} unique strikes")

# Extract numerical strike prices
def extract_strike_price(strike_str):
    try:
        return float(strike_str)
    except:
        return 0.0

# Create strike price mapping
strike_prices = {}
for strike_str in strike_dict.keys():
    strike_prices[strike_str] = extract_strike_price(strike_str)

# Sort strikes by price
sorted_strikes = sorted(strike_dict.keys(), key=lambda x: strike_prices[x])
print(f"Strike price range: {min(strike_prices.values()):.1f} - {max(strike_prices.values()):.1f}")

Found 52 IV columns to predict
Found 43 feature columns available in both datasets: ['underlying', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']...
Note: 'expiry' column found in train but not in test - will use for validation only
Found 36 unique strikes
Strike price range: 23000.0 - 26500.0


<h1>Prediction function</h1>

In [7]:
def advanced_iv_prediction(data, feature_cols=None, is_train=False):
    data = data.copy()
    print(f"\nProcessing {len(data)} rows...")
    
    # Ensure we only use features that exist in the current dataset
    if feature_cols:
        available_features = [col for col in feature_cols if col in data.columns]
        if len(available_features) != len(feature_cols):
            print(f"Note: Using {len(available_features)}/{len(feature_cols)} features available in this dataset")
        feature_cols = available_features
    
    # Phase 1: Enhanced put-call parity with confidence weighting
    print("Phase 1: Put-call parity enforcement...")
    parity_fixes = 0
    
    for strike, cols in tqdm(strike_dict.items(), desc="Processing strikes"):
        call_col = cols['call']
        put_col = cols['put']
        
        if call_col in data.columns and put_col in data.columns:
            # Calculate confidence based on available data
            for idx in data.index:
                call_val = data.at[idx, call_col]
                put_val = data.at[idx, put_col]
                
                if pd.isna(call_val) and not pd.isna(put_val):
                    data.at[idx, call_col] = put_val
                    parity_fixes += 1
                elif pd.isna(put_val) and not pd.isna(call_val):
                    data.at[idx, put_col] = call_val
                    parity_fixes += 1
                elif not pd.isna(call_val) and not pd.isna(put_val):
                    # Blend values for consistency
                    avg_val = (call_val + put_val) / 2
                    data.at[idx, call_col] = 0.8 * call_val + 0.2 * avg_val
                    data.at[idx, put_col] = 0.8 * put_val + 0.2 * avg_val
    
    print(f"Applied {parity_fixes} put-call parity fixes")
    
    # Phase 2: Advanced interpolation across strikes
    print("Phase 2: Cross-strike interpolation...")
    interpolation_fixes = 0
    
    for idx in tqdm(data.index, desc="Interpolating rows"):
        # Collect available IVs with their strikes
        available_data = []
        for col in iv_columns:
            if col in data.columns and not pd.isna(data.at[idx, col]):
                strike_price = strike_prices[col.split('_')[-1]]
                available_data.append((strike_price, data.at[idx, col]))
        
        if len(available_data) >= 3:
            # Sort by strike price
            available_data.sort(key=lambda x: x[0])
            strikes, ivs = zip(*available_data)
            
            # Use cubic spline for smoother interpolation
            try:
                cs = CubicSpline(strikes, ivs, bc_type='natural')
                
                # Fill missing values
                for col in iv_columns:
                    if col in data.columns and pd.isna(data.at[idx, col]):
                        target_strike = strike_prices[col.split('_')[-1]]
                        interpolated_iv = cs(target_strike)
                        
                        # Apply bounds checking
                        if 0.001 < interpolated_iv < 2.0:
                            data.at[idx, col] = interpolated_iv
                            interpolation_fixes += 1
                        else:
                            # Fallback to local averaging
                            nearby_ivs = [iv for strike, iv in available_data 
                                        if abs(strike - target_strike) <= 100]
                            if nearby_ivs:
                                data.at[idx, col] = np.mean(nearby_ivs)
                                interpolation_fixes += 1
                            else:
                                data.at[idx, col] = np.mean(ivs)
                                interpolation_fixes += 1
            except:
                # Fallback to linear interpolation
                f = interp1d(strikes, ivs, kind='linear', bounds_error=False, fill_value='extrapolate')
                for col in iv_columns:
                    if col in data.columns and pd.isna(data.at[idx, col]):
                        target_strike = strike_prices[col.split('_')[-1]]
                        interpolated_iv = f(target_strike)
                        if 0.001 < interpolated_iv < 2.0:
                            data.at[idx, col] = interpolated_iv
                            interpolation_fixes += 1
                        else:
                            data.at[idx, col] = np.mean(ivs)
                            interpolation_fixes += 1
        
        elif len(available_data) >= 1:
            # Use nearest neighbor approach
            strikes, ivs = zip(*available_data)
            row_mean = np.mean(ivs)
            
            for col in iv_columns:
                if col in data.columns and pd.isna(data.at[idx, col]):
                    data.at[idx, col] = row_mean
                    interpolation_fixes += 1
    
    print(f"Applied {interpolation_fixes} interpolation fixes")
    
    # Phase 3: ML-based refinement using market features
    if is_train and feature_cols and len(feature_cols) > 0:
        print("Phase 3: ML-based refinement...")
        
        # Prepare feature matrix
        feature_data = data[feature_cols].copy()
        
        # Handle categorical variables (expiry only exists in train)
        # No special handling needed since expiry is excluded from test features
        
        # Fill NaN values in features
        for col in feature_data.columns:
            if feature_data[col].dtype in ['float64', 'int64']:
                feature_data[col] = feature_data[col].fillna(feature_data[col].median())
            else:
                feature_data[col] = feature_data[col].fillna(0)
        
        # Scale features
        scaler = StandardScaler()
        feature_data_scaled = pd.DataFrame(
            scaler.fit_transform(feature_data),
            columns=feature_data.columns,
            index=feature_data.index
        )
        
        # Train individual models for each IV column
        ml_models = {}
        scalers = {}
        
        for col in tqdm(iv_columns, desc="Training ML models"):
            if col in data.columns:
                # Prepare training data
                valid_rows = ~data[col].isna()
                if valid_rows.sum() > 50:  # Need minimum samples
                    X = feature_data_scaled.loc[valid_rows]
                    y = data.loc[valid_rows, col]
                    
                    # Train Ridge regression (more stable than RF for this case)
                    model = Ridge(alpha=0.1)
                    model.fit(X, y)
                    ml_models[col] = model
                    scalers[col] = scaler
        
        print(f"Trained {len(ml_models)} ML models")
        
        # Apply ML corrections
        ml_corrections = 0
        for col, model in tqdm(ml_models.items(), desc="Applying ML corrections"):
            if col in data.columns:
                X = feature_data_scaled
                predictions = model.predict(X)
                
                # Apply corrections with confidence weighting
                # Use enumerate to get proper array indices
                for array_idx, idx in enumerate(data.index):
                    current_val = data.at[idx, col]
                    predicted_val = predictions[array_idx]  # Use array_idx instead of idx
                    
                    if 0.001 < predicted_val < 2.0:
                        # Weighted combination - more conservative
                        data.at[idx, col] = 0.9 * current_val + 0.1 * predicted_val
                        ml_corrections += 1
        
        print(f"Applied {ml_corrections} ML corrections")
    else:
        if not feature_cols or len(feature_cols) == 0:
            print("Phase 3: Skipped (no features available for ML)")
        else:
            print("Phase 3: Skipped (not training mode)")
    
    # Phase 4: Volatility smile consistency enforcement
    print("Phase 4: Volatility smile smoothing...")
    smoothing_corrections = 0
    
    for idx in tqdm(data.index, desc="Smoothing volatility smiles"):
        # Collect all IVs for this timestamp
        all_data = []
        
        for col in iv_columns:
            if col in data.columns and not pd.isna(data.at[idx, col]):
                strike_price = strike_prices[col.split('_')[-1]]
                all_data.append((strike_price, col, data.at[idx, col]))
        
        if len(all_data) >= 5:  # Need enough points for smoothing
            # Sort by strike price
            all_data.sort(key=lambda x: x[0])
            
            # Apply moving average smoothing
            window_size = min(3, len(all_data))
            
            for i, (strike, col, original_iv) in enumerate(all_data):
                start_idx = max(0, i - window_size // 2)
                end_idx = min(len(all_data), i + window_size // 2 + 1)
                
                nearby_ivs = [item[2] for item in all_data[start_idx:end_idx]]
                smoothed_iv = np.mean(nearby_ivs)
                
                # Light smoothing - very conservative
                new_iv = 0.95 * original_iv + 0.05 * smoothed_iv
                data.at[idx, col] = new_iv
                smoothing_corrections += 1
    
    print(f"Applied {smoothing_corrections} smoothing corrections")
    
    # Phase 5: Final put-call parity enforcement
    print("Phase 5: Final parity enforcement...")
    final_fixes = 0
    
    for strike, cols in tqdm(strike_dict.items(), desc="Final parity check"):
        call_col = cols['call']
        put_col = cols['put']
        
        if call_col in data.columns and put_col in data.columns:
            for idx in data.index:
                call_val = data.at[idx, call_col]
                put_val = data.at[idx, put_col]
                
                if not pd.isna(call_val) and not pd.isna(put_val):
                    # Final parity adjustment - very light
                    avg_val = (call_val + put_val) / 2
                    data.at[idx, call_col] = 0.99 * call_val + 0.01 * avg_val
                    data.at[idx, put_col] = 0.99 * put_val + 0.01 * avg_val
                    final_fixes += 1
    
    print(f"Applied {final_fixes} final parity adjustments")
    
    return data

<h1>Validation Score</h1>

In [8]:
# Create validation split - use simple random split since expiry might not be in test
print("\nCreating validation split...")
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

print(f"Training set: {len(train_df)} rows")
print(f"Validation set: {len(val_df)} rows")

# Apply to validation set
print("\n" + "="*50)
print("PROCESSING VALIDATION SET")
print("="*50)
val_pred = advanced_iv_prediction(val_df, feature_columns, is_train=True)

# Calculate MSE only on originally masked validation points
print("\nCalculating validation MSE...")
mse_vals = []
total_predictions = 0

for col in tqdm(iv_columns, desc="Calculating MSE per column"):
    if col in val_df.columns and col in val_pred.columns:
        # Focus only on points that were originally missing
        mask = val_df[col].isna() & val_pred[col].notna()
        if mask.any():
            se = (val_df.loc[mask, col] - val_pred.loc[mask, col]) ** 2
            mse_vals.append(se.mean())
            total_predictions += mask.sum()

validation_mse = np.mean(mse_vals) if mse_vals else 0
print(f"\nValidation Results:")
print(f"Total predictions made: {total_predictions}")
print(f"Validation MSE (masked points only): {validation_mse:.12f}")


Creating validation split...
Training set: 142672 rows
Validation set: 35668 rows

PROCESSING VALIDATION SET

Processing 35668 rows...
Phase 1: Put-call parity enforcement...


Processing strikes: 100%|██████████| 36/36 [00:30<00:00,  1.19it/s]


Applied 0 put-call parity fixes
Phase 2: Cross-strike interpolation...


Interpolating rows: 100%|██████████| 35668/35668 [00:24<00:00, 1431.64it/s]


Applied 0 interpolation fixes
Phase 3: ML-based refinement...


Training ML models: 100%|██████████| 52/52 [00:00<00:00, 56.58it/s]


Trained 42 ML models


Applying ML corrections: 100%|██████████| 42/42 [00:54<00:00,  1.30s/it]


Applied 1329979 ML corrections
Phase 4: Volatility smile smoothing...


Smoothing volatility smiles: 100%|██████████| 35668/35668 [01:29<00:00, 399.40it/s]


Applied 1498056 smoothing corrections
Phase 5: Final parity enforcement...


Final parity check: 100%|██████████| 36/36 [00:31<00:00,  1.13it/s]


Applied 392348 final parity adjustments

Calculating validation MSE...


Calculating MSE per column: 100%|██████████| 52/52 [00:00<00:00, 3985.31it/s]


Validation Results:
Total predictions made: 0
Validation MSE (masked points only): 0.000000000000





<h1>Test data predictions</h1>

In [9]:
# Apply to test set
print("\n" + "="*50)
print("PROCESSING TEST SET")
print("="*50)
test_pred = advanced_iv_prediction(test, feature_columns, is_train=False)


PROCESSING TEST SET

Processing 12065 rows...
Phase 1: Put-call parity enforcement...


Processing strikes: 100%|██████████| 36/36 [00:09<00:00,  3.74it/s]


Applied 92735 put-call parity fixes
Phase 2: Cross-strike interpolation...


Interpolating rows: 100%|██████████| 12065/12065 [00:55<00:00, 218.10it/s]


Applied 283769 interpolation fixes
Phase 3: Skipped (not training mode)
Phase 4: Volatility smile smoothing...


Smoothing volatility smiles: 100%|██████████| 12065/12065 [00:35<00:00, 337.16it/s]


Applied 627380 smoothing corrections
Phase 5: Final parity enforcement...


Final parity check: 100%|██████████| 36/36 [00:14<00:00,  2.49it/s]

Applied 193040 final parity adjustments





<h1>Submission</h1>

In [10]:
# Prepare submission
print("\nPreparing submission...")
submission = test_pred[['timestamp'] + iv_columns].copy()
submission.columns = sample_sub.columns

# Final data quality checks and corrections
print("Final quality checks...")
for col in tqdm(iv_columns, desc="Quality checks"):
    if col in submission.columns:
        # Ensure no extreme values
        submission[col] = submission[col].clip(0.001, 1.5)
        
        # Fill any remaining NaN values
        if submission[col].isna().any():
            print(f"Warning: {submission[col].isna().sum()} NaN values found in {col}, filling with median")
            submission[col] = submission[col].fillna(submission[col].median())

# Verify no missing values
missing_values = submission.isna().sum().sum()
assert missing_values == 0, f"Missing values detected: {missing_values}"

submission.to_csv('submission.csv', index=False)


Preparing submission...
Final quality checks...


Quality checks: 100%|██████████| 52/52 [00:00<00:00, 650.30it/s]


In [11]:
print("\n" + "="*50)
print("FINAL RESULTS")
print("="*50)
print(f"Submission shape: {submission.shape}")
print(f"Validation MSE: {validation_mse:.12f}")
print(f"Min IV value: {submission[iv_columns].min().min():.6f}")
print(f"Max IV value: {submission[iv_columns].max().max():.6f}")
print(f"Mean IV value: {submission[iv_columns].mean().mean():.6f}")

print("\nSubmission Preview:")
print(submission.head())

print(f"\nSubmission saved to 'submission.csv'")
print("Process completed successfully!")


FINAL RESULTS
Submission shape: (12065, 53)
Validation MSE: 0.000000000000
Min IV value: 0.154164
Max IV value: 0.639623
Mean IV value: 0.252602

Submission Preview:
   timestamp  call_iv_24000  call_iv_24100  call_iv_24200  call_iv_24300  \
0          0       0.281246       0.271402       0.261704       0.252007   
1          1       0.272782       0.273133       0.259125       0.252000   
2          2       0.262930       0.251744       0.238764       0.226914   
3          3       0.242124       0.231373       0.220681       0.210894   
4          4       0.235549       0.229293       0.223170       0.214283   

   call_iv_24400  call_iv_24500  call_iv_24600  call_iv_24700  call_iv_24800  \
0       0.242309       0.237369       0.232507       0.227748       0.223075   
1       0.244991       0.239305       0.233641       0.228298       0.229049   
2       0.215065       0.204750       0.194770       0.188160       0.184421   
3       0.198802       0.186395       0.176455       0.1