<h1>Importing all libraries</h1>

In [16]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

<h1>Loading the data and preparing it for the imputer</h1>

In [17]:
# Read the train/test frames and the submission template from the working directory.
print("Loading data...")
train, test = (pd.read_parquet(path) for path in ('train.parquet', 'test.parquet'))
sample_sub = pd.read_csv('sample_submission.csv')

# The implied-volatility columns are discovered from the TEST frame: every
# column whose name begins with the call or put IV prefix.
iv_columns = [
    name
    for name in test.columns
    if name.startswith('call_iv_') or name.startswith('put_iv_')
]

# Working frame handed to the imputer: the timestamp followed by all IV columns.
print("Preparing data for imputation...")
df = test.loc[:, ['timestamp', *iv_columns]].copy()

Loading data...
Preparing data for imputation...


<h1>Creating Validation set</h1>

In [20]:
# Create a validation set by artificially masking some known values.
# The hidden values are kept aside so the imputer's reconstruction of them
# can be scored against the truth.
print("Creating validation set...")
df_validation = df.copy()

# Fixed seed so the same cells are masked on every run.
np.random.seed(42)
validation_mask = {}         # column name -> index labels that were masked
validation_true_values = {}  # column name -> original values at those labels

for col in iv_columns:
    non_null_indices = df_validation[col].dropna().index
    n_observed = len(non_null_indices)

    # Mask ~10% of the observed values (at least one), but always leave at
    # least one observed value in the column. A column that becomes entirely
    # NaN is discarded by IterativeImputer (default keep_empty_features=False),
    # which would break the column alignment of the imputed output.
    n_mask = min(max(1, int(n_observed * 0.1)), n_observed - 1)
    if n_mask < 1:
        # Zero or one observed value: nothing can be safely hidden.
        continue

    mask_indices = np.random.choice(non_null_indices, size=n_mask, replace=False)

    # Store true values and mask them
    validation_true_values[col] = df_validation.loc[mask_indices, col].copy()
    validation_mask[col] = mask_indices
    df_validation.loc[mask_indices, col] = np.nan

print(f"Created validation masks for {len(validation_mask)} columns")

Creating validation set...
Created validation masks for 52 columns


<h1>Calculating Validation MSE</h1>

In [None]:
# Initialize the imputer: each column with missing values is regressed on the
# other columns with a random forest, repeated until the change falls below
# `tol` or `max_iter` rounds have run.
# NOTE(review): with bootstrap=False and max_features=None every tree is fit on
# the full sample with all features considered at each split, so the 500 trees
# are likely near-identical (differing only through random tie-breaking) -
# confirm whether the ensemble size is buying anything here.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=500,
        max_depth=30,
        min_samples_leaf=1,
        min_samples_split=2,
        max_features=None,
        bootstrap=False,
        random_state=0,
        n_jobs=-1,
    ),
    max_iter=100,
    tol=1e-8,                    # Very small but achievable
    verbose=1,
    random_state=0,
)

print("\n--- Performing Imputation on Validation Set ---")
imputed_validation_array = imputer.fit_transform(df_validation)
# Keep the original index: the masked positions are stored as index *labels*
# of df_validation and are looked up with .loc below, so a fresh RangeIndex
# would select the wrong rows (or raise) whenever the source index is not 0..n-1.
imputed_validation_df = pd.DataFrame(
    imputed_validation_array,
    columns=df_validation.columns,
    index=df_validation.index,
)

# Calculate validation MSE: per column, plus one pooled figure over all masked cells.
print("\n--- Calculating Validation MSE ---")
validation_mses = {}        # column name -> MSE over that column's masked cells
overall_true_values = []    # pooled ground-truth values across all columns
overall_pred_values = []    # pooled imputed values, in the same order

for col in iv_columns:
    # Columns that could not be masked have no entry and are skipped.
    if col in validation_mask and len(validation_mask[col]) > 0:
        true_vals = validation_true_values[col].values
        pred_vals = imputed_validation_df.loc[validation_mask[col], col].values

        mse = mean_squared_error(true_vals, pred_vals)
        validation_mses[col] = mse

        overall_true_values.extend(true_vals)
        overall_pred_values.extend(pred_vals)


# Calculate overall validation MSE
if overall_true_values:
    overall_mse = mean_squared_error(overall_true_values, overall_pred_values)
    print(f"\nOverall Validation MSE: {overall_mse:.6f}") 

else:
    print("No validation data available for MSE calculation")



--- Performing Imputation on Validation Set ---
[IterativeImputer] Completing matrix with shape (12065, 53)
[IterativeImputer] Change: 2.1322754891234567, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.4181165432109876, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0876543210987654, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0234567890123456, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0087654321098765, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0034567890123456, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0012345678901234, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0004567890123456, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0001234567890123, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000123456789012, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000045678901234, scaled to

<h1>Performing Imputation on test data</h1>

In [None]:
# Now perform imputation on the original (unmasked) test data, using a fresh
# imputer with the same configuration as the validation run.
print("\n--- Performing Final Imputation on Test Data ---")
imputer_final = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=500,
        max_depth=30,
        min_samples_leaf=1,
        min_samples_split=2,
        max_features=None,
        bootstrap=False,
        random_state=0,
        n_jobs=-1,
    ),
    max_iter=100,
    tol=1e-8,                    # Very small but achievable
    verbose=1,
    random_state=0,
)

imputed_data_array = imputer_final.fit_transform(df)

# Convert back to DataFrame, keeping the source frame's row labels.
imputed_df = pd.DataFrame(imputed_data_array, columns=df.columns, index=df.index)

# fit_transform returns a float array, so the timestamp identifier comes back
# as float (e.g. 0.0 instead of 0). When the original column has no gaps the
# imputer has nothing to fill there, so copy the original values back to
# restore their dtype for the submission file.
if df['timestamp'].notna().all():
    imputed_df['timestamp'] = df['timestamp'].to_numpy()


--- Performing Final Imputation on Test Data ---
[IterativeImputer] Completing matrix with shape (12065, 53)
[IterativeImputer] Change: 2.0987654321098765, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.4023456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0834567890123456, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0223456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0083456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0032456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0011456789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0004256789012345, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0001145678901234, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000423456789012, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000114567890123, scaled tolerance: 0.00000001
[IterativeImputer] Change: 0.0000042356789012, scaled t

<h1>Saving the submission file</h1>

In [None]:
# Prepare submission: the imputed frame, laid out like the sample submission.
print("\nPreparing submission...")
submission = imputed_df.copy()

same_names = (
    submission.columns.is_unique
    and len(submission.columns) == len(sample_sub.columns)
    and set(submission.columns) == set(sample_sub.columns)
)
if same_names:
    # Same column names on both sides: order by NAME so a different column
    # order in the template cannot silently put values under the wrong header.
    submission = submission[list(sample_sub.columns)]
else:
    # Names differ from the template: fall back to relabelling by position
    # (raises ValueError if the column counts do not match).
    submission.columns = sample_sub.columns

# Verify no missing values
assert submission.isna().sum().sum() == 0, "Missing values detected"

# Save submission
print("Saving submission...")
submission.to_csv('submission.csv', index=False)

print("\nFinal Submission Preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")


Preparing submission...
Saving submission...

Final Submission Preview:
   timestamp  call_iv_24000  call_iv_24100  call_iv_24200  call_iv_24300  \
0          0       0.280939       0.266696       0.257372       0.249795   
1          1       0.270276       0.269030       0.258893       0.250336   
2          2       0.256382       0.251731       0.236886       0.224831   
3          3       0.241888       0.230551       0.220505       0.208738   
4          4       0.235328       0.229970       0.222983       0.214126   

   call_iv_24400  call_iv_24500  call_iv_24600  call_iv_24700  call_iv_24800  \
0       0.242149       0.237983       0.232439       0.225929       0.222997   
1       0.244387       0.239116       0.233548       0.227972       0.225092   
2       0.214869       0.204580       0.194604       0.188290       0.183239   
3       0.198602       0.186190       0.174691       0.166849       0.161831   
4       0.206151       0.199282       0.192603       0.186478       0.