# In [None]:
import pandas as pd
import numpy as np

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error, expressed in percent.

    Samples whose true value is exactly zero are excluded, because the
    relative error is undefined for them.

    Args:
        y_true: Array-like of observed values (list, NumPy array,
            pandas Series, ...).
        y_pred: Array-like of predictions with the same shape as ``y_true``.
            Values are paired with ``y_true`` by position, not by index label.

    Returns:
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100`` over the
        samples with non-zero ``y_true``, or ``nan`` when there are none.
        NaN values in either input propagate to the result.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Work on plain float arrays so that lists are accepted and pandas index
    # labels cannot silently re-align (or fail to align) the two inputs.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, '
            f'got {actual.shape} and {predicted.shape}'
        )

    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to average over; avoid NumPy's "mean of empty slice" warning.
        return np.nan

    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.sqrt(np.mean(relative_error ** 2)) * 100

# Load the ground-truth test set and the distilled (student) model predictions.
test_data = pd.read_csv('test.csv')
predictions = pd.read_csv('submission.csv')

# Inner-join on ID so each prediction is paired with its true value. Both
# frames carry a 'salary_average' column, so the merge suffixes them:
# '_x' comes from test_data (ground truth), '_y' from the submission file.
merged = test_data.merge(predictions, on='ID')
y_true = merged['salary_average_x']
y_pred = merged['salary_average_y']

# Calculate test RMSPE for the student model
test_rmspe = rmspe(y_true, y_pred)

print(f'Test RMSPE (Distilled Linear): {test_rmspe:.4f}%')

# Also check teacher model. It is scored against the true values from its OWN
# merge: reusing y_true from the student merge would mis-pair rows whenever
# the two submission files differ in ID coverage or ordering.
teacher_pred = pd.read_csv('submission_teacher.csv')
merged_teacher = test_data.merge(teacher_pred, on='ID')
y_true_teacher = merged_teacher['salary_average_x']
y_pred_teacher = merged_teacher['salary_average_y']
teacher_test_rmspe = rmspe(y_true_teacher, y_pred_teacher)

print(f'Test RMSPE (Teacher GB): {teacher_test_rmspe:.4f}%')

# Check other models. Only a missing file is treated as "not found"; any other
# failure (bad columns, malformed CSV, ...) is a real error and must surface.
try:
    optimized_pred = pd.read_csv('submission_optimized.csv')
except FileNotFoundError:
    print('Optimized model predictions not found')
else:
    merged_opt = test_data.merge(optimized_pred, on='ID')
    y_true_opt = merged_opt['salary_average_x']
    y_pred_opt = merged_opt['salary_average_y']
    opt_test_rmspe = rmspe(y_true_opt, y_pred_opt)
    print(f'Test RMSPE (Optimized Linear): {opt_test_rmspe:.4f}%')

# The CV figures below are hard-coded in this script, not computed here.
print('\nCV vs Test Performance:')
print(f'Student (Distilled) - CV: 7.7466% → Test: {test_rmspe:.4f}%')
print(f'Teacher (GB) - CV: 1.6763% → Test: {teacher_test_rmspe:.4f}%')

# Summary statistics for the student-model merge.
print('\nTest Data Statistics:')
print(f'Number of test samples: {len(y_true)}')
print(f'True values - Min: {y_true.min():.2f}, Max: {y_true.max():.2f}, Mean: {y_true.mean():.2f}')
print(f'Predicted values - Min: {y_pred.min():.2f}, Max: {y_pred.max():.2f}, Mean: {y_pred.mean():.2f}')