In [2]:
import pandas as pd
import numpy as np

# Read the training and test tables (in that order) from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Names of the target columns in train.csv:
# 'patient_overall' followed by 'C1' through 'C7'
labels = ['patient_overall', *(f'C{level}' for level in range(1, 8))]

# Laplace (add-one) smoothed positive rate per label:
#     (positives + 1) / (rows + 2)
# For 0/1-valued label columns this keeps every estimate strictly inside (0, 1).
n_rows = len(train)
priors = {name: (train[name].sum() + 1) / (n_rows + 2) for name in labels}

# Report each smoothed prior to four decimal places, in label order
for name, smoothed in priors.items():
    print(f'{name}: {smoothed:.4f}')

# Map to test: each test row receives the prior of its prediction_type
submission = test.copy()
submission['fractured'] = submission['prediction_type'].map(priors)

# Series.map yields NaN for any prediction_type that has no entry in priors,
# and clip() leaves NaN untouched, so a missing key would otherwise end up in
# the saved file unnoticed. Fail loudly instead.
unmapped = submission.loc[submission['fractured'].isna(), 'prediction_type']
if not unmapped.empty:
    missing_types = sorted(unmapped.astype(str).unique())
    raise ValueError(
        f'No prior available for prediction_type value(s): {missing_types}'
    )

# Clip probabilities away from exactly 0 and 1
submission['fractured'] = submission['fractured'].clip(1e-6, 1 - 1e-6)

# Save submission: only row_id and fractured are written to disk
submission[['row_id', 'fractured']].to_csv('submission.csv', index=False)
# The shape printed here is that of the full working frame (all test columns
# plus 'fractured'), not of the two-column file written above.
print('\nSubmission saved. Shape:', submission.shape)
print('\nSubmission head:')
print(submission.head())

# Simulate CV log loss on train (using priors as predictions)
from sklearn.metrics import log_loss

# Every row gets the same constant prediction per label, so repeat the prior
# vector into an (n_rows, n_labels) matrix instead of mapping row by row.
prior_row = np.array([priors[name] for name in labels], dtype=float)
train_preds = np.tile(prior_row, (len(train), 1))
train_true = train[labels].values

# Weighted log loss: patient_overall weight 2, others 1
# The weights are derived from the label names so they stay aligned with
# `labels` whatever its length or ordering.
# NOTE(review): confirm this weighting against the official metric definition.
weights = np.array([2.0 if name == 'patient_overall' else 1.0 for name in labels])

# labels=[0, 1] lets log_loss score a column that happens to contain a single
# class; without it scikit-learn raises ValueError for such a column.
per_label_ll = [
    log_loss(true_col, pred_col, labels=[0, 1])
    for true_col, pred_col in zip(train_true.T, train_preds.T)
]
weighted_ll = np.average(per_label_ll, weights=weights)
print(f'\nSimulated weighted log loss on train: {weighted_ll:.4f}')

# Note: This is not true CV, but gives baseline metric. The priors were
# estimated from these same training rows, so the score is in-sample.

patient_overall: 0.4657
C1: 0.1029
C2: 0.1471
C3: 0.0490
C4: 0.0588
C5: 0.1127
C6: 0.1716
C7: 0.1667

Submission saved. Shape: (14536, 4)

Submission head:
            StudyInstanceUID  prediction_type  \
0   1.2.826.0.1.3680043.6200  patient_overall   
1  1.2.826.0.1.3680043.27262  patient_overall   
2  1.2.826.0.1.3680043.12351  patient_overall   
3   1.2.826.0.1.3680043.1363  patient_overall   
4   1.2.826.0.1.3680043.4859  patient_overall   

                                      row_id  fractured  
0   1.2.826.0.1.3680043.6200_patient_overall   0.465686  
1  1.2.826.0.1.3680043.27262_patient_overall   0.465686  
2  1.2.826.0.1.3680043.12351_patient_overall   0.465686  
3   1.2.826.0.1.3680043.1363_patient_overall   0.465686  
4   1.2.826.0.1.3680043.4859_patient_overall   0.465686  

Simulated weighted log loss on train: 0.4170
