In [1]:
import os
import pandas as pd
from pathlib import Path

from sklearn.metrics import log_loss

# Root folder of the competition input data; the CSV files and the
# `submission_files` directory used below are all resolved relative to it.
path = Path('/kaggle/input/tabular-playground-series-nov-2022/')

### Read files and info

In [2]:
# Ground-truth labels and the submission template, both indexed by `id`.
labels = pd.read_csv(path / 'train_labels.csv', index_col='id')
submission = pd.read_csv(path / 'sample_submission.csv', index_col='id')

# Row ids reused by later cells: the labeled rows (for local scoring)
# and the rows requested by the sample submission (for the output file).
gt_ids, sub_ids = labels.index, submission.index

# Names of the entries in the submission folder, sorted ascending.
subs = os.listdir(path / 'submission_files')
subs.sort()

### Read in the first submission file (best scoring on labeled rows)

In [3]:
# Load the first file of the sorted listing, indexed by `id`.
s0 = pd.read_csv(
    path.joinpath('submission_files', subs[0]),
    index_col='id',
)

# Log loss is evaluated on the labeled rows only.
score = log_loss(y_true=labels, y_pred=s0.loc[gt_ids])

# The file name and the locally computed score are printed side by side;
# they show the same number.
print(subs[0], f'{score:.10f}')

0.6222863195.csv 0.6222863195


### Same for second submission file

In [4]:
# Load the second file of the sorted listing, indexed by `id`.
s1 = pd.read_csv(
    path.joinpath('submission_files', subs[1]),
    index_col='id',
)

# Log loss is evaluated on the labeled rows only.
score = log_loss(y_true=labels, y_pred=s1.loc[gt_ids])

# Print the file name next to the locally computed score.
print(subs[1], f'{score:.10f}')

0.6223807245.csv 0.6223807245


### Blending the two files

Blending `s0` and `s1` gives a local score of 0.60497, which is an improvement over both input files (0.622...).


In [5]:
# Element-wise mean of the two submissions (rows are aligned on the `id` index).
blend = s0.add(s1).div(2)

# Score the blend on the labeled rows, exactly as for the individual files.
score = log_loss(y_true=labels, y_pred=blend.loc[gt_ids])

print(f'blend score: {score:.10f}')

blend score: 0.6049715910


### How does the blend do on the Leaderboard?

If you submit the unlabeled rows of `s0` (e.g., 20,000 - 39,000) to the leaderboard, it scores 0.61863. Likewise, `s1` scores 0.62335. How does the blend do?

#### It scores 0.60454, which is an improvement!

In [6]:
# Write only the rows listed in the sample submission to the output file.
(
    blend
    .loc[sub_ids]
    .to_csv(path_or_buf='blend.csv')
)