This notebook calculates utility scores using out-of-fold (OOF) predictions.

The public test set is expected to cover roughly one year, so each utility score is calculated on only 250 of the 500 days in the training data. Two options are considered for selecting those days:
1. Randomly chosen 250 days
2. Consecutive 250 days

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [None]:
# Out-of-fold predictions are produced by a separate notebook
# (training & inference are not shown here).
# Cross-validation scheme: GroupKFold grouped by date.
# Layout: 10 models * 5 target columns = 50 prediction columns.
oof_path = '../input/js-oof-predictions/preds'
oof = pd.read_feather(oof_path)

In [None]:
# Preview the loaded OOF frame; as the last expression in the cell it is shown via rich display.
oof

In [None]:
# Prediction columns are the ones whose name contains the substring 'pred'.
pred_cols = list(filter(lambda column_name: 'pred' in column_name, oof.columns))

In [None]:
def utility_score_bincount(date, weight, resp, action):
    """Compute the utility score for a set of trading decisions.

    The per-date profit is ``p_i = sum(weight * resp * action)`` over the rows
    belonging to date ``i``.  With

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / n_dates)

    the score is ``clip(t, 0, 6) * sum(p_i)``.

    Parameters
    ----------
    date : array-like of non-negative ints
        Date index of every row (``np.bincount`` requires non-negative ints).
    weight, resp, action : array-like
        Per-row values, same length as ``date``.

    Returns
    -------
    float
        The utility score.  Returns 0.0 when there are no rows or when every
        daily profit is zero (e.g. no trade is taken), where the unguarded
        formula would divide by zero.
    """
    date = np.asarray(date)
    if date.size == 0:
        # No rows at all: nothing was traded, so there is no utility.
        return 0.0

    count_i = len(np.unique(date))
    row_profit = np.asarray(weight) * np.asarray(resp) * np.asarray(action)
    Pi = np.bincount(date, row_profit)

    total_profit = np.sum(Pi)
    sum_sq_profit = np.sum(Pi ** 2)
    if sum_sq_profit == 0:
        # Every daily profit is zero: t would be 0/0 (NaN); the score is 0.
        return 0.0

    t = total_profit / np.sqrt(sum_sq_profit) * np.sqrt(250 / count_i)
    return np.clip(t, 0, 6) * total_profit

In [None]:
# utility scores for randomly chosen 250 days
# A seeded generator makes the sampled day sets (and therefore the histogram
# and the mean derived from them) reproducible under Restart & Run All.
rng = np.random.default_rng(42)

# Loop-invariant inputs, computed once instead of on every iteration.
all_dates = oof.date.values
all_weights = oof.weight.values
all_resps = oof.resp.values
# action = 1 where the mean over all prediction columns exceeds 0.5
all_actions = (oof[pred_cols].mean(1).values > 0.5).astype('int8')
unique_dates = oof.date.unique()

random_preds = []

for _ in tqdm(range(250)):

    # draw 250 distinct dates and keep only the rows that fall on them
    sampled_dates = rng.choice(unique_dates, size=250, replace=False)
    day_mask = np.isin(all_dates, sampled_dates)

    random_preds.append(utility_score_bincount(all_dates[day_mask],
                                               all_weights[day_mask],
                                               all_resps[day_mask],
                                               all_actions[day_mask]))

In [None]:
# Explicit figure/axes interface, with axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(random_preds, bins=50)
ax.set_title('OOF Utility Score Distribution - Random 250 Days')
ax.set_xlabel('Utility score')
ax.set_ylabel('Number of random 250-day samples');

In [None]:
# Average of the random-sample utility scores, truncated to an integer for display.
mean_random_score = int(np.mean(random_preds))
print('Mean OOF utility score (randomly chosen 250 days) = ', mean_random_score)

In [None]:
# utility scores for consecutive 250 days
# Loop-invariant inputs, computed once instead of on every iteration.
all_dates = oof.date.values
all_weights = oof.weight.values
all_resps = oof.resp.values
# action = 1 where the mean over all prediction columns exceeds 0.5
all_actions = (oof[pred_cols].mean(1).values > 0.5).astype('int8')

# Window starts are derived from the dates actually present, so the last full
# window (ending on the final date) is included. With dates 0..499 a fixed
# range(250) stops at start 249 and never evaluates the window 250..499.
first_start = int(all_dates.min())
# If the data spans fewer than 250 days, fall back to one window over all of it.
last_start = max(first_start, int(all_dates.max()) - 250 + 1)

conseq_preds = []

for start_date in tqdm(range(first_start, last_start + 1)):

    # rows whose date lies in the half-open window [start_date, start_date + 250)
    window_mask = (all_dates >= start_date) & (all_dates < start_date + 250)

    conseq_preds.append(utility_score_bincount(all_dates[window_mask],
                                               all_weights[window_mask],
                                               all_resps[window_mask],
                                               all_actions[window_mask]))

In [None]:
# Explicit figure/axes interface, with axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(conseq_preds, bins=50)
ax.set_title('OOF Utility Score Distribution - Consecutive 250 Days')
ax.set_xlabel('Utility score')
ax.set_ylabel('Number of consecutive 250-day windows');

In [None]:
# Average of the consecutive-window utility scores, truncated to an integer for display.
# Message fixed: "consecutive" was misspelled and "OOF" was missing, unlike the random-days message.
print('Mean OOF utility score (consecutive 250 days) = ', int(np.mean(conseq_preds)))