In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from sklearn.metrics import roc_auc_score

Inspired by the technique described here: https://stats.stackexchange.com/questions/83296/which-distribution-does-the-data-point-belong-to

We will first try this technique without any rounding. Then we will gradually increase rounding.

In [None]:
# Load the train and test tables from CSV files under the relative ../input directory.
tr = pd.read_csv('../input/train.csv')
te = pd.read_csv('../input/test.csv')

In [None]:
# 5-fold stratified splitter; shuffling with a fixed random_state keeps the
# fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Every column of the training frame except ID_code and the target label
# is treated as a feature.
excluded_cols = ['ID_code', 'target']
feats = [name for name in tr.columns if name not in excluded_cols]

In [None]:
# Round 0: round every feature column to zero decimal places, in place,
# on both the train and the test frame.
for frame in (tr, te):
    frame[feats] = frame[feats].round(0)

In [None]:
# Preview the first rows of the training frame (feature columns were rounded in the previous cell).
tr.head()

In [None]:
# Frequentist per-feature scorer evaluated with stratified K-fold CV.
#
# For each fold and each feature, count how often every (rounded) value occurs
# among negative and positive training rows, and score a row with
# pos_count / (neg_count + pos_count) for its value. Values never seen in the
# training fold fall back to the fold's positive rate. The per-feature scores
# are averaged over all features; test predictions are additionally averaged
# over folds.
#
# Produces:
#   oofs       -- pd.Series, one out-of-fold score per row of `tr`
#   full_preds -- pd.Series, one fold-averaged score per row of `te`


def _positive_rate(values, neg_counts, pos_counts, prior):
    """Return P(target == 1 | value) for each entry of `values` as an ndarray.

    values     -- pd.Series of feature values to score
    neg_counts -- value -> count among negative training rows (value_counts Series)
    pos_counts -- value -> count among positive training rows (value_counts Series)
    prior      -- score used when a value was seen in neither class (0/0 -> NaN)
    """
    neg = values.map(neg_counts).fillna(0)
    pos = values.map(pos_counts).fillna(0)
    return (pos / (neg + pos)).fillna(prior).to_numpy()


# Derive the averaging denominators from the actual configuration instead of
# hard-coding 200 features / 5 folds.
n_feats = len(feats)
n_folds = skf.get_n_splits()

oofs = pd.Series(np.zeros(len(tr)))
full_preds = pd.Series(np.zeros(len(te)))

for train_index, valid_index in skf.split(tr, tr['target']):
    X_tr = tr.iloc[train_index]
    X_va = tr.iloc[valid_index]

    is_neg = X_tr['target'] == 0
    is_pos = X_tr['target'] == 1

    # Positive rate of the training fold. This must be the mean of the boolean
    # mask: len() of the mask is just the row count and would always yield 1.0.
    base_distribution = is_pos.mean()

    # Accumulate positionally in plain arrays so no index alignment is involved.
    va_sum = np.zeros(len(X_va))
    te_sum = np.zeros(len(te))

    for feat in feats:
        negative_distribution = X_tr.loc[is_neg, feat].value_counts()
        positive_distribution = X_tr.loc[is_pos, feat].value_counts()

        va_sum += _positive_rate(X_va[feat], negative_distribution,
                                 positive_distribution, base_distribution)
        te_sum += _positive_rate(te[feat], negative_distribution,
                                 positive_distribution, base_distribution)

    # Average over features; validation rows are written exactly once per CV run.
    oofs.iloc[valid_index] = va_sum / n_feats
    fold_preds = pd.Series(te_sum / n_feats)

    full_preds = full_preds + fold_preds

    print('ROC AUC:', roc_auc_score(X_va['target'], oofs.iloc[valid_index]))

# Average the test predictions over folds.
full_preds = full_preds / n_folds
print('overall ROC AUC:', roc_auc_score(tr['target'], oofs))

In [None]:
# Persist the out-of-fold scores and the fold-averaged test scores as
# (ID_code, target) CSVs.
# NOTE: this overwrites tr['target'] (the true labels) with the OOF scores,
# so the cross-validation cell cannot be re-run afterwards without reloading.
for frame, scores, out_path in (
    (tr, oofs, 'frequentist_oof_round0.csv'),
    (te, full_preds, 'frequentist_sub_round0.csv'),
):
    frame['target'] = scores
    frame[['ID_code', 'target']].to_csv(out_path, index=False)