In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from pathlib import Path
from scipy.special import logit

# Set IS_LOCAL to True to read from the local data disk instead of the
# competition input directory that sits next to the notebook.
IS_LOCAL = False
PATH = Path(
    '/mnt/disks/data/santander-customer-transaction/'
    if IS_LOCAL
    else '../input/santander-customer-transaction-prediction'
)
    

In [3]:
%%time
# Read both CSV files from the configured data directory.
train_csv = PATH / 'train.csv'
test_csv = PATH / 'test.csv'
train_df = pd.read_csv(str(train_csv))
test_df = pd.read_csv(str(test_csv))

# Model inputs are the columns whose names start with "var".
features = [column for column in train_df.columns if column.startswith("var")]

CPU times: user 17 s, sys: 776 ms, total: 17.8 s
Wall time: 17.8 s


In [5]:
# For every feature, flag the test rows whose value occurs more than once in
# train and test combined. hist_df ends up with one boolean column per feature,
# indexed like test_df.
repeated_flags = {}
for var in features:
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    var_stats = pd.concat([train_df[var], test_df[var]]).value_counts()
    repeated_flags[var] = test_df[var].map(var_stats) > 1

# Build the frame in a single step: inserting the columns one at a time into an
# empty DataFrame fragments it and triggers pandas' PerformanceWarning.
hist_df = pd.DataFrame(repeated_flags)


In [8]:
# Per test row: how many features carry a value that occurs more than once.
hist_df.sum(axis="columns")

0         200
1         200
2         200
3         184
4         200
5         200
6         200
7         185
8         200
9         200
10        200
11        183
12        200
13        200
14        200
15        189
16        189
17        181
18        186
19        200
20        186
21        187
22        192
23        200
24        185
25        200
26        200
27        200
28        200
29        185
         ... 
199970    184
199971    200
199972    190
199973    190
199974    183
199975    200
199976    188
199977    200
199978    200
199979    200
199980    194
199981    200
199982    193
199983    186
199984    186
199985    179
199986    186
199987    200
199988    200
199989    200
199990    200
199991    200
199992    200
199993    187
199994    200
199995    192
199996    187
199997    200
199998    200
199999    187
Length: 200000, dtype: int64

In [9]:
# ind is True for the test rows in which at least one hist_df flag is False,
# i.e. the row sum falls short of the number of flag columns.
# hist_df.shape[1] replaces the hard-coded 200 so the check follows the
# actual feature count.
ind = hist_df.sum(axis=1) != hist_df.shape[1]

# Per-feature value counts over train plus the selected test rows.
# pd.concat replaces Series.append (removed in pandas 2.0), and .loc[ind, var]
# pulls a single column instead of filtering the whole frame once per feature.
var_stats = {
    var: pd.concat([train_df[var], test_df.loc[ind, var]]).value_counts()
    for var in features
}

# Accumulator for the summed per-feature log-odds built up by the modelling loop.
pred = 0


In [14]:
# Preview the two-column model input for the feature currently bound to `var`
# (left over from an earlier loop): raw value next to its count in var_stats.
np.column_stack([train_df[var].values, train_df[var].map(var_stats[var]).values])

array([[ -1.0914,   2.    ],
       [  1.9518,   2.    ],
       [  0.3965,   2.    ],
       ...,
       [  4.1995,   1.    ],
       [-13.9001,   1.    ],
       [  0.1385,   1.    ]])

1. **One question: I see your submission file contains a lot of large negative values like -440.xxxxx. Is this the final result you submitted with .922 AUC?**

Ans:
Yes. AUROC depends only on the ranking of the predictions, so their scale makes no difference. These values are log-odds summed over the 200 per-feature models, so you should be able to get back to probabilities by dividing them by 200 and applying the logistic function.

2. **Why did you use `pred += logit(model.predict_proba())` instead of `pred += log10(model.predict_proba())`?**

Ans:
That's a valid question. We actually used `+= log` for a long time (summing logs is the same as multiplying the probabilities). But `logit` first converts each probability to odds, $p / (1 - p)$, before taking the log, and that gives a slight boost over a plain log.

In [19]:
# LightGBM settings shared by every per-feature model; defined once instead of
# being rebuilt on each loop iteration.
LGB_PARAMS = {
    'learning_rate': 0.05,
    'max_bin': 165,
    'max_depth': 5,
    'min_child_samples': 150,
    'min_child_weight': 0.1,
    'min_split_gain': 0.0018,
    'n_estimators': 41,
    'num_leaves': 6,
    'reg_alpha': 2.0,
    'reg_lambda': 2.54,
    'objective': 'binary',
    'n_jobs': -1,
}


def _value_and_count(values, counts):
    """Return an (n, 2) array: the raw feature values next to their counts.

    values: pandas Series holding one feature column.
    counts: Series mapping value -> count (as produced by ``value_counts``).
    Values that are absent from ``counts`` map to NaN.
    """
    return np.column_stack([values.values, values.map(counts).values])


# Reset the accumulator in this cell so that re-running it does not stack a
# second round of log-odds on top of a previous run.
pred = np.zeros(len(test_df))

# One small model per feature, trained on (value, count) pairs only.
for var in features:
    model = lgb.LGBMClassifier(**LGB_PARAMS)
    model = model.fit(_value_and_count(train_df[var], var_stats[var]),
                      train_df["target"].values)
    positive_proba = model.predict_proba(
        _value_and_count(test_df[var], var_stats[var]))[:, 1]
    # logit converts the positive-class probability to log-odds; the per-feature
    # log-odds are summed to form the final score.
    pred += logit(positive_proba)

pd.DataFrame({"ID_code": test_df["ID_code"], "target": pred}).to_csv("submission.csv", index=False)