In [1]:
%%time
import pandas as pd

from target_encoding import TargetEncoderClassifier, TargetEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression

Wall time: 1.91 s


In [2]:
%%time
train=pd.read_csv("Xx_train.csv")
test=pd.read_csv("Xx_test.csv")
sample_submission = pd.read_csv('sample_submission.csv')

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)

Wall time: 1.81 s


In [3]:
# Show the column labels of the training frame.
train.columns

Index(['impression_id', 'user_id', 'app_code', 'os_version', 'is_4G', 'time',
       'item_id', 'device_type', 'session_id', 'item_price', 'category_1',
       'category_2', 'category_3', 'product_type', 'is_click'],
      dtype='object')

In [4]:
%%time
len_uniques = []
for c in train.columns.drop(['is_click']):
    le = LabelEncoder()
    le.fit(pd.concat([train[c], test[c]])) 
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])
    print(c, len(le.classes_))
    len_uniques.append(len(le.classes_))
    
X = train.drop(['impression_id','is_click'], axis=1)
y = train['is_click']

impression_id 328284
user_id 89157
app_code 522
os_version 3
is_4G 2
time 6
item_id 18523
device_type 3
session_id 43470
item_price 4933
category_1 15
category_2 75
category_3 314
product_type 3961
Wall time: 3.34 s


In [5]:
%%time
ALPHA = 75
MAX_UNIQUE = max(len_uniques)
FEATURES_COUNT = X.shape[1]

Wall time: 0 ns


In [6]:
%%time
'''
alpha: float or int, smoothing for generalization.

max_unique: int, maximum number of unique values in a feature. 
            If there are more unique values inside the feature,
            then the algorithm will split this feature into bins, 
            the number of max_unique.

used_features: int, this is a number of used features for prediction
               The algorithm encodes all features with the average value of the target, 
               then the std is considered inside each feature,
               and "used_features" features with the highest std are selected to use only informative features. 
'''

enc = TargetEncoderClassifier(alpha=ALPHA, max_unique=MAX_UNIQUE, used_features=FEATURES_COUNT)
score = cross_val_score(enc, X, y, scoring='roc_auc', cv=cv)
print(f'score: {score.mean():.4}, std: {score.std():.4}')

enc.fit(X, y)
pred_enc = enc.predict_proba(test.drop('impression_id', axis=1))[:,1]

score: 0.6788, std: 0.008688
Wall time: 7min 46s


In [7]:
%%time
'''
split: list of int or cross-validator class,
            if split is [], then algorithm will encode features without cross-validation
            This situation features will overfit on target

            if split len is 1 for example [5], algorithm will encode features by using cross-validation on 5 folds
            This situation you will not overfit on tests, but when you will validate, your score will overfit

            if split len is 2 for example [5, 3], algorithm will separate data on 5 folds, afterwords
            will encode features by using cross-validation on 3 folds
            This situation is the best way to avoid overfit, but algorithm will use small data for encode.
'''


enc = TargetEncoder(alpha=ALPHA, max_unique=MAX_UNIQUE, split=[cv])
X_train = enc.transform_train(X=X, y=y)
X_test = enc.transform_test(test.drop('impression_id', axis=1))

Wall time: 8min 45s


In [8]:
%%time
lin = LogisticRegression()
score = cross_val_score(lin, X_train, y, scoring='roc_auc', cv=cv)
print(f'score: {score.mean():.4}, std: {score.std():.4}')


lin.fit(X_train, y)
pred_lin = lin.predict_proba(X_test)[:,1]



score: 0.7324, std: 0.006211




Wall time: 18.6 s


In [9]:
%%time
sample_submission['is_click'] = pred_enc + pred_lin
sample_submission.to_csv('new_submission.csv', index=False)

Wall time: 699 ms


In [10]:
%%time
# Preview the first rows of the submission frame, including the `is_click`
# column assigned from the blended predictions.
sample_submission.head()

Wall time: 556 µs


Unnamed: 0,impression_id,is_click
0,a9e7126a585a69a32bc7414e9d0c0ada,0.089132
1,caac14a5bf2ba283db7708bb34855760,0.062905
2,13f10ba306a19ce7bec2f3cae507b698,0.11373
3,39c4b4dc0e9701b55a0a4f072008fb3f,0.064019
4,bf5a572cca75f5fc67f4b14e58b11d70,0.221912
