In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

#### Importing Data

In [3]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

#### Smoothing
Smoothing is computed as described in the following paper by Daniele Micci-Barreca:
https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    
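- trn_series : training categorical feature as a pd.Series
- val_series : validation categorical feature as a pd.Series
- tst_series : test categorical feature as a pd.Series
- target : target data as a pd.Series
- min_samples_leaf (int) : minimum number of samples a category needs before its own average is taken into account
- smoothing (int) : smoothing effect to balance the categorical average against the prior
- noise_level (float) : scale of the multiplicative Gaussian noise applied to the encoded values (0 disables it)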

In [4]:
# Training configuration for the XGBoost cross-validation run.
MAX_ROUNDS = 500             # passed to the classifier as n_estimators
OPTIMIZE_ROUNDS = False      # True: fit with an eval set and early stopping
LEARNING_RATE = 0.1          # passed to the classifier as learning_rate
EARLY_STOPPING_ROUNDS = 50   # early-stopping patience, used only when OPTIMIZE_ROUNDS is True
# Backward-compatible alias for the original (misspelled) name, which other
# cells of the notebook still reference.
EARLY_STOPPIN_ROUNDS = EARLY_STOPPING_ROUNDS

In [5]:
def add_noise(series, noise_level):
    """Scale every value of ``series`` by ``1 + noise_level * z``, with ``z`` ~ N(0, 1).

    One standard-normal sample per element is drawn from NumPy's global
    random state, so results are reproducible only after ``np.random.seed``.
    With ``noise_level == 0`` every factor is exactly 1.
    """
    gaussian = np.random.randn(len(series))
    factor = 1 + noise_level * gaussian
    return series * factor

In [6]:
def _apply_target_encoding(series, encoding, prior, name):
    """Look up each category of ``series`` in ``encoding``; unseen categories get ``prior``."""
    # Series.map keeps the index of ``series``, so no index restoration is needed.
    return series.map(encoding).fillna(prior).rename(name)


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed target (mean) encoding of a single categorical feature.

    For every category found in ``trn_series`` the encoded value is a blend of
    the category's target mean and the global target mean (the prior)::

        weight  = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoded = prior * (1 - weight) + category_mean * weight

    so the larger the category count, the less the prior contributes.

    Parameters
    ----------
    trn_series : pd.Series
        Training categorical feature; the statistics are computed from it.
    val_series : pd.Series or None
        Validation categorical feature, encoded with the training statistics.
    tst_series : pd.Series or None
        Test categorical feature, encoded with the training statistics.
    target : pd.Series
        Target values, same length as ``trn_series``.
    min_samples_leaf : int
        Category count at which the category mean and the prior weigh equally.
    smoothing : int
        Controls how fast the weight moves from the prior to the category mean;
        it is used as a divisor.
    noise_level : float
        Scale of the multiplicative Gaussian noise applied by ``add_noise``.

    Returns
    -------
    tuple
        ``(train, validation, test)`` encoded series, each named
        ``trn_series.name + '_mean'`` and indexed like its input. An entry is
        ``None`` when the corresponding input series was not supplied.
    """
    assert trn_series is not None and target is not None, \
        "trn_series and target are required"
    assert len(trn_series) == len(target)
    for other in (val_series, tst_series):
        # Every supplied series must describe the same feature as trn_series.
        assert other is None or other.name == trn_series.name

    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of training rows
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight of the category mean (kept separate from the `smoothing` parameter)
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, also used for categories never seen in training
    prior = target.mean()
    # The bigger the count, the less the prior is taken into account
    encoding = prior * (1 - weight) + stats["mean"] * weight

    encoded_name = trn_series.name + '_mean'
    encoded = []
    # Order (train, validation, test) is kept so add_noise consumes the global
    # random stream in the same order as before.
    for series in (trn_series, val_series, tst_series):
        if series is None:
            encoded.append(None)
            continue
        values = _apply_target_encoding(series, encoding, prior, encoded_name)
        encoded.append(add_noise(values, noise_level))
    return tuple(encoded)

#### Pre-processing the data

In [7]:
# Replace every missing value in both tables with the sentinel -999.
train = train.fillna(-999)
test = test.fillna(-999)

In [8]:
# Feature columns: every training column except the label and the row identifier.
cols = train.columns.tolist()
for excluded in ('target', 'transaction_id'):
    cols.remove(excluded)

In [9]:
# Label Encoding
from tqdm import tqdm

for col in tqdm(cols):
    # Only object-typed columns need integer codes.
    if train[col].dtype != 'object':
        continue

    # Cast everything to str so mixed values (e.g. a numeric fill value next
    # to text) become uniform labels.
    train[col] = train[col].apply(str)
    test[col] = test[col].apply(str)

    # Fit on the labels of both tables so transform never meets an unseen label.
    le = LabelEncoder()
    le.fit(list(train[col].unique()) + list(test[col].unique()))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

100%|██████████| 49/49 [00:49<00:00,  1.01s/it]


In [10]:
# Label vector plus the row identifiers of both tables.
y = train['target']
id_train = train['transaction_id'].values
id_test = test['transaction_id'].values

In [11]:
# Restrict both tables to the same ordered list of feature columns.
X = train.loc[:, cols]
test_df = test.loc[:, cols]

In [12]:
# Features to target-encode: columns whose name contains the literal "cat_".
f_cats = X.columns[X.columns.str.contains("cat_", regex=False)].tolist()

In [13]:
# Out-of-fold predictions, one slot per training row. Allocated as float
# because predicted probabilities are written into it; `0*y` would inherit the
# dtype of `y` (integer for a 0/1 label) and rely on a silent upcast that
# recent pandas versions deprecate.
y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
# Running sum of the per-fold test predictions (becomes an array on first +=).
y_test_pred = 0

#### Cross-Validation

In [14]:
# Set up folds
K = 5  # number of cross-validation folds
kf = KFold(n_splits=K, shuffle=True, random_state=47)
# Seed NumPy's global generator, which add_noise draws from.
np.random.seed(0)

In [15]:
# Set up classifier
from sklearn.metrics import roc_auc_score

# Hyper-parameters collected in one mapping so they can be inspected or logged.
xgb_params = dict(
    n_estimators=MAX_ROUNDS,
    max_depth=20,
    objective='binary:logistic',
    learning_rate=LEARNING_RATE,
    subsample=0.8,
    min_child_weight=4,
    colsample_bytree=0.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)
model = XGBClassifier(**xgb_params)

In [16]:
# Run CV
#
# For each of the K folds: target-encode the categorical features using the
# fold's training rows only, fit the classifier, store the out-of-fold
# predictions and add the test-set predictions to a running sum.

# Start from fresh accumulators so that re-running this cell does not add to
# the (already averaged) results of a previous run. Float dtype because
# predicted probabilities are written into the series.
y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
y_test_pred = 0

for i, (train_index, test_index) in enumerate(kf.split(train)):

    # Create data for this fold (copies, because encoded columns are added below)
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
    X_test = test_df.copy()

    print("\nFold ", i)

    # Encode data: the statistics come from y_train only, so the validation
    # and test rows are encoded without using their own labels.
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
            trn_series=X_train[f],
            val_series=X_valid[f],
            tst_series=X_test[f],
            target=y_train,
            min_samples_leaf=200,
            smoothing=10,
            noise_level=0)

    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() and
        # reading best_ntree_limit depend on the installed xgboost version -
        # confirm against the version in use before enabling this branch.
        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        fit_model = model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', early_stopping_rounds=EARLY_STOPPIN_ROUNDS, verbose=True)
        print("Best N trees = ", model.best_ntree_limit)
        print("Best Auc = ", model.best_score)
    else:
        fit_model = model.fit(X_train, y_train)

    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print('Auc = ', roc_auc_score(y_valid, pred))
    y_valid_pred.iloc[test_index] = pred

    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]

    # Free the per-fold frames before the next iteration
    del X_test, X_train, X_valid, y_train

y_test_pred /= K  # Average test set predictions
print('\nAuc for full training set:', roc_auc_score(y, y_valid_pred))



Fold  0
Auc =  0.732277401804

Fold  1
Auc =  0.725302373161

Fold  2
Auc =  0.725513984425

Fold  3
Auc =  0.725529218529

Fold  4
Auc =  0.731359485701

Auc for full training set: 0.727915414554


1. AUC for full training set: 0.726012541651 (max_depth = 8, MAX_ROUNDS = 400, min_child_weight = 6)
2. AUC for full training set: 0.727038362868 (max_depth = 12, MAX_ROUNDS = 300, min_child_weight = 4)

#### Creating submission file

In [17]:
# Submission table: one row per test transaction with its averaged prediction.
subm = pd.DataFrame({
    'transaction_id': id_test,
    'target': y_test_pred,
})

In [18]:
# Write the submission to disk without the DataFrame index column.
subm.to_csv(path_or_buf='submit.csv', index=False)