In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time
import gc
import xgboost as xgb
from category_encoders import OneHotEncoder
from sklearn.metrics import f1_score, cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from subprocess import check_output
print(check_output(['ls', 'ds_data']).decode('utf-8'))
SEED = 13

data_test.csv
data_train.csv
readme



In [92]:
np.random.seed(SEED)

In [93]:
train = pd.read_csv("ds_data/data_train.csv")
test = pd.read_csv("ds_data/data_test.csv")

### composition of majority and minority class of target label of the orginal training set

In [94]:
train.target.value_counts()/train.shape[0]

0    0.963564
1    0.036436
Name: target, dtype: float64

### Down sampling Majority class to balance  train set 

In [95]:
train_majority = train[train['target'] == 0]
train_minority = train[train['target'] == 1]

train_majority_downsampled = resample(train_majority, replace=False, n_samples=30000, random_state=13)
train_downsampled = pd.concat([train_majority_downsampled, train_minority])
del train_majority,train_minority, train_majority_downsampled
gc.collect()

2039

In [96]:
### composistion of 

In [97]:
train_downsampled.target.value_counts() /train_downsampled.shape[0]

0    0.580091
1    0.419909
Name: target, dtype: float64

In [98]:
y = train_downsampled.target.values
X = train_downsampled

del train, train_downsampled; gc.collect()

0

In [99]:
train_id = X.id.values
y = X.target.values
X.drop(['id', 'target'], inplace=True, axis=1)
#print("train {0} and target {1}".format(train.shape, target.shape))

test_id = test.id.values
test.drop(['id'], inplace=True, axis=1)
#print("test {0} and id {1}".format(test.shape, test_id.shape))

In [100]:
#filling null with -1
null_cols =  [c for c in X.columns if X[c].isnull().sum() != 0]
for col in null_cols:
    X[col] = X[col].fillna(-1.0)
    test[col]  = test[col].fillna(-1.0)

In [101]:
#one hot encodeing columns with less than 5 categories
cat_with_less_5_nunique = [c for c in X.columns if X[c].nunique() < 5]
encoder = OneHotEncoder(return_df=True, drop_invariant=True, cols=cat_with_less_5_nunique)

X = encoder.fit_transform(X)
test = encoder.fit_transform(test)
print("train {0} and test {1}".format(X.shape, test.shape))

train (51716, 87) and test (892816, 87)


In [102]:
params = {
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 4,
    'min_child_weight':10,
    'gamma':10,
    'subsample':0.8,
    'tree_method': 'hist',
    'colsample_bytree':0.7,
    'grow_ploicy':'lossguide',
    'objective':'binary:logistic',
    'eval_metric':'auc',   
    'silent': 1
}

In [103]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
X = X.values
test = test.values

dtest = xgb.DMatrix(data = test)

In [104]:
def f1(true, pred):
    pred = np.where(pred >= 0.5, 1, 0)
    return f1_score(true, pred)

def custom_f1(pred, dmatix):
    true = dmatix.get_label()
    pred = np.where(pred >= 0.5, 1, 0)
    return 'f1', f1_score(true, pred)

###  Baggaing rounds 4  and 5 fold cross validation

In [105]:
x_score = []
final_cv_train = np.zeros(len(y))
final_cv_pred = np.zeros(len(test_id))
for s in range(4):
    cv_train = np.zeros(len(y))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = SEED

    
    kf = kfold.split(X, y)

    best_trees = []
    fold_scores = []

    for i, (train_fold, valid) in enumerate(kf):
        X_tr, X_val, y_tr, y_val = X[train_fold, :], X[valid, :], y[train_fold], y[valid]
            
        dtrain = xgb.DMatrix(data = X_tr, label= y_tr)
        dvalid = xgb.DMatrix(data = X_val, label=y_val)        
        bst = xgb.train(xgb_params, dtrain=dtrain, num_boost_round=1000,
                        evals = [(dtrain, 'train'), (dvalid, 'valid')],  maximize = True, verbose_eval = 100,
                        early_stopping_rounds = 50)             
        
        cv_pred += bst.predict(dtest)
        cv_train[valid] += bst.predict(dvalid)

        score = f1(y_val, cv_train[valid])
        print("f1_score: {}".format(score))
        fold_scores.append(score)

    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    print("cv score:")
    print(f1(y, cv_train))
    print("current f1_score: {0}, bagging round: {1}".format(f1(y, final_cv_train / (s + 1.)), s+1))
    x_score.append(f1(y, cv_train))


[18:38:03] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.603749	valid-auc:0.592493
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[100]	train-auc:0.643596	valid-auc:0.623488
[200]	train-auc:0.650625	valid-auc:0.62985
[300]	train-auc:0.654512	valid-auc:0.632314
[400]	train-auc:0.656982	valid-auc:0.633545
[500]	train-auc:0.659046	valid-auc:0.634122
[600]	train-auc:0.660451	valid-auc:0.634607
[700]	train-auc:0.661799	valid-auc:0.63489
[800]	train-auc:0.663261	valid-auc:0.635279
Stopping. Best iteration:
[847]	train-auc:0.663899	valid-auc:0.63542

f1_score: 0.4090774250964106
[18:38:14] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.59896	valid-auc:0.601211
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved 

[100]	train-auc:0.640377	valid-auc:0.63973
[200]	train-auc:0.648008	valid-auc:0.644692
[300]	train-auc:0.65224	valid-auc:0.646226
[400]	train-auc:0.654581	valid-auc:0.646606
[500]	train-auc:0.656503	valid-auc:0.647103
Stopping. Best iteration:
[507]	train-auc:0.656616	valid-auc:0.647112

f1_score: 0.421209858103062
[18:39:56] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.598275	valid-auc:0.590127
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[100]	train-auc:0.641424	valid-auc:0.63264
[200]	train-auc:0.648584	valid-auc:0.637266
[300]	train-auc:0.652204	valid-auc:0.639603
[400]	train-auc:0.655032	valid-auc:0.640407
[500]	train-auc:0.657462	valid-auc:0.640706
[600]	train-auc:0.659236	valid-auc:0.641031
[700]	train-auc:0.660834	valid-auc:0.641174
[800]	train-auc:0.662628	valid-auc:0.641333
[900]	train-auc:0.663873	valid-auc:0.64149
[999]	

In [106]:
print(x_score)
test_pred = final_cv_pred / 4.
test_pred = np.where(test_pred > 0.5, 1, 0)
sub = pd.DataFrame({'id': test_id, 'target': test_pred})
sub.head(5)

[0.4204619292338591, 0.4204619292338591, 0.4204619292338591, 0.4204619292338591]


Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [107]:
sub.to_csv('submission/xgb_pred_avg.csv', index=False)