In [3]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from util import gini_normalized, gini_lgbm, gini_xgb
from preprocessing import preproc

Import dataset and preprocess data

In [4]:
# Load the raw train and test CSV files into DataFrames.
dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')

# Run both datasets through the same preprocessing helper (oneHot=False),
# once in 'train' mode and once in 'test' mode.
# NOTE(review): y_test is not referenced by any cell visible in this notebook.
X_train, y_train = preproc(dataset_train, mode='train', oneHot=False)
X_test, y_test = preproc(dataset_test, mode='test', oneHot=False)

replacing missing values
number of examples: 595212
replacing missing values
number of examples: 892816


In [5]:
# LightGBM training parameters (handed to lgb.train in the training cell).
# NOTE(review): 'gini' is presumably not a built-in LightGBM metric name; the
# gini values in the training log appear to come from the custom `feval`
# passed to lgb.train -- confirm the installed version ignores this entry.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'gini'},
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [6]:
# K-fold initialisation: shuffled splitter with a fixed seed so the folds
# are the same on every run.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

# Per-sample weights: 1 where the target is 0, 30 where the target is 1,
# and 0 for any other value.
# NOTE(review): `weights` is not passed to anything in the cells shown here.
weights = np.where(y_train == 1, 30.0, np.where(y_train == 0, 1.0, 0.0))

Training and prediction

In [13]:
# Train one LightGBM model per fold and collect its predictions on the test
# set. After the loop, `i` holds the number of folds processed and `results`
# holds one prediction array per fold.
results = []
for i, (train_index, test_index) in enumerate(kf.split(X_train), start=1):
    # Fit on the training part of the split; the held-out part is only used
    # for monitoring and early stopping.
    lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index])
    lgb_eval = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train)
    gbm = lgb.train(params,
        train_set=lgb_train,
        num_boost_round=200,
        valid_sets=lgb_eval,
        early_stopping_rounds=50,
        verbose_eval=5,
        feval=gini_lgbm)
    # Predict with the best iteration found by early stopping. Passing it
    # explicitly keeps the extra rounds trained after the best score out of
    # the prediction even on LightGBM versions whose predict() defaults to
    # using every tree.
    res = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    results.append(res)

Training until validation scores don't improve for 50 rounds.
[5]	valid_0's gini: 0.241741
[10]	valid_0's gini: 0.241278
[15]	valid_0's gini: 0.245037
[20]	valid_0's gini: 0.247852
[25]	valid_0's gini: 0.250213
[30]	valid_0's gini: 0.251468
[35]	valid_0's gini: 0.253059
[40]	valid_0's gini: 0.253318
[45]	valid_0's gini: 0.253309
[50]	valid_0's gini: 0.254783
[55]	valid_0's gini: 0.257294
[60]	valid_0's gini: 0.260316
[65]	valid_0's gini: 0.261908
[70]	valid_0's gini: 0.264196
[75]	valid_0's gini: 0.265746
[80]	valid_0's gini: 0.267739
[85]	valid_0's gini: 0.269143
[90]	valid_0's gini: 0.270798
[95]	valid_0's gini: 0.272529
[100]	valid_0's gini: 0.273946
[105]	valid_0's gini: 0.275417
[110]	valid_0's gini: 0.275999
[115]	valid_0's gini: 0.277372
[120]	valid_0's gini: 0.278649
[125]	valid_0's gini: 0.278716
[130]	valid_0's gini: 0.278665
[135]	valid_0's gini: 0.27926
[140]	valid_0's gini: 0.280118
[145]	valid_0's gini: 0.280218
[150]	valid_0's gini: 0.280845
[155]	valid_0's gini: 0.28158

In [8]:
def to_csv(y_pred, ids, filename='sumbission_5Kfold_lgbm.csv'):
    """Write predictions to a CSV file with the header row ``id,target``.

    One row is written per element of ``y_pred``, pairing ``ids[k]`` with
    ``y_pred[k]``.

    Parameters
    ----------
    y_pred : sequence
        Predicted values; must support ``len()`` and integer indexing.
    ids : sequence
        Identifiers, indexed in parallel with ``y_pred``. It must have at
        least ``len(y_pred)`` elements, otherwise indexing raises.
    filename : str, optional
        Output path. The default keeps the file name this notebook has
        always written (including its original spelling), so existing
        callers are unaffected.
    """
    import csv
    # newline='' is what the csv module requires for files handed to
    # csv.writer; without it, platforms that translate line endings emit an
    # extra blank line after every row.
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['id', 'target'])
        writer.writerows((ids[k], y_pred[k]) for k in range(len(y_pred)))

In [14]:
# Average the per-fold test predictions element-wise. Using the mean over
# `results` (instead of spelling out five terms and dividing by 5) keeps this
# cell correct when the number of folds changes.
submission = np.mean(results, axis=0)
# The first column of the test DataFrame supplies the values written to the
# `id` column of the submission file.
idx = dataset_test.iloc[:, 0].values
to_csv(submission, idx)

Gini score for the submission: 0.27731