In [21]:
import gc
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [22]:
def read_data(nrows=None):
    train_df = pd.read_csv('train.csv', nrows=nrows)
    test_df = pd.read_csv('test.csv')
    return train_df, test_df

In [23]:
def process_data(train_df, test_df):
    idx = [c for c in train_df.columns if c not in ['ID_code','target']]
    for df in [test_df, train_df]:
        for feat in idx:
            df['r2_'+feat] = np.round(df[feat], 2)
            df['r2_'+feat] = np.round(df[feat], 2)
        df['sum'] = df[idx].sum(axis=1)
        df['min'] = df[idx].min(axis=1)
        df['max'] = df[idx].max(axis=1)
        df['mean'] = df[idx].mean(axis=1)
        df['std'] = df[idx].std(axis=1)
        df['skew'] = df[idx].std(axis=1)
        df['kurt'] = df[idx].kurtosis(axis=1)
        df['mes'] = df[idx].median(axis=1)
        variables = ['var_0', 'var_1', 'var_2', 'var_13', 'var_21','var_22','var_26','var_40','var_80','var_81','var_75','var_76','var_78','var_110','var_165']
        for i in variables:
            df[('-').join([i,i])] = df[i]*df[i]
    print('Train and test shape : ', train_df.shape, test_df.shape)
    return train_df, test_df

In [24]:
def run_model(train_df, test_df):
    features = [c for c in train_df.columns if c not in ['ID_code','target']]
    target = train_df['target']
    param = {
        'bagging_freq': 5,
        'bagging_fraction': 0.38,
        'boost_from_average':'false',
        'boost': 'gbdt',
        'feature_fraction': 0.045,
        'learning_rate': 0.0095,
        'max_depth': -1,  
        'metric':'auc',
        'min_data_in_leaf': 80,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 13,
        'num_threads': 4,
        'tree_learner': 'serial',
        'objective': 'binary', 
        'verbosity': 1
    }
    num_rounds = 1000000
    folds = StratifiedKFold(n_splits=12, shuffle=True, random_state=44000)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print('Fold : ', fold_)
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        clf = lgb.train(param, trn_data, num_rounds, valid_sets = [trn_data, val_data],
                       verbose_eval=1000, early_stopping_rounds = 3500)
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    print("CV score : ", roc_auc_score(target, oof))
    return predictions

In [25]:
def submit(test_df, predictions):
    sub = pd.DataFrame({'ID_code':test_df.ID_code.values})
    sub['target'] = predictions
    sub.to_csv('submission_lgb_v2.csv', index=False)

In [26]:
def main(nrows=None):
    train_df, test_df = read_data(nrows)
    train_df, test_df = process_data(train_df, test_df)
    predictions = run_model(train_df,test_df)
    submit(test_df, predictions)

In [27]:
if __name__ == "__main__":
    main()

Train and test shape :  (200000, 425) (200000, 424)
Fold :  0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.885657	valid_1's auc: 0.881037
[2000]	training's auc: 0.905821	valid_1's auc: 0.895637
[3000]	training's auc: 0.916992	valid_1's auc: 0.902211
[4000]	training's auc: 0.924246	valid_1's auc: 0.905493


KeyboardInterrupt: 