In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import gc

In [2]:
# Read the raw competition files.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Every column except the row identifier and the label is a model input.
features = train_df.columns[~train_df.columns.isin(['ID_code', 'target'])].tolist()

# Keep the label as a standalone Series and strip the identifier from both
# frames so that only feature columns remain.
target = train_df.pop('target')
train_df = train_df.drop(columns=['ID_code'])
test_df = test_df.drop(columns=['ID_code'])

In [3]:
# LightGBM training parameters, handed unchanged to lgb.train in the training
# cell. Parameter meanings are defined by the LightGBM "Parameters" page.
param = dict(
    # Row subsampling: re-draw a 33.5% bag every 5 iterations.
    bagging_freq=5,
    bagging_fraction=0.335,
    # Passed as the string 'false', exactly as LightGBM receives it.
    boost_from_average='false',
    boost='gbdt',
    # Column subsampling per tree.
    feature_fraction=0.041,
    learning_rate=0.0083,
    # Tree shape / regularisation.
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    # Execution settings.
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [4]:
# Tag each frame with its origin so the stacked frame can be split again
# after the count features are computed. The test frame also receives a
# placeholder `target` of -1; the train frame has no `target` column at this
# point, so its rows come out of the concat with a missing value there.
train_df = train_df.assign(i_am_train=1)
test_df = test_df.assign(target=-1, i_am_train=0)

# Stack train on top of test (row-wise).
full_df = pd.concat([train_df, test_df], axis=0)

In [10]:
# Frequency-encode a hand-picked set of variables: each new `<var>_counts`
# column holds how many rows of the stacked train+test frame share that
# row's value of `<var>`. The list order fixes the order of the new columns.
for source_col in ('var_20', 'var_155',
                   'var_198', 'var_191',
                   'var_177', 'var_88',
                   'var_116', 'var_4'):
    value_frequency = full_df[source_col].value_counts().to_dict()
    full_df[source_col + '_counts'] = full_df[source_col].map(value_frequency)

In [11]:
# For each pair of frequency-encoded variables, store the difference of the
# two counts (first minus second) as `var<a>_<b>_countdiff`.
for first_id, second_id in ((20, 155), (198, 191), (177, 88), (116, 4)):
    first_counts = full_df['var_{}_counts'.format(first_id)]
    second_counts = full_df['var_{}_counts'.format(second_id)]
    full_df['var{}_{}_countdiff'.format(first_id, second_id)] = first_counts.sub(second_counts)

In [12]:
# Undo the stacking: recover the train and test rows through the origin flag,
# then discard the helper columns. The test frame also loses its placeholder
# `target`; the train frame keeps whatever `target` column the concat produced.
train_df = full_df[full_df['i_am_train'] == 1].drop(columns=['i_am_train'])
test_df = full_df[full_df['i_am_train'] == 0].drop(columns=['i_am_train', 'target'])

# The stacked frame is no longer needed.
del full_df

In [13]:
# Cross-validation setup: stratified, shuffled, seeded with 42.
# (The original cell carried a note of an alternative seed: 44000.)
num_folds = 5
folds = StratifiedKFold(random_state=42, shuffle=True, n_splits=num_folds)

# One out-of-fold slot per training row and one prediction slot per test row.
oof, predictions = (np.zeros(frame.shape[0]) for frame in (train_df, test_df))

In [14]:
# Show the first five training rows, including the engineered count columns.
train_df.head(n=5)

Unnamed: 0,target,var_0,var_1,var_10,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_11,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_12,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_13,var_130,var_131,var_132,...,var_74,var_75,var_76,var_77,var_78,var_79,var_8,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_9,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_20_counts,var_155_counts,var_198_counts,var_191_counts,var_177_counts,var_88_counts,var_116_counts,var_4_counts,var20_155_countdiff,var198_191_countdiff,var177_88_countdiff,var116_4_countdiff
0,,8.9255,-6.7863,2.9252,9.4763,13.3102,26.5376,1.4403,14.71,6.0454,9.5426,17.1554,14.1104,24.3627,3.1821,2.0323,6.7602,3.9141,-0.4851,2.524,1.5093,2.5516,15.5752,-13.4221,7.2739,14.0137,16.0094,9.7268,0.8897,0.7754,4.2218,12.0039,13.8571,-0.7338,-1.9245,15.4462,0.5745,12.8287,0.3587,9.6508,...,43.1127,18.3816,-2.344,23.4104,6.5199,12.1983,-4.92,13.6468,13.8372,1.3675,2.9423,-4.5213,21.4669,9.3225,16.4597,7.9984,-1.7069,5.747,-21.4494,6.7806,11.0924,9.9913,14.8421,0.1812,8.9642,16.2572,2.1743,-3.4132,4,1,5,12,2,4,8,5,3,-7,-2,3
1,,11.5006,-4.1473,-0.4032,-13.695,8.4068,35.4734,1.7093,15.1866,2.6227,7.3412,32.0888,13.955,13.0858,8.0585,6.6203,7.1051,5.3523,8.5426,3.6159,4.1569,3.0454,7.8522,-11.51,7.5109,14.0239,31.5899,9.5018,8.2736,10.1633,0.1225,12.5942,14.5697,2.4354,0.8194,16.5346,8.4135,12.4205,-0.178,5.7582,...,7.7841,7.0529,3.2709,23.4822,5.5075,13.7814,3.1468,2.5462,18.1782,0.3683,-4.821,-5.485,13.7867,-13.5901,11.0993,7.9022,12.2301,8.0851,0.4768,6.8852,8.0905,10.9631,11.7569,-1.2722,24.7876,26.6881,1.8944,0.6939,7,3,10,6,10,8,5,3,4,4,2,2
2,,8.6093,-2.7457,-0.3249,-0.3939,12.6317,14.8863,1.3854,15.0284,3.9995,5.3683,8.6273,14.1963,20.3882,-11.2648,3.2304,5.7033,4.5255,2.1929,3.129,2.9044,1.1696,28.7632,-17.2738,2.1056,14.1929,21.1613,8.9573,2.7768,-2.1746,3.6932,12.4653,14.1978,-2.5511,-0.9479,17.1092,7.3124,11.5419,0.0975,8.8186,...,19.7312,19.4465,4.5048,23.2378,6.3191,12.8046,-4.9193,7.4729,15.7811,13.3529,10.1852,5.4604,19.0773,-4.4577,9.5413,11.9052,2.1447,5.9525,-22.4038,7.0883,14.1613,10.508,14.2621,0.2647,20.4031,17.036,1.6981,-0.0269,2,9,6,6,2,8,9,7,-7,0,-6,2
3,,11.0604,-2.1518,2.3061,-19.8592,22.5316,18.6129,1.3512,9.3291,4.2835,10.3907,7.0874,14.3256,14.4135,2.8102,4.2827,6.975,1.648,11.6896,2.5762,-2.5459,5.3446,38.1015,3.5732,5.0988,13.8463,30.5644,11.3025,3.9618,-8.2464,2.7038,12.3441,12.5431,-1.3683,3.5974,13.9761,11.9704,14.3003,1.0486,8.95,...,4.529,15.4235,11.6875,23.6273,4.0806,15.2733,-5.8609,0.7839,10.5404,1.6212,-5.2896,1.6027,17.9762,-2.3174,15.6298,4.5474,7.5509,8.245,-7.5866,7.0364,14.4027,10.7795,7.2887,-1.093,11.3596,18.1486,2.8344,1.948,4,1,2,8,5,3,1,8,3,-6,2,-7
4,,9.8369,-1.4834,-9.4458,-22.9264,12.3562,17.341,1.694,7.1179,5.1934,8.823,10.6617,14.0837,28.2749,-12.1419,-0.1937,5.9654,1.0719,7.9923,2.9138,-3.6135,1.4684,25.6795,13.8224,4.7478,13.8481,41.1037,12.714,5.2964,9.7289,3.937,12.1316,12.5815,7.0642,5.6518,10.9346,7.8895,11.4266,0.9442,7.7532,...,-4.5346,23.3521,1.0273,19.16,7.1734,14.3937,6.2654,2.9598,13.3317,-9.2587,-6.7075,7.8984,14.5265,7.0799,20.167,8.0053,3.7954,7.6784,-39.7997,7.0065,9.3627,10.4316,14.0553,0.0213,14.7246,35.2988,1.6844,0.6715,6,5,2,12,1,6,11,16,1,-10,-5,-5


In [15]:
# Rebuild the feature list now that the engineered columns exist: every
# column of train_df except the identifier and the label, in frame order.
features = train_df.columns[~train_df.columns.isin(['ID_code', 'target'])].tolist()

In [None]:
print('Training the Model:')

# Start from zeroed accumulators inside this cell. `predictions` is built up
# with `+=`, so without this reset a second run of the cell would add a new
# set of fold predictions on top of the previous run's values.
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold idx:{}".format(fold_ + 1))

    trn_fold = train_df.iloc[trn_idx]
    val_fold = train_df.iloc[val_idx]

    # Positive-class rows of this fold. The fold indices are positional, so
    # the mask is applied positionally too (no reliance on index labels), and
    # it is computed once per fold rather than once per augmentation copy.
    trn_positive = trn_fold.loc[(target.iloc[trn_idx] == 1).values]

    # Upsample the positive class: build 9 synthetic copies of the positive
    # rows in which every column is permuted independently of the others.
    # Seeding with the copy number keeps the permutations repeatable.
    to_appends = []
    for i in range(9):
        np.random.seed(i)
        to_appends.append(trn_positive.apply(np.random.permutation))  # Shuffle each column
    full_append = pd.concat(to_appends, axis=0)
    full_append['target'] = 1

    # Training set = real fold rows followed by the synthetic positives.
    trn_features = pd.concat([trn_fold[features], full_append[features]], axis=0)
    trn_labels = pd.concat([target.iloc[trn_idx], full_append['target']], axis=0)
    trn_data = lgb.Dataset(trn_features, label=trn_labels)
    # Validation uses only real, un-augmented rows.
    val_data = lgb.Dataset(val_fold[features], label=target.iloc[val_idx])

    # Very large round budget; training is expected to end via early stopping.
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 4000)

    # Out-of-fold predictions for the held-out rows, and this fold's equal
    # share of the averaged test prediction, both at the best iteration.
    oof[val_idx] = clf.predict(val_fold[features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Training the Model:
Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[100]	training's auc: 0.873249	valid_1's auc: 0.852793
[200]	training's auc: 0.886433	valid_1's auc: 0.870909
[300]	training's auc: 0.891173	valid_1's auc: 0.875708
[400]	training's auc: 0.893846	valid_1's auc: 0.879447
[500]	training's auc: 0.89516	valid_1's auc: 0.881403
[600]	training's auc: 0.896854	valid_1's auc: 0.884131
[700]	training's auc: 0.897875	valid_1's auc: 0.885244
[800]	training's auc: 0.898523	valid_1's auc: 0.886311
[900]	training's auc: 0.899364	valid_1's auc: 0.887019
[1000]	training's auc: 0.900005	valid_1's auc: 0.887602
[1100]	training's auc: 0.900411	valid_1's auc: 0.888006
[1200]	training's auc: 0.900716	valid_1's auc: 0.888292
[1300]	training's auc: 0.901111	valid_1's auc: 0.888773
[1400]	training's auc: 0.901529	valid_1's auc: 0.889186
[1500]	training's auc: 0.902111	valid_1's auc: 0.889716
[1600]	training's auc: 0.902491	valid_1's auc: 0.890124
[1700]	training's a

In [None]:
# Re-read only the columns listed below from each raw file; the ID_code
# column (dropped from the modelling frames) is needed for the output files.
train_df, test_df = (
    pd.read_csv(csv_path, usecols=wanted_cols)
    for csv_path, wanted_cols in (
        ('../input/train.csv', ['ID_code', 'target']),
        ('../input/test.csv', ['ID_code', 'var_0']),
    )
)

In [None]:
# Submission file: one row per test ID with the fold-averaged prediction.
sub = pd.DataFrame({"ID_code": test_df["ID_code"].values, "target": predictions})
sub.to_csv('submission_upsampled.csv', index=False)

In [None]:
# Out-of-fold file: one row per training ID with its held-out prediction.
oofs = pd.DataFrame({"ID_code": train_df["ID_code"].values, "target": oof})
oofs.to_csv('oof_upsampled.csv', index=False)