In [2]:
import pandas as pd
import numpy as np
import pandas_profiling as pd_prof
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
train = pd.read_csv("train_1.csv", sep="\t", index_col="Unnamed: 0")
test = pd.read_csv("test_1.csv", sep="\t", index_col="Unnamed: 0")

In [4]:
rejected_variables = ['140', '152', '160', '164', '9', '153']

In [5]:
train = train.drop(rejected_variables, axis=1, errors='ignore')
train.shape

(30500, 340)

In [6]:
x_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
x_train.shape, y_train.shape

((30500, 339), (30500,))

In [7]:
x_train['1'].value_counts(), y_train.value_counts()

(1    29826
 0      674
 Name: 1, dtype: int64, 0    25077
 1     5423
 Name: 0, dtype: int64)

In [8]:
x_test = test.drop(rejected_variables, axis=1, errors='ignore').drop('0', axis=1)
x_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,...,336,337,338,339,340,341,342,343,344,345
0,1,0,0,1,0,0,0.136364,0,1,1,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,0,0,1,0,0,0.181818,0,1,1,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,1,0,0,0,0,0,0.090909,0,1,1,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,1,0,0,1,0,0,0.090909,0,1,1,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,1,0,0,1,0,0,0.090909,0,1,1,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [43]:
def lgb_score(res):
    return max(res['auc-mean']), len(res['auc-mean']) 

In [9]:
import lightgbm as lgb

In [11]:
n_rounds = 10000
kf = StratifiedKFold(n_splits=7, random_state=42)
lgb_train = lgb.Dataset(x_train, label=y_train, free_raw_data=False)

In [52]:
# test 1
lgb_param = {
    'objective': 'binary',
    'num_threads': 4,
    "metric": 'auc',
    'learning_rate': 0.001,
    'n_estimators': 10000,
    'num_leaves': 50,
    
    #regularization
    'colsample_bytree': 0.6,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_data_in_leaf': 50,
    
}

result = lgb.cv(lgb_param, lgb_train, n_rounds, folds=kf.split(x_train, y_train), early_stopping_rounds=170, verbose_eval=0,  )

if 'lgb_score' in vars():
    print("prev score: "+str(lgb_score))
lgb_score = max(result['auc-mean']) 
print(lgb_score, len(result['auc-mean']))




prev score: <function lgb_score at 0x7f99e32308c8>
0.7474241865527684 7156


In [38]:
model = lgb.train(lgb_param, lgb_train, num_boost_round=8157)

In [39]:
pred = model.predict(x_test)
print((pred))

[0.13996113 0.54306269 0.25761918 ... 0.08305767 0.10558701 0.10239873]


In [42]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train, model.predict(x_train)), 0.76244713


(0.9010981710170036, 0.76244713)

In [41]:
pred_df= pd.DataFrame({'_ID_': x_test.index, '_VAL_': pred})
pred_df.to_csv('submit_1.csv', index=False)

In [48]:
# test 2
lgb_param2 = {
    'objective': 'binary',
    'max_bin': 63,
    "metric": 'auc',
    'learning_rate': 0.01,
    'num_leaves': 50,
    
    #regularization
    'colsample_bytree': 0.6,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_data_in_leaf': 50,
    
}



In [49]:
%%time
#cpu


result = lgb.cv(lgb_param2, lgb_train, n_rounds, folds=kf.split(x_train, y_train), early_stopping_rounds=10, verbose_eval=0, )

if 'lgb_score2' in vars():
    print("prev score: "+str(lgb_score2))
lgb_score2 = max(result['auc-mean']) 
print(lgb_score2, len(result['auc-mean']))

prev score: 0.7462675649862666
0.7457195553557747 542
CPU times: user 2min 45s, sys: 676 ms, total: 2min 45s
Wall time: 41.5 s


In [None]:
#stacking
stack_param1 = {
    'objective': 'binary',
    'num_threads': 4,
    "metric": 'auc',
    'learning_rate': 0.001,
    'n_estimators': 10000,
    'num_leaves': 50,
    
    #regularization
    'colsample_bytree': 0.6,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_data_in_leaf': 50,
    
}
stack_num_round1 = 7438



In [37]:
def build_stack(xtrain, ytrain, xtest, param1, num_round1):
    i = list(kf.split(xtrain, ytrain))
    
    new_train_feature = np.zeros(xtrain.shape[0])
    new_test_feature = np.zeros(xtest.shape[0])
    
    lgb_ds = lgb.Dataset(xtrain, label=ytrain, free_raw_data=False)

    for (i1, (sub_train, sub_test)) in enumerate(i):
        model = lgb.train(param1, lgb_ds.subset(sub_train), num_boost_round=num_round1)
        pred = model.predict(xtrain.loc[sub_test])
        new_train_feature[sub_test] =  pred
    
    ## adding new feature to test
    model2 = lgb.train(param1, lgb_ds, num_boost_round=num_round1)
    pred2 = model2.predict(xtest)
    new_test_feature[:]= pred2
    return (pd.concat([xtrain, pd.DataFrame(new_train_feature)], axis=1), pd.concat([xtest, pd.DataFrame(new_test_feature)], axis=1))


In [53]:
%%time
new_x_train, new_x_test = build_stack(x_train, y_train, x_test, lgb_param, 7156)

CPU times: user 40min 52s, sys: 11.5 s, total: 41min 4s
Wall time: 10min 18s


In [106]:
param = {
    'objective': 'binary',
    'num_threads': 4,
    "metric": 'auc',
    'learning_rate': 0.001,

    'num_leaves': 40,
    'max_bin' : 120,
    #regularization
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_data_in_leaf': 40,
    
}
result = lgb.cv(param, lgb.Dataset(new_x_train, y_train, free_raw_data=False), n_rounds, folds=kf.split(new_x_train, y_train), early_stopping_rounds=300, verbose_eval=-2, )


In [107]:
max(result['auc-mean']), len(result['auc-mean']) 

(0.7443102843442028, 70)

In [65]:
param = {
    'objective': 'binary',
    'num_threads': 4,
    "metric": 'auc',
    'learning_rate': 0.001,
    
    'num_leaves': 50,
    
    #regularization
    'colsample_bytree': [0.6, 0.8],
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_data_in_leaf': 50,
    
}

def lightgbm_search(params, ds):
    res = []
    for k in params:
        for v in params:
            print("{}: {}".format(k, v))