In [57]:
import lightgbm as lgb
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, cohen_kappa_score, confusion_matrix
from subprocess import check_output
print(check_output(['ls', 'ds_data']).decode('utf-8'))

data_test.csv
data_train.csv
readme



In [34]:
train = pd.read_csv("ds_data/data_train.csv")
test = pd.read_csv("ds_data/data_test.csv")

In [35]:
train_id = train.id.values
y_train = train.target.values
train.drop(['id', 'target'], inplace=True, axis=1)
#print("train {0} and target {1}".format(train.shape, target.shape))

test_id = test.id.values
test.drop(['id'], inplace=True, axis=1)
#print("test {0} and id {1}".format(test.shape, test_id.shape))

In [36]:
null_cols =  [c for c in train.columns if train[c].isnull().sum() != 0]
for col in null_cols:
    train[col] = train[col].fillna(-1.0)   
    test[col]  = test[col].fillna(-1.0)

In [37]:
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)

In [38]:
cat_features = [c for c in train.columns if train[c].nunique() < 10]
cat_features.remove('missing')

In [39]:
for c in cat_features:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])     
    test[c] = le.transform(test[c])

In [40]:
enc = OneHotEncoder(categories='auto')
enc.fit(train[cat_features])
train_cat = enc.transform(train[cat_features])
test_cat = enc.transform(test[cat_features])

In [41]:
num_features = [c for c in train.columns if train[c].nunique() >= 10]
len(num_features)

24

In [42]:
cnt = 0
for c in num_features:
    if cnt == 0:
        train['new_num'] = train[c].astype(str)+'_'        
        test['new_num'] = test[c].astype(str)+'_'
        cnt += 1
    else:
        train['new_num'] = train[c].astype(str)+'_'        
        test['new_num'] += test[c].astype(str)+'_'


In [43]:
cat_cnt_features = []
for c in cat_features +['new_num']:
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    train['%s_cnt'%c] = train[c].apply(lambda x : d.get(x,0))    
    test['%s_cnt'%c] = test[c].apply(lambda x : d.get(x,0))
    cat_cnt_features.append('%s_cnt'%c)

In [44]:
train_list = [train[num_features + cat_cnt_features].values, train_cat]
test_list = [test[num_features + cat_cnt_features].values, test_cat]

In [45]:
X = ssp.hstack(train_list).tocsr()
org_test =  ssp.hstack(test_list).tocsr()

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y_train, test_size = 0.4, random_state=13)

In [47]:
print(X.shape, X_test.shape, org_test.shape)

(596000, 167) (238400, 167) (892816, 167)


In [48]:
lgb_params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "learning_rate": 0.04,
    "num_leaves": 15,
    "max_bin": 256,
    "feature_fraction": 0.7,
    "verbosity": 0,
    "drop_rate": 0.1,
    "is_unbalance": False,
    "max_drop": 50,
    "min_child_samples": 10,
    "min_child_weight": 150,
    "min_split_gain": 0,
    "subsample": 0.9,
    "metric": "auc"    
     }

In [49]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state=13)

In [51]:
dtrain = lgb.Dataset(data=X_tr, label=y_tr)
dvalid = lgb.Dataset(data=X_val, label=y_val)

In [52]:
model = lgb.train(params=lgb_params,
                  train_set=dtrain,
                  num_boost_round=1000,
                  valid_sets= [dtrain, dvalid],
                  valid_names = ["train", "valid"],
                  verbose_eval=20,
                  early_stopping_rounds=50
                 )

Training until validation scores don't improve for 50 rounds.
[20]	train's auc: 0.625847	valid's auc: 0.61454
[40]	train's auc: 0.628792	valid's auc: 0.618417
[60]	train's auc: 0.632133	valid's auc: 0.620825
[80]	train's auc: 0.637778	valid's auc: 0.626601
[100]	train's auc: 0.643418	valid's auc: 0.630594
[120]	train's auc: 0.648778	valid's auc: 0.633076
[140]	train's auc: 0.653245	valid's auc: 0.635529
[160]	train's auc: 0.657206	valid's auc: 0.637076
[180]	train's auc: 0.660755	valid's auc: 0.637964
[200]	train's auc: 0.664344	valid's auc: 0.638873
[220]	train's auc: 0.667453	valid's auc: 0.639072
[240]	train's auc: 0.670626	valid's auc: 0.639598
[260]	train's auc: 0.673547	valid's auc: 0.639982
[280]	train's auc: 0.676468	valid's auc: 0.640019
[300]	train's auc: 0.679215	valid's auc: 0.640215
[320]	train's auc: 0.681869	valid's auc: 0.640124
[340]	train's auc: 0.684165	valid's auc: 0.639987
Early stopping, best iteration is:
[307]	train's auc: 0.680148	valid's auc: 0.640309


In [54]:
test_pred = model.predict(X_test)

In [55]:
test_pred = np.where(test_pred >= 0.5, 1, 0)

In [58]:
confusion_matrix(y_test, test_pred)

array([[229649,      0],
       [  8751,      0]])