In [35]:
#system
from tqdm import tqdm
from tqdm import tqdm_notebook

#base
import numpy as np
import pandas as pd

#model
import xgboost as xbg
import lightgbm as lgb
import catboost as cbt

#sklearn tools
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

#others



import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
train_set = pd.read_csv('input/train_set.csv')
test_set = pd.read_csv('input/test_set.csv')

In [37]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         25317 non-null  int64 
 1   age        25317 non-null  int64 
 2   job        25317 non-null  object
 3   marital    25317 non-null  object
 4   education  25317 non-null  object
 5   default    25317 non-null  object
 6   balance    25317 non-null  int64 
 7   housing    25317 non-null  object
 8   loan       25317 non-null  object
 9   contact    25317 non-null  object
 10  day        25317 non-null  int64 
 11  month      25317 non-null  object
 12  duration   25317 non-null  int64 
 13  campaign   25317 non-null  int64 
 14  pdays      25317 non-null  int64 
 15  previous   25317 non-null  int64 
 16  poutcome   25317 non-null  object
 17  y          25317 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 3.5+ MB


In [58]:
test_set.info()
test_set['y'] = -1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10852 entries, 0 to 10851
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         10852 non-null  int64 
 1   age        10852 non-null  int64 
 2   job        10852 non-null  object
 3   marital    10852 non-null  object
 4   education  10852 non-null  object
 5   default    10852 non-null  object
 6   balance    10852 non-null  int64 
 7   housing    10852 non-null  object
 8   loan       10852 non-null  object
 9   contact    10852 non-null  object
 10  day        10852 non-null  int64 
 11  month      10852 non-null  object
 12  duration   10852 non-null  int64 
 13  campaign   10852 non-null  int64 
 14  pdays      10852 non-null  int64 
 15  previous   10852 non-null  int64 
 16  poutcome   10852 non-null  object
dtypes: int64(8), object(9)
memory usage: 1.4+ MB


In [59]:
train_set.corr()

Unnamed: 0,ID,age,balance,day,duration,campaign,pdays,previous,y
ID,1.0,0.008465,0.032719,-0.020171,0.229149,-0.038265,0.065807,0.047028,0.556627
age,0.008465,1.0,0.09374,-0.01607,0.000416,0.006171,-0.026431,0.006575,0.029916
balance,0.032719,0.09374,1.0,0.010245,0.026042,-0.010419,0.001032,0.015792,0.057564
day,-0.020171,-0.01607,0.010245,1.0,-0.031946,0.16883,-0.092892,-0.050706,-0.031886
duration,0.229149,0.000416,0.026042,-0.031946,1.0,-0.08778,4e-05,0.001315,0.394746
campaign,-0.038265,0.006171,-0.010419,0.16883,-0.08778,1.0,-0.089224,-0.031667,-0.075173
pdays,0.065807,-0.026431,0.001032,-0.092892,4e-05,-0.089224,1.0,0.411688,0.107565
previous,0.047028,0.006575,0.015792,-0.050706,0.001315,-0.031667,0.411688,1.0,0.088337
y,0.556627,0.029916,0.057564,-0.031886,0.394746,-0.075173,0.107565,0.088337,1.0


In [60]:
train_set.job.value_counts()

blue-collar      5456
management       5296
technician       4241
admin.           2909
services         2342
retired          1273
self-employed     884
entrepreneur      856
unemployed        701
housemaid         663
student           533
unknown           163
Name: job, dtype: int64

In [62]:
data = train_set.append(train_set).reset_index(drop=True)
data.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


In [63]:
cat_col = [i for i in data.select_dtypes(object).columns if i not in ['ID', 'y']]
for i in tqdm_notebook(cat_col):
    data['count_'+i] = data.groupby(i)[i].transform('count')
    lbl = LabelEncoder()
    data[i] = lbl.fit_transform(data[i].astype(str))

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [64]:
data.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,...,y,count_job,count_marital,count_education,count_default,count_housing,count_loan,count_contact,count_month,count_poutcome
0,1,43,4,1,2,0,291,1,0,2,...,0,10592,30490,14894,49738,28040,42516,14562,15310,41354
1,2,42,9,0,0,0,5076,1,0,0,...,0,8482,5830,7696,49738,28040,42516,32782,3338,2140
2,3,47,0,1,1,0,104,1,1,0,...,0,5818,30490,25914,49738,28040,8118,32782,7874,41354
3,4,28,4,2,1,0,-994,1,1,0,...,0,10592,14314,25914,49738,28040,8118,32782,7874,41354
4,5,42,9,0,1,0,2974,1,0,2,...,0,8482,5830,25914,49738,28040,42516,14562,15310,41354


In [65]:
feats = [i  for i in data.columns if i not in ['ID', 'y']]
feats

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'count_job',
 'count_marital',
 'count_education',
 'count_default',
 'count_housing',
 'count_loan',
 'count_contact',
 'count_month',
 'count_poutcome']

In [66]:
model = lgb.LGBMClassifier(
    boosting_type='GBDT',
    num_leaves=30,
    n_estimators=1500,
    max_depth=-1,
    objective='binary',
    metric = 'auc',
    subsample= 0.95,
    subsample_freq=1,
    colsample_bytree=0.7,
    reg_alpha=0.,
    reg_lambda=0.,
    learning_rate=0.02,
    random_state=2017
)

In [67]:
lgb.LGBMClassifier?