In [1]:
import pandas as pd 
from mypipes_linear import *
from sklearn.model_selection import train_test_split
import warnings 
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including deprecation and convergence warnings from
# the libraries used below — consider narrowing it to the specific warning.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
import lightgbm as lgb

In [4]:
# Read the training CSV from the working directory.
file=r'./rg_train.csv'
data=pd.read_csv(file)

# Numeric feature candidates: every non-object column except the record ID
# ('REF_NO') and the target ('Revenue.Grid').
num_vars=[
    col
    for col in data.select_dtypes(exclude=['object']).columns
    if col not in ('REF_NO','Revenue.Grid')
]

# Object-typed columns that go through the generic categorical pipeline.
# 'children', 'age_band' and 'family_income' are left out here because they
# are handled by dedicated pipelines; 'post_code' and 'post_area' are simply
# left out and are not used by any pipeline visible in this notebook section.
cat_vars=[
    col
    for col in data.select_dtypes(include=['object']).columns
    if col not in ('children','age_band','post_code','post_area','family_income')
]

# 80/20 train/validation split with a fixed seed so the split is repeatable.
data_train,data_val=train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [6]:
# p1 — numeric columns: select, then run DataFrameImputer
# (the 'missing_trt' step).
p1=pdPipeline(steps=[
    ('var_select',VarSelector(feature_names=num_vars)),
    ('missing_trt',DataFrameImputer()),
])

# p2 — generic categorical columns: select, impute, then create dummies.
# NOTE(review): 70 is presumably a minimum-frequency cutoff for a category to
# get its own dummy column — confirm against get_dummies_Pipe in mypipes_linear.
p2=pdPipeline(steps=[
    ('var_select',VarSelector(feature_names=cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70)),
])

# p3 — 'age_band': custom transformer followed by imputation.
# The step label 'custom_fico' is kept as-is because step names are part of
# the fitted pipeline's parameter keys.
p3=pdPipeline(steps=[
    ('var_select',VarSelector(feature_names=['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer()),
])

# p4 — 'family_income': custom transformer followed by imputation.
p4=pdPipeline(steps=[
    ('var_select',VarSelector(feature_names=['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer()),
])

# p5 — 'children': rewrite the text values 'Zero' -> '0' and '4+' -> '4',
# convert to numeric, then impute.
p5=pdPipeline(steps=[
    ('var_select',VarSelector(feature_names=['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer()),
])

In [8]:
# Combine the five per-group pipelines into one feature matrix builder.
# The names given here identify each branch inside the union.
data_pipe=FeatureUnion(transformer_list=[
    ('num',p1),
    ('cat_vars',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5),
])

# Fit on the training split only, so nothing is learned from validation rows.
# Left as the final expression so the fitted union is displayed by the cell.
data_pipe.fit(data_train)

FeatureUnion(n_jobs=None,
             transformer_list=[('num',
                                pdPipeline(memory=None,
                                           steps=[('var_select',
                                                   VarSelector(feature_names=['year_last_moved',
                                                                              'Average.Credit.Card.Transaction',
                                                                              'Balance.Transfer',
                                                                              'Term.Deposit',
                                                                              'Life.Insurance',
                                                                              'Medical.Insurance',
                                                                              'Average.A.C.Balance',
                                                                              'Personal.Loan',
                     

In [9]:
# Column labels come from the fitted union; fetch them once so both frames
# are built with exactly the same labels in the same order.
feature_cols=data_pipe.get_feature_names()

x_train=pd.DataFrame(data=data_pipe.transform(data_train),
                     columns=feature_cols)
x_val=pd.DataFrame(data=data_pipe.transform(data_val),
                   columns=feature_cols)

# Restore the original row labels. train_test_split shuffles rows by default,
# so data_train / data_val (and the targets derived from them) carry shuffled
# labels from `data`, whereas a frame built without an index gets a fresh
# 0..n-1 index. Without this, any index-aligned pandas operation between the
# features and the targets (concat, join, boolean masks) would silently pair
# the wrong rows. The labels are assigned positionally, so row order and
# values are unchanged and model fitting is unaffected.
x_train.index=data_train.index
x_val.index=data_val.index

# Binary target: 1 where 'Revenue.Grid' equals 1, otherwise 0.
y_train=data_train['Revenue.Grid'].eq(1).astype(int)
y_val=data_val['Revenue.Grid'].eq(1).astype(int)

In [10]:
# Baseline LightGBM classifier with the library's default hyperparameters,
# trained on the training split. The fit call is the cell's final expression,
# so the fitted estimator is displayed.
clf = lgb.LGBMClassifier()
clf.fit(X=x_train, y=y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba holds the predicted probability of label 1,
# which is the score ROC AUC is computed on for the validation split.
val_scores=clf.predict_proba(x_val)[:,1]
roc_auc_score(y_true=y_val,y_score=val_scores)

0.9908372999541866