# CatBoost

In [None]:
import pandas as pd 
from catboost import CatBoostClassifier
from mypipes_linear import *
from sklearn.model_selection import train_test_split
import warnings 
# Suppress every Python warning for the rest of the session. This also hides
# deprecation warnings raised by the libraries used below, so comment it out
# when debugging.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Absolute, machine-specific location of the training CSV; point this at
# your own copy of rg_train.csv before running the notebook elsewhere.
file = r'/Users/lalitsachan/Dropbox/0.0 Data/rg_train.csv'
data = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Print each column's name, non-null count and dtype, which shows the
# numeric columns, the string (object) columns and where values are missing.
data.info()

In [None]:
# Numeric predictors: every non-object column except 'REF_NO' (presumably a
# row identifier) and 'Revenue.Grid', which is used as the target below.
num_vars = [
    col
    for col in data.select_dtypes(exclude=['object']).columns
    if col not in ('REF_NO', 'Revenue.Grid')
]

In [None]:
# Every string-typed (object dtype) column is a candidate categorical feature.
cat_vars = data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Remove the object columns that are not passed through as raw categoricals:
# 'children', 'age_band' and 'family_income' each get a dedicated pipeline
# further down, while 'post_code' and 'post_area' are not selected by any
# pipeline in this notebook.
cat_vars = [
    col
    for col in cat_vars
    if col not in (
        'children', 'age_band', 'post_code', 'post_area', 'family_income',
    )
]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
data_train, data_val = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature-preparation pipelines. Each one selects a group of columns, applies
# group-specific clean-up and finishes with DataFrameImputer. All transformer
# classes come from the project module `mypipes_linear`; their exact behaviour
# is not visible here, so the descriptions below are hedged on purpose.

# Numeric columns: select and impute.
p1 = pdPipeline([
    ('var_select', VarSelector(num_vars)),
    ('missing_trt', DataFrameImputer()),
])

# Plain categorical columns: select and impute (no encoding step here; the
# raw categories are handed to CatBoost later on).
p2 = pdPipeline([
    ('var_select', VarSelector(cat_vars)),
    ('missing_trt', DataFrameImputer()),
])

# 'age_band': custom transformation (presumably band -> number; confirm in
# mypipes_linear).
# NOTE(review): the step name 'custom_fico' looks copied from another
# pipeline; it is kept unchanged because step names are runtime identifiers.
p3 = pdPipeline([
    ('var_select', VarSelector(['age_band'])),
    ('custom_fico', custom_age_band()),
    ('missing_trt', DataFrameImputer()),
])

# 'family_income': custom transformation (presumably band -> number; confirm
# in mypipes_linear). Same 'custom_fico' naming remark as above.
p4 = pdPipeline([
    ('var_select', VarSelector(['family_income'])),
    ('custom_fico', custom_family_income()),
    ('missing_trt', DataFrameImputer()),
])

# 'children': rewrite the textual levels 'Zero' and '4+' as '0' and '4', then
# convert the column to numeric and impute.
p5 = pdPipeline([
    ('var_select', VarSelector(['children'])),
    ('string_clean1', string_clean(replace_it='Zero', replace_with='0')),
    ('string_clean2', string_clean(replace_it='4+', replace_with='4')),
    ('convert_to_numeric', convert_to_numeric()),
    ('missing_trt', DataFrameImputer()),
])

# Combine the five pipelines into one transformer. The member names given
# here are reused later as column-name prefixes (see 'cat_vars__' below).
data_pipe = FeatureUnion([
    ('num', p1),
    ('cat_vars', p2),
    ('age_band', p3),
    ('family_income', p4),
    ('children', p5),
])

In [None]:
# Fit the preparation pipeline on the training split only, so whatever the
# transformers learn (presumably imputation values) does not see validation rows.
data_pipe.fit(data_train)

In [None]:
# Apply the fitted pipeline to both splits and wrap the results in DataFrames
# whose columns carry the pipeline's feature names.
# NOTE(review): get_feature_names() is the legacy scikit-learn spelling; newer
# releases use get_feature_names_out() - confirm against the installed version
# and against what pdPipeline implements.
x_train = pd.DataFrame(
    data_pipe.transform(data_train),
    columns=data_pipe.get_feature_names(),
)
x_val = pd.DataFrame(
    data_pipe.transform(data_val),
    columns=data_pipe.get_feature_names(),
)

# Binary target: 1 where 'Revenue.Grid' equals 1, otherwise 0.
# NOTE(review): the labels keep the shuffled row index of the source frames,
# whereas the feature frames get a fresh default index if transform() returns
# a plain array - avoid index-aligned operations between the two.
y_train = data_train['Revenue.Grid'].eq(1).astype(int)
y_val = data_val['Revenue.Grid'].eq(1).astype(int)

In [None]:
# Show the first rows of the prepared training features as a sanity check on
# the generated column names and values.
x_train.head()

In [None]:
# Names of the categorical columns as they appear in x_train: the original
# column name prefixed with 'cat_vars__' (the name given to that pipeline in
# the FeatureUnion, joined with a double underscore).
cat_vars_cb = [f'cat_vars__{col}' for col in cat_vars]

In [None]:
# Gradient-boosted classifier: 200 boosting iterations, learning rate 0.5 and
# a fixed seed for reproducibility.
clf = CatBoostClassifier(iterations=200, learning_rate=0.5, random_seed=42)

# Train on the prepared features. The listed columns are handed to CatBoost as
# categorical features, the validation split is passed as eval_set so metrics
# are tracked on it during training, and plot=True requests CatBoost's
# training chart in the notebook.
clf.fit(
    x_train,
    y_train,
    cat_features=cat_vars_cb,
    eval_set=(x_val, y_val),
    plot=True,
)

In [None]:
# Truncate the fitted model to its first 80 trees.
# NOTE(review): the cut-off is hard-coded, presumably read off the training
# plot above; re-check it whenever the data or hyper-parameters change.
clf.shrink(ntree_end=80)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on the validation split, scored with the predicted probability of
# the positive class (column 1 of predict_proba).
# NOTE(review): this same split was passed as eval_set during training, so
# the figure may be optimistic compared with truly unseen data.
roc_auc_score(
    y_val,
    clf.predict_proba(x_val)[:, 1],
)