<a href="https://colab.research.google.com/github/rgumi/dataScience/blob/master/seminarpaper-xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# pre processing
from scipy.stats import randint, uniform
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer

In [57]:
# setup
# Load the prediction-challenge dataset and split it into features and target.
DATA_URL = 'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv'
data = pd.read_csv(DATA_URL)

# Features: every column except the first and the last one. The explicit
# .copy() makes X an independent frame, so the column assignment below does
# not write into a slice of `data` (avoids SettingWithCopyWarning).
# Target: the 'success' column.
X, y = data.iloc[:, 1:-1].copy(), data['success']

# Encode the date as whole days since the Unix epoch. Subtracting the epoch
# yields a proper timedelta; feeding datetime64 data straight into
# pd.to_timedelta is deprecated in older pandas and raises TypeError in newer
# releases.
# NOTE(review): assumes the 'date' strings parse to timezone-naive timestamps.
X['date'] = (pd.to_datetime(X['date']) - pd.Timestamp('1970-01-01')).dt.days

hyperparams = { 'random_state': 1909,
                'nthread': -1
}
# Baseline model used by the pipeline / hyperparameter search. The settings
# above are passed through so the fixed seed actually takes effect.
model = xgb.XGBClassifier(**hyperparams)

  return func(*args, **kwargs)


In [0]:
# pipeline
# Column groups, listed by the preprocessing each group receives.
numeric_features = ['date', 'age', 'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before']
categorical_features = ['marital_status', 'education', 'job', 'credit_default', 'housing_loan', 'personal_loan', 'communication_type', 'previous_conversion']

# Numeric columns are rescaled with MinMaxScaler (default settings).
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical columns are encoded with OrdinalEncoder. The step is keyed
# 'onehotencoder' even though no one-hot encoding takes place; the key is
# kept unchanged because it forms part of the pipeline's parameter names.
categorical_transformer = Pipeline(steps=[('onehotencoder', OrdinalEncoder())])

# Route each column group to its transformer. Columns not listed in either
# group are not forwarded (ColumnTransformer is used with its defaults).
column_routes = [
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Full estimator: preprocessing followed by the classifier defined earlier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])


In [0]:
# randomized hyperparameter search
# F1 score computed with the label 'Yes' treated as the positive class.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

# Integer sampling ranges for the search (scipy randint: low inclusive,
# high exclusive).
n_estimators = randint(200, 400)
max_depth = randint(50, 120)
# Fixed learning rate; it is not part of the search space and is read by the
# cell that builds the final model.
learning_rate = 0.0002

param_distributions = { 'model__n_estimators': n_estimators, 
                        'model__max_depth': max_depth
}

# The search fits 5 sampled candidates with 10-fold cross-validation each, so
# it is opt-in: flip the flag to True to run it. With the default (False) this
# cell only defines the scorer and the search space, exactly as before.
RUN_SEARCH = False

if RUN_SEARCH:
    # The former `iid=False` argument is intentionally omitted: current
    # scikit-learn releases no longer accept it.
    rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                            scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)
    rs.fit(X, y)
    print(rs.best_params_)

In [60]:
# run optimized model
# Final XGBoost settings. `learning_rate` is the value defined in the
# hyperparameter-search cell; max_depth and n_estimators are hard-coded here
# (they could instead be taken from rs.best_params_['model__max_depth'] and
# rs.best_params_['model__n_estimators'] after running the search).
hyperparams = dict(
    random_state=1909,
    learning_rate=learning_rate,
    max_depth=500,
    n_estimators=400,
    nthread=-1,
    subsample=1,
    colsample_bytree=1,
    objective='binary:logistic',
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
)
model = XGBClassifier(**hyperparams)

# Rebuild the pipeline around the configured classifier, reusing the existing
# preprocessor, then fit on the full data set. The fit call stays the last
# expression so the fitted pipeline is displayed as the cell output.
final_steps = [('preprocessor', preprocessor), ('model', model)]
pipeline = Pipeline(steps=final_steps)
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('n_transformer',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  ['date', 'age',
                                                   'n_contacts_campaign',
                                       

In [61]:
# validating
# 10-fold cross-validation of the pipeline with the custom F1 scorer; train
# scores are requested as well so the train/test gap can be reported.
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)

# Mean F1 across folds, expressed in percent.
res_f1_tr, res_f1_te = (
    res_cv[split].mean() * 100 for split in ('train_score', 'test_score')
)

print(hyperparams)
print(f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%')

{'random_state': 1909, 'learning_rate': 0.0002, 'max_depth': 500, 'n_estimators': 400, 'nthread': -1, 'subsample': 1, 'colsample_bytree': 1, 'objective': 'binary:logistic', 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
Average F1 on Training and Test Sets: 55.24%/36.48%
