<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/xgboost-base-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint
from datetime import date as d
from datetime import datetime as dt
import numpy as np
import pandas as pd 
import json
import urllib.request
pd.options.mode.chained_assignment = None


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer
# from sklearn.ensemble import RandomForestClassifier


import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [0]:
# Colab-only step: mount Google Drive at `mountpath`; a later cell appends
# its results file underneath this directory.
from google.colab import drive

mountpath = "/content/drive"
drive.mount(mountpath)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data Import

In [0]:
# Both challenge files sit in the same GitHub directory and are parsed the
# same way: 'identifier' becomes the index and 'date' is parsed as datetime.
DATA_ROOT = (
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/'
    'master/datasets/prediction-challenge/')
csv_options = dict(index_col='identifier', parse_dates=['date'])

dataset = pd.read_csv(DATA_ROOT + 'dataset.csv', **csv_options)
prediction_dataset = pd.read_csv(
    DATA_ROOT + 'prediction-dataset.csv', **csv_options)

Functions for Feature Engineering

In [0]:
def derive_age(age):
  """Bucket a numeric age into one of three age-group labels.

  Returns "jung" for ages up to and including 26, "middel" for ages up to
  and including 41, and "old" for anything above 41. A value that satisfies
  none of the comparisons (e.g. NaN) yields None.
  """
  # Inclusive upper bounds, checked in ascending order.
  for upper_bound, label in ((26, "jung"), (41, "middel")):
    if age <= upper_bound:
      return label
  # Explicit `> 41` check so that NaN falls through to None instead of "old".
  return "old" if age > 41 else None

def derive_dslc(days_since_last_contact):
  """Replace the -1 sentinel with None; return any other value unchanged."""
  return None if days_since_last_contact == -1 else days_since_last_contact

def derive_marital(marital_status):
  """Map the 'Unknown' marital status to the placeholder string "NaN".

  Note that the placeholder is the literal text "NaN", not a float NaN or
  None. Every other value is returned unchanged.
  """
  if marital_status != 'Unknown':
    return marital_status
  return "NaN"


def derive_credit(credit_default):
  """Map an 'Unknown' credit-default flag to the placeholder string "NaN".

  The placeholder is the literal text "NaN" (a str); all other values pass
  through unchanged.
  """
  return "NaN" if credit_default == 'Unknown' else credit_default

def derive_housing(housing_loan):
  """Map an 'Unknown' housing-loan flag to the placeholder string "NaN".

  The placeholder is the literal text "NaN" (a str); all other values pass
  through unchanged.
  """
  is_known = housing_loan != 'Unknown'
  if is_known:
    return housing_loan
  return "NaN"

def derive_personal(personal_loan):
  """Map an 'Unknown' personal-loan flag to the placeholder string "NaN".

  The placeholder is the literal text "NaN" (a str); all other values pass
  through unchanged.
  """
  return personal_loan if personal_loan != 'Unknown' else "NaN"

Feature Engineering

In [0]:
def add_engineered_features(frame, keep_last_column=False):
  """Add the derived feature columns to ``frame`` in place and drop 'date'.

  Derived columns, in insertion order: Weekday, Month, Year (from 'date'),
  age_group (from 'age' via derive_age), quarter (from 'date') and
  duration_per_Contract ('duration' divided by 'n_contacts_campaign').

  Args:
    frame: DataFrame with a datetime 'date' column plus 'age', 'duration'
      and 'n_contacts_campaign'. It is modified in place.
    keep_last_column: If True, every new column is inserted just before the
      current last column so that column stays last; if False, new columns
      are appended at the end.

  Returns:
    None. The frame is mutated in place.
  """
  if 'date' not in frame.columns:
    # 'date' is dropped at the end of a successful pass, so its absence means
    # the frame was already processed. Skip instead of failing on a re-run.
    return

  dates = frame['date']
  derived_columns = [
      ("Weekday", dates.dt.weekday),
      ("Month", dates.dt.month),
      ("Year", dates.dt.year),
      ("age_group", frame['age'].apply(derive_age)),
      ("quarter", dates.dt.quarter),
      ("duration_per_Contract", frame['duration'] / frame['n_contacts_campaign']),
  ]

  offset = 1 if keep_last_column else 0
  for column_name, values in derived_columns:
    frame.insert(len(frame.columns) - offset, column_name, values)
  frame.drop(['date'], axis=1, inplace=True)


# Training: keep the current last column last (a later cell takes
# everything except the last column as X).
add_engineered_features(dataset, keep_last_column=True)

# Test
add_engineered_features(prediction_dataset)

Set Unknown to None (currently disabled: every line in the cell below is commented out)

In [0]:
#dataset['days_since_last_contact'] = dataset['days_since_last_contact'].apply(derive_dslc)
#dataset['credit_default'] = dataset['credit_default'].apply(derive_credit)
#dataset['housing_loan'] = dataset['housing_loan'].apply(derive_housing)
#dataset['personal_loan'] = dataset['personal_loan'].apply(derive_personal)
#dataset['marital_status'] = dataset['marital_status'].apply(derive_marital)
#dataset

Model, Pipeline and Scoring Initialization

In [0]:
# Separate the target by name rather than by position, so the split stays
# correct even if 'success' is not the last column of `dataset`.
y = dataset['success']
X = dataset.drop(columns=['success'])

Pipeline

In [0]:
# Baseline classifier: only the seed and the thread setting are specified,
# every other XGBoost parameter keeps its library default.
hyperparams = dict(seed=1909, nthread=-1)

model = xgb.XGBClassifier(**hyperparams)

In [0]:
# Column groups: each list is routed to its own transformer below. Columns
# not named in any list are dropped by the ColumnTransformer.
numerical_features = ["age", "n_contacts_before", "duration", "quarter"]
categorical_features = [
    "age_group", "days_since_last_contact", "communication_type",
    "credit_default", "housing_loan", "personal_loan",
    "education", "job", "Weekday", "Month", "Year",
]
ordinal_features = ["marital_status", "previous_conversion"]

# One single-step pipeline per column group.
numerical_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(
    steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])
ordinal_transformer = Pipeline(steps=[('ordinalencoder', OrdinalEncoder())])

preprocessor = ColumnTransformer(transformers=[
    ('numerical_transformer', numerical_transformer, numerical_features),
    ('categorical_transformer', categorical_transformer, categorical_features),
    ('ordinal_transformer', ordinal_transformer, ordinal_features),
])

# Preprocessing followed by the baseline model, fitted on the full training set.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical_transformer',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  ['age', 'n_contacts_before',
                                                   'duration', 'quarter']),
                

In [0]:
# Randomized hyperparameter optimization.
# Parameter guide used as reference:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

# F1 with 'Yes' as the positive label; reused for cross-validation later on.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

# Sampled per search iteration.
n_estimators = randint(50, 150)
max_depth = randint(15, 40)

# Held fixed during the search (also reused by the tuned model further down).
learning_rate = 0.1
min_child_weight = 0.1
gamma = 1
colsample_bytree = 0.5
colsample_bylevel = 0.5
colsample_bynode = 0.5
subsample = 0.6
scale_pos_weight = 9
base_score = 0.5
reg_lambda = 2

hyperparams = dict(
    seed=1909,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    scale_pos_weight=scale_pos_weight,
    colsample_bylevel=colsample_bylevel,
    colsample_bytree=colsample_bytree,
    colsample_bynode=colsample_bynode,
    gamma=gamma,
    subsample=subsample,
    base_score=base_score,
    nthread=-1,
    booster='gbtree',
    objective='binary:logistic',
    silent=True,
    reg_lambda=reg_lambda,
    missing=None,
    max_delta_step=0,
)

# The 'model__' prefix addresses the pipeline step named 'model'.
param_distributions = dict(
    model__n_estimators=n_estimators,
    model__max_depth=max_depth,
)

model = xgb.XGBClassifier(**hyperparams)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring=custom_scorer,
    n_jobs=-1,
    cv=10,
    random_state=1909,
)

# The search itself is left disabled; uncomment to run it.
# rs = rs.fit(X, y)
# print(rs.best_params_)

In [0]:
# run optimized model
hyperparams = {'seed': 1909,
               'learning_rate': learning_rate,
               'min_child_weight': min_child_weight,
               'scale_pos_weight': scale_pos_weight,
               'colsample_bylevel': colsample_bylevel,
               'colsample_bytree': colsample_bytree,
               'colsample_bynode': colsample_bynode,
               'max_depth': 28, # rs.best_params_['model__max_depth'], # 28
               'n_estimators': 65, # rs.best_params_['model__n_estimators'], # 65
               'gamma': gamma,
               'subsample': subsample,
               'base_score': base_score,
               'nthread': -1,
               'booster': 'gbtree',
               'objective': 'binary:logistic',
               'silent': True,
               'reg_lambda': reg_lambda,
               'missing': None,
               'max_delta_step': 0
            }

model = xgb.XGBClassifier(**hyperparams)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('model', model)
])
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical_transformer',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  ['age', 'n_contacts_before',
                                                   'duration', 'quarter']),
                

In [0]:
# validating
res_cv = cross_validate(pipeline, X, y, scoring=custom_scorer, cv=10, return_train_score=True)
res_f1_tr = np.mean(res_cv['train_score']) * 100
res_f1_te = np.mean(res_cv['test_score']) * 100
print(hyperparams)
result = f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'
print(result)

In [0]:
# persists results for later analysis

# One record per run: timestamp, the formatted F1 summary and the
# hyperparameters that produced it.
resultJSON = dict(
    date=dt.now().strftime("%d.%m.%Y, %H:%M:%S"),
    result=result,
    hyperparams=hyperparams,
)

# Append mode keeps earlier runs. Despite the variable name, the record is
# written as the Python str() of the dict (one per line), not as JSON.
results_path = f'{mountpath}/My Drive/seminararbeit/results-xgb.txt'
with open(results_path, 'a') as results_file:
  results_file.write(f'{resultJSON}\n')


Submission Dataset Preparation

In [0]:
# `predictions` was never assigned anywhere in the notebook, so this cell
# failed with a NameError on a fresh kernel. Predict with the fitted pipeline;
# its preprocessor selects the feature columns it needs by name.
predictions = pipeline.predict(prediction_dataset)

# One predicted label per row, keyed by the prediction dataset's index.
submission = pd.DataFrame(
    predictions, index=prediction_dataset.index, columns=['prediction'])

In [0]:
# Matriculation number; becomes part of the submission CSV file name.
matriculation_number = '465530'

In [0]:
# Write the predictions next to the notebook; the index column is labelled
# 'identifier' in the CSV header.
submission_path = f'./submission-{matriculation_number}.csv'
submission.to_csv(submission_path, index_label='identifier')