<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/seminarpaper-xgbClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint
from datetime import date as d
from datetime import datetime as dt
import numpy as np
import pandas as pd 
import json
import urllib.request
pd.options.mode.chained_assignment = None


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer
# from sklearn.ensemble import RandomForestClassifier


import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [0]:
from google.colab import drive

# Google Drive is mounted here; later cells build their output paths from `mountpath`.
mountpath = "/content/drive"
drive.mount(mountpath)

In [0]:
# Download the key interest rate ("Leitzins") table as JSON.
# Its keys are date strings in 'dd-mm-YYYY' format (the format passed to strptime below).
with urllib.request.urlopen("https://raw.githubusercontent.com/rgumi/seminararbeit/master/leitzinsen_eu.json") as response:
    tmp_ltz = json.loads(response.read().decode())

# `dt` is already the datetime class (imported via `from datetime import datetime as dt`),
# so strptime has to be called on it directly; `dt.datetime` does not exist.
ltz = {dt.strptime(key, '%d-%m-%Y'): value for key, value in tmp_ltz.items()}

# Re-insert the entries in chronological order; get_leitzins walks them oldest first.
sorted_ltz = {k: ltz[k] for k in sorted(ltz)}
print(sorted_ltz)

def get_leitzins(date, rates=None):
  """Return the interest rate that was in effect on ``date``.

  Args:
    date: Point in time to look up; must be comparable with the keys of ``rates``.
    rates: Mapping of effective date -> rate, iterated in ascending date order.
      Defaults to the module-level ``sorted_ltz`` table.

  Returns:
    The value of the latest entry whose key is <= ``date``. This includes the
    newest entry when ``date`` lies after every key. Returns ``None`` when
    ``date`` precedes the first entry, i.e. no rate is known for it.
  """
  if rates is None:
    rates = sorted_ltz
  last = None
  for key, val in rates.items():
    if date < key:
      # Entries are ordered, so every remaining key is later than `date` as well.
      break
    last = val
  return last

def derive_age(age):
  """Map a numeric age onto a coarse age-group label.

  Returns 'underage' for ages up to 18, 'younger' up to 30, 'older' up to 60
  and 'old' above 60. A value that satisfies none of the comparisons
  (e.g. NaN) yields None.
  """
  brackets = ((18, 'underage'), (30, 'younger'), (60, 'older'))
  for upper_bound, label in brackets:
    if age <= upper_bound:
      return label
  # Explicit check instead of a bare fallback so that NaN still ends up as None.
  return 'old' if age > 60 else None

In [0]:
# shows all unique values for each column
def print_columns_unique(df):
  """Print one line per column of ``df``: the column name followed by its unique values."""
  # DataFrame.items() is the supported spelling; iteritems() was removed in pandas 2.0.
  for name, column in df.items():
    print(name, column.unique())

# Preprocessing: Date
def get_info_from_date(df):
  """Add calendar features derived from the 'date' column.

  Adds the columns 'year', 'month', 'day' and 'quarter' to ``df`` in place
  and returns the same frame.

  Args:
    df: DataFrame with a datetime-typed 'date' column.

  Returns:
    ``df`` with the four additional columns.
  """
  dates = df['date'].dt
  df['year'] = dates.year
  df['month'] = dates.month
  df['day'] = dates.day
  # Calendar quarter (1-4). The former `month // 4 + 1` put July into Q2,
  # August-November into Q3 and only December into Q4.
  df['quarter'] = dates.quarter
  # Disabled feature: weekday flag, True for Monday-Friday.
  # dayofweek: 6=Sunday, 5=Saturday, 4=Friday
  # (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.dayofweek.html)
  #df['is_weekday'] = df['date'].dt.dayofweek < 5
  return df

In [0]:
# setup
data = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv',
    index_col='identifier',
    parse_dates=['date'],
)

# Features: every column except the last one. Target: the 'success' column.
# NOTE(review): this assumes 'success' is the last column of the CSV - confirm.
# .copy() gives X its own data, so the derived columns added below are not
# written to a slice of `data` (the resulting warning is silenced globally above).
X = data.iloc[:, 0:-1].copy()
y = data.loc[:, 'success']

# Date Preprocessing
X = get_info_from_date(X)
X['age_group'] = X['age'].apply(derive_age)
X['leitzins'] = X['date'].apply(get_leitzins)


# Baseline model configuration used by the hyperparameter search.
hyperparams = { 'seed': 1909,
                'nthread': -1
               }

model = xgb.XGBClassifier(**hyperparams)

In [0]:
# Drop not needed features to reduce complexity and overfitting.
# X is rebound instead of mutated in place and missing columns are ignored, so
# re-running this cell does not fail with a KeyError once the columns are gone.
X = X.drop(columns=['communication_type', 'date'], errors='ignore')
print(X.shape, y.shape)
# Snapshot of the prepared feature matrix on the mounted Drive.
X.to_csv(mountpath + '/My Drive/seminararbeit/tmp.csv')
# print_columns_unique(X)

In [0]:
# pipeline
# Columns routed through the ordinal encoder.
categorical_features = [
    'age_group', 'quarter', 'marital_status', 'education', 'job',
    'credit_default', 'housing_loan', 'personal_loan', 'previous_conversion',
]
# Columns routed through the min-max scaler.
numeric_features = [
    'leitzins', 'day', 'month', 'year', 'age',
    'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before', 'duration',
]

numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
# NOTE(review): the step is named 'onehotencoder' but holds an OrdinalEncoder;
# the name is kept because it is part of the pipeline's parameter paths.
categorical_transformer = Pipeline(steps=[('onehotencoder', OrdinalEncoder())])

preprocessor = ColumnTransformer(transformers=[
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
])

# Preprocessing and classifier chained into one estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# F1 score with 'Yes' treated as the positive class.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

In [0]:
# randomized hyperparameter optimization
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

# Hyperparameters that are sampled by the search.
n_estimators = randint(50, 400)
max_depth = randint(1, 100)

# Hyperparameters that are held fixed; the final model in the next cell reuses them.
learning_rate = 0.2
min_child_weight = 0.2
gamma = 0.8
colsample_bytree = 0.8
colsample_bylevel = 1
subsample = 0.8
scale_pos_weight = 1

fixed_params = {
    'learning_rate': learning_rate,
    'min_child_weight': min_child_weight,
    'gamma': gamma,
    'colsample_bytree': colsample_bytree,
    'colsample_bylevel': colsample_bylevel,
    'subsample': subsample,
    'scale_pos_weight': scale_pos_weight,
}

param_distributions = {'model__n_estimators': n_estimators,
                       'model__max_depth': max_depth
                       }
# Pin the fixed settings as single-value lists. Previously they were defined but
# never handed to the search, so n_estimators/max_depth were tuned on a model
# configured differently from the one that is finally trained.
param_distributions.update(
    {f'model__{name}': [value] for name, value in fixed_params.items()}
)

rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                       scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)

rs = rs.fit(X, y)
print(rs.best_params_)

In [0]:
# run optimized model
# Fixed settings from the search cell combined with the searched
# max_depth / n_estimators taken from rs.best_params_.
hyperparams = dict(
    seed=1909,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    scale_pos_weight=scale_pos_weight,
    colsample_bylevel=colsample_bylevel,
    colsample_bytree=colsample_bytree,
    max_depth=rs.best_params_['model__max_depth'],
    n_estimators=rs.best_params_['model__n_estimators'],
    gamma=gamma,
    subsample=subsample,
    base_score=0.5,
    nthread=-1,
    booster='gbtree',
    objective='binary:logistic',
    silent=True,
    reg_lambda=1,
    missing=None,
    max_delta_step=0,
)

model = xgb.XGBClassifier(**hyperparams)

# Same preprocessing as before, now followed by the tuned classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
pipeline.fit(X, y)

In [0]:
# validating
# 10-fold cross-validation of the tuned pipeline, scored with the custom F1 scorer.
res_cv = cross_validate(
    pipeline, X, y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)
# Mean F1 over the folds, expressed in percent.
res_f1_tr = np.mean(res_cv['train_score']) * 100
res_f1_te = np.mean(res_cv['test_score']) * 100
print(hyperparams)
result = f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'
print(result)

In [0]:
# persists results for later analysis
results_path = mountpath + '/My Drive/seminararbeit/results-xgb.txt'

# One record per run: timestamp, formatted score line and the hyperparameters used.
resultJSON = dict(
    date=dt.now().strftime("%d.%m.%Y, %H:%M:%S"),
    result=result,
    hyperparams=hyperparams,
)

# Append mode keeps the records of earlier runs; each record is written as the
# dict's str() form on its own line.
with open(results_path, 'a') as results_file:
  results_file.write(str(resultJSON) + '\n')
