<a href="https://colab.research.google.com/github/rgumi/seminararbeit_src/blob/master/feb_final_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint, uniform
import datetime as dt
import numpy as np
import pandas as pd 
import json
import urllib.request
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

from xgboost.sklearn import XGBClassifier
from xgboost import XGBRFClassifier

In [0]:
mountpath = "/content/drive"
from google.colab import drive
drive.mount(mountpath)

# Data Preprocessing


## Feature Additions

In [0]:
df_euribor = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/euribor3m_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_euribor = df_euribor[(df_euribor['date'].dt.year >= 2007)]

def get_euribor(date):
  for i in range(0, len(df_euribor)):

    if date >= df_euribor['date'].iloc[i]:
      last = df_euribor['value'].iloc[i]
      continue
    return round(last, 2)

In [0]:
df_eurostoxx= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/eurostoxx_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_eurostoxx = df_eurostoxx[(df_eurostoxx['date'].dt.year >= 2007)]

def get_eurostoxx(date):
  for i in range(0, len(df_eurostoxx)):

    if date >= df_eurostoxx['date'].iloc[i]:
      last = df_eurostoxx['value'].iloc[i]
      continue
    return last

In [0]:
df_fsi= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/fsi_ref.csv', 
    index_col=['index'], parse_dates=['Date'])

df_fsi = df_fsi[(df_fsi['Date'].dt.year >= 2007)]

def get_fsi(date):
  for i in range(0, len(df_fsi)):

    if date >= df_fsi['Date'].iloc[i]:
      # Possible values: [OFR FSI, Credit, Equity valuation, Safe assets, Funding, Volatility]
      last = df_fsi['OFR FSI'].iloc[i]
      continue
    return last

In [0]:
df_cpi = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cpi_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cpi = df_cpi[(df_cpi['date'].dt.year >= 2007)]

def get_cpi(date):
  for i in range(0, len(df_cpi)):

    if date >= df_cpi['date'].iloc[i]:
      last = df_cpi['value'].iloc[i]
      continue
    return last

In [0]:
df_cci = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cci_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cci = df_cci[(df_cci['date'].dt.year >= 2007)]

def get_cci(date):
  for i in range(0, len(df_cci)):

    if date >= df_cci['date'].iloc[i]:
      last = df_cci['value'].iloc[i]
      continue
      
    return last

In [0]:
ltz = {}
with urllib.request.urlopen("https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/leitzinsen_eu.json") as url:
    tmp_ltz = json.loads(url.read().decode())
for key in tmp_ltz.keys():
  ltz[dt.datetime.strptime(key, '%d-%m-%Y')] = tmp_ltz[key]
sorted_ltz = {k: ltz[k] for k in sorted(ltz)}

def get_leitzins(date):
  for key, val in sorted_ltz.items():
    if date >= key:
      last = val
      continue
    return last

In [0]:
def replace_loan(loan):
  if loan == 'Unknown':
    return 1
  if loan == 'No':
    return 2
  if loan == 'Yes':
    return 3

# Pipeline

In [11]:
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv', 
    index_col='identifier', parse_dates=['date'])

dataset.insert(len(dataset.columns) -1, "weekday", dataset.date.dt.weekday)
dataset.insert(len(dataset.columns) -1, "day", dataset.date.dt.day)
dataset.insert(len(dataset.columns) -1, "month", dataset.date.dt.month)
dataset.insert(len(dataset.columns) -1, "year", dataset.date.dt.year)
dataset.insert(len(dataset.columns) -1, "quarter", dataset.date.dt.quarter)

dataset.insert(len(dataset.columns)-1, "leitzins", dataset['date'].apply(get_leitzins))
dataset.insert(len(dataset.columns)-1, "euribor", dataset['date'].apply(get_euribor))
dataset.insert(len(dataset.columns)-1, "cpi", dataset['date'].apply(get_cpi))
dataset.insert(len(dataset.columns)-1, "fsi", dataset['date'].apply(get_fsi))
dataset.insert(len(dataset.columns)-1, "eurostoxx", dataset['date'].apply(get_eurostoxx))

dataset = dataset.drop('date', axis=1)

dataset.loc[dataset['days_since_last_contact'] == -1, 'days_since_last_contact'] = 10000

dataset.columns

Index(['age', 'marital_status', 'education', 'job', 'credit_default',
       'housing_loan', 'personal_loan', 'communication_type',
       'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before',
       'previous_conversion', 'duration', 'weekday', 'day', 'month', 'year',
       'quarter', 'leitzins', 'euribor', 'cci', 'cpi', 'fsi', 'eurostoxx',
       'success'],
      dtype='object')

In [0]:
X = dataset[['quarter', 'education', 'job', 'age', 'previous_conversion', 'n_contacts_before',
       'days_since_last_contact', 'n_contacts_campaign', 'marital_status', 'credit_default',
       'duration', 'housing_loan', 'personal_loan',
       'euribor', 'leitzins', 'eurostoxx', 'fsi']]
       
y = dataset['success']
y = y.apply(lambda x: 1 if x == "Yes" else 0)

#X['credit_default'] = X['credit_default'].apply(replace_loan)
#X['personal_loan'] = X['personal_loan'].apply(replace_loan)
#X['housing_loan'] = X['housing_loan'].apply(replace_loan)


categorical_features = ['education', 'job', 'previous_conversion',
                        'marital_status', 'quarter', 'credit_default',
                        'personal_loan', 'housing_loan']

for item in categorical_features:
  try:
    encoded = pd.get_dummies(X[item], prefix=item)
    X.drop(item, axis=1, inplace=True)
    X = X.join(encoded)
  except Exception as e:
    print("Something went wrong?!")
    print(e)
    continue

scaler = MinMaxScaler()
numerical_features = ['n_contacts_campaign', 'duration', 
                  'days_since_last_contact', 'age',
                  'euribor', 'leitzins', 'eurostoxx', 'fsi']


X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Hyperparameter optimization

In [0]:
custom_scorer = make_scorer(f1_score, pos_label=1)
# randomized hyperparameter optimization
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def rand_search(model, param_distributions):
  print(f'Model: {str(model)}')

  search = RandomizedSearchCV(model, param_distributions=param_distributions, n_iter=5,
                        scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)
  search = search.fit(X, y)
  print(search.best_params_)
  return search.best_params_

In [0]:
from xgboost import XGBClassifier

hyperparams = {'seed': 1909,
                'nthread': -1,
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'silent': True,
                'reg_lambda': 1,
                'missing': None,
                'max_delta_step': 0,
                'n_estimators': 94,
                'max_deph': 11,
                'gamma': 0,
                'learning_rate': 0.3,
                'base_score': 0.5,
                'min_child_weight': 1,
                'scale_pos_weight': 9,
                'subsample': 1,
                'colsample_bylevel': 1,
                'colsample_bytree': 1,
                'colsample_bynode': 1,
                'reg_lambda': 1,
              }

model = XGBClassifier(**hyperparams)
#best_params = optimization(model)

In [0]:
custom_scorer = make_scorer(f1_score, pos_label=1)

def gridsearch(model, param_grid, X, y):
  search = GridSearchCV(model, param_grid=param_grid, scoring=custom_scorer,
                    n_jobs=-1, cv=4, refit=True, error_score=0, iid=False).fit(X, y)

  print(f'Optimal parameters: {search.best_params_}')
  print(search.best_score_)
  return search.best_params_

In [101]:
# 1st iteration: learning rate
learning_rate = [0.2, 0.3, 0.5]
param_grid = {
    'learning_rate': learning_rate,
}

best_params = gridsearch(model=XGBClassifier(), 
                         param_grid=param_grid, X=X, y=y)



Optimal parameters: {'learning_rate': 0.3}
0.5723891265618569


In [102]:
# 2nd iteration: max_depth & n_estimators
n_estimators = [80, 100, 120]
max_depth = [6, 12, 20]
hyperparams = {
    'learning_rate': best_params['learning_rate'],
}
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}

best_params = gridsearch(model=XGBClassifier(**hyperparams), 
                         param_grid=param_grid, X=X, y=y)



Optimal parameters: {'max_depth': 6, 'n_estimators': 80}
0.5627577581384152


In [103]:
# 3rd iteration: subsampling

colsample_bytree = [0.5, 0.6, 0.7]
colsample_bylevel = [0.6, 0.75, 0.8]
colsample_bynode = [0.6, 0.75, 0.8]
subsample = [0.3, 0.6, 0.8]

param_grid = {
    'colsample_bytree': colsample_bytree,
    'subsample': subsample,
    
}
# with best parameters from (1st)
hyperparams = { **hyperparams, **{
    'n_estimator': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    }
}

best_params = gridsearch(model=XGBClassifier(**hyperparams),
                         param_grid=param_grid, X=X, y=y)



Optimal parameters: {'colsample_bytree': 0.5, 'subsample': 0.8}
0.5617569720941052


In [104]:
# last iteration: learning rate
learning_rate = [0.15, 0.1, 0.05]
param_grid = {
    'learning_rate': learning_rate,
}

hyperparams = { **hyperparams, **{
    'colsample_bytree': best_params['colsample_bytree'],
    'subsample': best_params['subsample'],
    }
}

best_params = gridsearch(model=XGBClassifier(**hyperparams), 
                         param_grid=param_grid, X=X, y=y)



Optimal parameters: {'learning_rate': 0.1}
0.5678690432195094


In [0]:
# with best parameters from gridsearch
f_hyperparams = { **hyperparams, **{
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 35,
    'scale_pos_weight': 9,
    'min_child_weight': 0.8, 
    'colsample_bylevel': 0.75,
    'colsample_bynode': 0.75,
    'colsample_bytree': 0.75,
    'booster': 'gbtree', 
    'objective': 'binary:logistic',
    'silent': True, 
    'missing': None,
    }
}

model = XGBClassifier(**f_hyperparams)
print(model)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
scores = cross_validate(model, X, y, scoring='f1', cv=cv, n_jobs=-1, return_train_score=True)
print(f'Mean F1 Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.75, colsample_bytree=0.75, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=35,
              min_child_weight=0.8, missing=None, n_estimator=80,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=9, seed=None, silent=True,
              subsample=0.8, verbosity=1)


# Evaluating Model with Optimized Parameter

In [0]:
hyperparams = {'seed': 1909, 
               'learning_rate': 0.01, 
               'min_child_weight': 0.8, 
               'scale_pos_weight': 8, 
               'colsample_bylevel': 0.75, 
               'colsample_bytree': 0.65, 
               'colsample_bynode': 0.75, 
               'max_depth': 50, 
               'n_estimators': 100, 
               'gamma': 0.95, 
               'subsample': 0.25, 
               'base_score': 0.5, 
               'reg_lambda': 1, 
               'nthread': -1, 'booster': 'gbtree', 
               'objective': 'binary:logistic', 'silent': True, 
               'missing': None, 'max_delta_step': 0
}

model = XGBClassifier(**hyperparams)

In [0]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
scores = cross_validate(model, X, y, scoring='f1', cv=cv, n_jobs=-1, return_train_score=True)
print(f'Mean F1 Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)

In [0]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))