<a href="https://colab.research.google.com/github/rgumi/seminararbeit_src/blob/master/feb_v7_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint, uniform
import datetime as dt
import numpy as np
import pandas as pd 
import json
import urllib.request
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier, XGBRFClassifier


from imblearn.metrics import classification_report_imbalanced

# Data Preprocessing


## Feature Additions

In [0]:
df_euribor = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/euribor3m_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_euribor = df_euribor[(df_euribor['date'].dt.year >= 2007)]

def get_euribor(date):
  for i in range(0, len(df_euribor)):

    if date >= df_euribor['date'].iloc[i]:
      last = df_euribor['value'].iloc[i]
      continue
    return round(last, 2)

In [0]:
df_eurostoxx= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/eurostoxx_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_eurostoxx = df_eurostoxx[(df_eurostoxx['date'].dt.year >= 2007)]

def get_eurostoxx(date):
  for i in range(0, len(df_eurostoxx)):

    if date >= df_eurostoxx['date'].iloc[i]:
      last = df_eurostoxx['value'].iloc[i]
      continue
    return last

In [0]:
df_fsi= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/fsi_ref.csv', 
    index_col=['index'], parse_dates=['Date'])

df_fsi = df_fsi[(df_fsi['Date'].dt.year >= 2007)]

def get_fsi(date):
  for i in range(0, len(df_fsi)):

    if date >= df_fsi['Date'].iloc[i]:
      # Possible values: [OFR FSI, Credit, Equity valuation, Safe assets, Funding, Volatility]
      last = df_fsi['OFR FSI'].iloc[i]
      continue
    return last

In [0]:
df_cpi = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cpi_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cpi = df_cpi[(df_cpi['date'].dt.year >= 2007)]

def get_cpi(date):
  for i in range(0, len(df_cpi)):

    if date >= df_cpi['date'].iloc[i]:
      last = df_cpi['value'].iloc[i]
      continue
    return last

In [0]:
df_cci = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cci_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cci = df_cci[(df_cci['date'].dt.year >= 2007)]

def get_cci(date):
  for i in range(0, len(df_cci)):

    if date >= df_cci['date'].iloc[i]:
      last = df_cci['value'].iloc[i]
      continue
      
    return last

In [0]:
ltz = {}
with urllib.request.urlopen("https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/leitzinsen_eu.json") as url:
    tmp_ltz = json.loads(url.read().decode())
for key in tmp_ltz.keys():
  ltz[dt.datetime.strptime(key, '%d-%m-%Y')] = tmp_ltz[key]
sorted_ltz = {k: ltz[k] for k in sorted(ltz)}

def get_leitzins(date):
  for key, val in sorted_ltz.items():
    if date >= key:
      last = val
      continue
    return last

In [0]:
def replace_loan(loan):
  if loan == 'Unknown':
    return 1
  if loan == 'No':
    return 2
  if loan == 'Yes':
    return 3

# Pipeline

In [0]:
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv', 
    index_col='identifier', parse_dates=['date'])

dataset.insert(len(dataset.columns) -1, "weekday", dataset.date.dt.weekday)
dataset.insert(len(dataset.columns) -1, "day", dataset.date.dt.day)
dataset.insert(len(dataset.columns) -1, "month", dataset.date.dt.month)
dataset.insert(len(dataset.columns) -1, "year", dataset.date.dt.year)
dataset.insert(len(dataset.columns) -1, "quarter", dataset.date.dt.quarter)

dataset.insert(len(dataset.columns)-1, "leitzins", dataset['date'].apply(get_leitzins))
dataset.insert(len(dataset.columns)-1, "euribor", dataset['date'].apply(get_euribor))
dataset.insert(len(dataset.columns)-1, "cci", dataset['date'].apply(get_cci))
dataset.insert(len(dataset.columns)-1, "cpi", dataset['date'].apply(get_cpi))
dataset.insert(len(dataset.columns)-1, "fsi", dataset['date'].apply(get_fsi))
dataset.insert(len(dataset.columns)-1, "eurostoxx", dataset['date'].apply(get_eurostoxx))

dataset = dataset.drop('date', axis=1)

dataset.loc[dataset['days_since_last_contact'] == -1, 'days_since_last_contact'] = 10000

dataset.columns

Index(['age', 'marital_status', 'education', 'job', 'credit_default',
       'housing_loan', 'personal_loan', 'communication_type',
       'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before',
       'previous_conversion', 'duration', 'weekday', 'day', 'month', 'year',
       'quarter', 'leitzins', 'euribor', 'cci', 'cpi', 'fsi', 'eurostoxx',
       'success'],
      dtype='object')

In [0]:
X = dataset[['quarter', 'education', 'job', 'age', 'previous_conversion', 'n_contacts_before',
       'days_since_last_contact', 'n_contacts_campaign', 'marital_status', 'credit_default',
       'duration', 'housing_loan', 'personal_loan',
       'euribor', 'leitzins', 'eurostoxx',
       ]]
       
y = dataset['success']
y = y.apply(lambda x: 1 if x == "Yes" else 0)

X['credit_default'] = X['credit_default'].apply(replace_loan)
X['personal_loan'] = X['personal_loan'].apply(replace_loan)
X['housing_loan'] = X['housing_loan'].apply(replace_loan)


categorical_features = ['education', 'job', 'previous_conversion',
                        'marital_status']

for item in categorical_features:
  try:
    encoded = pd.get_dummies(X[item], prefix=item)
    X.drop(item, axis=1, inplace=True)
    X = X.join(encoded)
  except Exception as e:
    print("Something went wrong?!")
    print(e)
    continue

scaler = MinMaxScaler()
numerical_features = ['n_contacts_campaign', 'duration', 
                  'days_since_last_contact', 'age',
                  'euribor', 'leitzins', 'eurostoxx', 
                  ]

X[numerical_features] = scaler.fit_transform(X[numerical_features])
X

# Hyperparameter optimization

In [0]:
custom_scorer = make_scorer(f1_score, pos_label=1)
# randomized hyperparameter optimization
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def optimization(model):
  print(f'Model: {str(model)}')
  ## default is 3
  n_estimators = randint(90, 150)
  ## default is 100
  max_depth = randint(5, 25)
  ## default is 0.1 and 1 respectively
  learning_rate = uniform()

  # Sampling
  ## default is 1
  colsample_bytree = uniform()
  ## default is 1
  colsample_bylevel = uniform()
  ## default is 1
  colsample_bynode = uniform()

  # default is 1
  min_child_weight = uniform()


  gamma = uniform()
  base_score = uniform()
  subsample = uniform()

  # optimization parameters
  param_distributions = {'model__n_estimators': n_estimators,
                         'model__max_depth': max_depth,
                         'model__gamma': gamma,
                         'model__min_child_weight': min_child_weight,
                         'model__base_score': base_score,
                         'model__subsample': subsample,
                         'model__colsample_bylevel': colsample_bylevel,
                         'model__colsample_bytree': colsample_bytree,
                         'model__colsample_bynode': colsample_bynode,
                        }
                        
  search = RandomizedSearchCV(model, param_distributions=param_distributions, n_iter=20,
                        scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)
  
  search = search.fit(X, y)
  return search.best_params_

In [0]:
from xgboost import XGBClassifier

hyperparams = {'seed': 1909,
                'nthread': -1,
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'silent': True,
                'reg_lambda': 1,
                'missing': None,
                'max_delta_step': 0,
                'n_estimators': 94,
                'max_deph': 11,
                'gamma': 0,
                'learning_rate': 0.3,
                'base_score': 0.5,
                'min_child_weight': 1,
                'scale_pos_weight': 9,
                'subsample': 1,
                'colsample_bylevel': 1,
                'colsample_bytree': 1,
                'colsample_bynode': 1,
                'reg_lambda': 1,
              }

xg_classifier = XGBClassifier(**hyperparams)
xg_classifier_params = optimization(xg_classifier)

print(xg_classifier_params)

# Evaluating Model with Optimized Parameter

In [0]:
hyperparams = {'seed': 1909, 
               'learning_rate': 0.3, 
               'min_child_weight': xg_classifier_params['model__min_child_weight'], 
               'scale_pos_weight': 8, 
               'colsample_bylevel': 0.8, # xg_classifier_params['model__colsample_bylevel'], 
               'colsample_bytree': xg_classifier_params['model__colsample_bytree'], 
               'colsample_bynode': xg_classifier_params['model__colsample_bynode'], 
               'max_depth': xg_classifier_params['model__max_depth'], 
               'n_estimators': xg_classifier_params['model__n_estimators'], 
               'gamma': xg_classifier_params['model__gamma'], 
               'subsample': xg_classifier_params['model__subsample'], 
               'base_score': xg_classifier_params['model__base_score'], 
               'reg_lambda': 1, 
               'nthread': -1, 'booster': 'gbtree', 
               'objective': 'binary:logistic', 'silent': True, 
               'missing': None, 'max_delta_step': 0
}

model = XGBClassifier(**hyperparams)

In [0]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
scores = cross_validate(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1, return_train_score=True)
print(f'Mean Roc_Auc Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

In [0]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
scores = cross_validate(model, X, y, scoring='f1', cv=cv, n_jobs=-1, return_train_score=True)
print(f'Mean F1 Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)

In [0]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

# Test Oversampling

In [0]:
def roc_cv_score(model, X, y):
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
  scores = cross_validate(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1, return_train_score=True)
  print(f'Mean Roc_Auc Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

In [0]:
def f1_cv_score(model, X, y):
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1909)
  scores = cross_validate(model, X, y, scoring='f1', cv=cv, n_jobs=-1, return_train_score=True)
  print(f'Mean F1 Score: {np.mean(scores["train_score"])*100:.2f}/{np.mean(scores["test_score"])*100:.2f}')

In [0]:
hyperparams = {
    'seed': 1909,
    'nthread': -1,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'silent': True,
    'reg_lambda': 1,
    'missing': None,
    'max_delta_step': 0,
    'n_estimators': 250,
    'max_deph': 3,
    'gamma': 0,
    'learning_rate': 0.12,
    'base_score': 0.5,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'subsample': 1,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'colsample_bynode': 1,
    'reg_lambda': 1,
}



hyperparams = {
    'n_estimators': 150,
    'max_deph': 8,
    'learning_rate': 0.10,
    'base_score': 0.5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.6,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'colsample_bynode': 1,
    'reg_lambda': 1,
    'max_delta_step ': 0,
    'scale_pos_weight': 1,
}


hyperparams = {
    'n_estimators': 100,
    'max_deph': 50,
    'learning_rate': 0.2,
    'base_score': 0.5,
    'min_child_weight': 0.8,
    'gamma': 2,
    'subsample': 0.25,
    'colsample_bylevel': 0.75,
    'colsample_bytree': 0.65,
    'colsample_bynode': 0.75,
    'reg_lambda': 1,
    'max_delta_step ': 0,
    'scale_pos_weight': 1,
}

In [0]:
 from imblearn.over_sampling import SMOTE, ADASYN
 from collections import Counter

 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)

print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_test).items()))

X_resampled, y_resampled = ADASYN(random_state = 1909).fit_resample(X_train, y_train)
X_r_train = pd.DataFrame(X_resampled, columns=X.columns)
print(sorted(Counter(y_resampled).items()))

In [0]:
model = XGBClassifier(**hyperparams)
model.fit(X_r_train, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

In [0]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#model = RandomForestRegressor(criterion= 'mse')
model = RandomForestClassifier(n_estimators=150, max_depth=10)
f1_cv_score(model, X, y)
model.fit(X_r_train, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

Mean F1 Score: 54.69/42.73
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8789
           1       0.87      0.47      0.61      2332

    accuracy                           0.87     11121
   macro avg       0.87      0.73      0.77     11121
weighted avg       0.87      0.87      0.86     11121

[[8621  168]
 [1236 1096]]


In [0]:
_y_pred = []
for x in y_pred:
  a = 0
  if x > 0.6:
    a = 1
  _y_pred.append(a)
print(classification_report(_y_pred, y_test))
print(confusion_matrix(_y_pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      9925
           1       0.53      0.56      0.55      1196

    accuracy                           0.90     11121
   macro avg       0.74      0.75      0.75     11121
weighted avg       0.90      0.90      0.90     11121

[[9336  589]
 [ 521  675]]


In [0]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_test).items()))

X_resampled, y_resampled = SMOTETomek(random_state = 1909).fit_resample(X_train, y_train)
X_r_train = pd.DataFrame(X_resampled, columns=X.columns)
print(sorted(Counter(y_resampled).items()))


model = XGBClassifier(**hyperparams)
model.fit(X_r_train, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

[(0, 23036), (1, 2912)]
[(0, 9857), (1, 1264)]




[(0, 22767), (1, 22767)]
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      9415
           1       0.75      0.55      0.64      1706

    accuracy                           0.90     11121
   macro avg       0.83      0.76      0.79     11121
weighted avg       0.90      0.90      0.90     11121

[[9094  321]
 [ 763  943]]


# Test Undersampling

In [0]:
from imblearn.under_sampling import RandomUnderSampler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_test).items()))

X_resampled, y_resampled = RandomUnderSampler(random_state = 1909).fit_resample(X_train, y_train)
X_r_train = pd.DataFrame(X_resampled, columns=X.columns)
print(sorted(Counter(y_resampled).items()))

model = XGBClassifier(**hyperparams)
model.fit(X_r_train, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

In [0]:
from imblearn.under_sampling import ClusterCentroids

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_test).items()))

X_resampled, y_resampled = ClusterCentroids(random_state = 1909).fit_resample(X_train, y_train)
X_r_train = pd.DataFrame(X_resampled, columns=X.columns)
print(sorted(Counter(y_resampled).items()))

model = XGBClassifier(**hyperparams)
model.fit(X_r_train, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))