<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/final_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint, uniform
import datetime as dt
import numpy as np
import pandas as pd 
import json
import urllib.request
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import MinMaxScaler

from xgboost.sklearn import XGBClassifier

# Data Preprocessing


## Feature Additions

In [0]:
df_euribor = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/euribor3m_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_euribor = df_euribor[(df_euribor['date'].dt.year >= 2007)]

def get_euribor(date):
  for i in range(0, len(df_euribor)):

    if date >= df_euribor['date'].iloc[i]:
      last = df_euribor['value'].iloc[i]
      continue
    return round(last, 2)

In [0]:
df_eurostoxx= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/eurostoxx_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_eurostoxx = df_eurostoxx[(df_eurostoxx['date'].dt.year >= 2007)]

def get_eurostoxx(date):
  for i in range(0, len(df_eurostoxx)):

    if date >= df_eurostoxx['date'].iloc[i]:
      last = df_eurostoxx['value'].iloc[i]
      continue
    return last

In [0]:
df_fsi= pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/fsi_ref.csv', 
    index_col=['index'], parse_dates=['Date'])

df_fsi = df_fsi[(df_fsi['Date'].dt.year >= 2007)]

def get_fsi(date):
  for i in range(0, len(df_fsi)):

    if date >= df_fsi['Date'].iloc[i]:
      # Possible values: [OFR FSI, Credit, Equity valuation, Safe assets, Funding, Volatility]
      last = df_fsi['OFR FSI'].iloc[i]
      continue
    return last

In [0]:
df_cpi = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cpi_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cpi = df_cpi[(df_cpi['date'].dt.year >= 2007)]

def get_cpi(date):
  for i in range(0, len(df_cpi)):

    if date >= df_cpi['date'].iloc[i]:
      last = df_cpi['value'].iloc[i]
      continue
    return last

In [0]:
df_cci = pd.read_csv(
    'https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/cci_monthly_ref.csv', 
    index_col=['index'], parse_dates=['date'])

df_cci = df_cci[(df_cci['date'].dt.year >= 2007)]

def get_cci(date):
  for i in range(0, len(df_cci)):

    if date >= df_cci['date'].iloc[i]:
      last = df_cci['value'].iloc[i]
      continue
      
    return last

In [0]:
ltz = {}
with urllib.request.urlopen("https://raw.githubusercontent.com/rgumi/seminararbeit_src/master/refined/leitzinsen_eu.json") as url:
    tmp_ltz = json.loads(url.read().decode())
for key in tmp_ltz.keys():
  ltz[dt.datetime.strptime(key, '%d-%m-%Y')] = tmp_ltz[key]
sorted_ltz = {k: ltz[k] for k in sorted(ltz)}

def get_leitzins(date):
  for key, val in sorted_ltz.items():
    if date >= key:
      last = val
      continue
    return last

# Pipeline

In [8]:
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv', 
    index_col='identifier', parse_dates=['date'])

dataset.insert(len(dataset.columns) -1, "weekday", dataset.date.dt.weekday)
dataset.insert(len(dataset.columns) -1, "day", dataset.date.dt.day)
dataset.insert(len(dataset.columns) -1, "month", dataset.date.dt.month)
dataset.insert(len(dataset.columns) -1, "year", dataset.date.dt.year)
dataset.insert(len(dataset.columns) -1, "quarter", dataset.date.dt.quarter)

# deleted cci (not useful)
dataset.insert(len(dataset.columns)-1, "leitzins", dataset['date'].apply(get_leitzins))
dataset.insert(len(dataset.columns)-1, "euribor", dataset['date'].apply(get_euribor))
dataset.insert(len(dataset.columns)-1, "cpi", dataset['date'].apply(get_cpi))
dataset.insert(len(dataset.columns)-1, "fsi", dataset['date'].apply(get_fsi))
dataset.insert(len(dataset.columns)-1, "eurostoxx", dataset['date'].apply(get_eurostoxx))

dataset = dataset.drop('date', axis=1)

dataset.loc[dataset['days_since_last_contact'] == -1, 'days_since_last_contact'] = 10000

dataset.columns

Index(['age', 'marital_status', 'education', 'job', 'credit_default',
       'housing_loan', 'personal_loan', 'communication_type',
       'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before',
       'previous_conversion', 'duration', 'weekday', 'day', 'month', 'year',
       'quarter', 'leitzins', 'euribor', 'cpi', 'fsi', 'eurostoxx', 'success'],
      dtype='object')

In [0]:
all_features = ['day' ,'quarter', 'education', 'job', 'age', 
             'previous_conversion', 'n_contacts_before',
             'days_since_last_contact', 'n_contacts_campaign',
             'marital_status', 'credit_default', 'duration',
             'housing_loan', 'personal_loan', 'leitzins',
             'eurostoxx', 'fsi', 'cpi']

categorical_features = ['education', 'job', 'previous_conversion',
                        'marital_status', 'quarter', 'credit_default',
                        'personal_loan', 'housing_loan']

numerical_features = ['n_contacts_campaign', 'duration', 
                  'days_since_last_contact', 'age', 'cpi',
                  'leitzins', 'eurostoxx', 'fsi', 'day']

X = dataset[all_features]
y = dataset['success']
y = y.apply(lambda x: 1 if x == "Yes" else 0)

for item in categorical_features:
  try:
    encoded = pd.get_dummies(X[item], prefix=item)
    X.drop(item, axis=1, inplace=True)
    X = X.join(encoded)
  except Exception as e:
    print("Something went wrong?!")
    print(e)
    continue

scaler = MinMaxScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Model Evaluation

The target variable of this problem is bianary (Yes/No). Therefore, we the model needs to be able to handle binary classification. The following models were selected for preselection:
## Logistic Regression
A regression (linear) model outputs a continuous target variable which is transformed to a binary using a decision boundary.
## Decision Tree (Random Forest)
A random forest is (non-linear) model which constructs multiple decision trees which bisect the datasets into different classes by generating multiple decision boundaries.
## Extreme Gradiant Boosting Classification (XGBClassifier)
The XGBClassifier is a random forest which uses gradiant optimization to construct the decision trees (weak learners)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
custom_scorer = make_scorer(f1_score, pos_label=1)

# The evaluation is done by running all models with the same preprocessed datasets
# Every model will use their default hyperparameters. No optimization has been conducted
# The evaluation is done with the F1 Score for all models using 
# a k-fold cross-validation (RepeatedStratifiedKFold)

# function which takes a model and the training data to run a 
# cross_validate with the f1-scorer
def validate_model(model, X, y):
  scoring=custom_scorer
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, 
                               random_state=1909)
  scores = cross_validate(model, X, y, scoring=scoring, cv=cv, 
                          n_jobs=-1, return_train_score=True)
  train_score = np.mean(scores["train_score"])*100
  test_score = np.mean(scores["test_score"])*100
  print(f'Mean F1 Score of {str(model).split("(")[0]}: {train_score:.2f}/{test_score:.2f}\n')
  return {
      'train': train_score,
      'test': test_score
      }

model = LogisticRegression() # default hyperparameters
print(f'Evaluating Model: {str(model).split("(")[0]}')
lgr_score = validate_model(model, X, y)

model = RandomForestClassifier() # else: default hyperparameters
print(model)
print(f'Evaluating Model: {str(model).split("(")[0]}')
rfc_score = validate_model(model, X, y)

model = XGBClassifier() # else: default hyperparameters
print(model)
print(f'Evaluating Model: {str(model).split("(")[0]}')
xgbc_score = validate_model(model, X, y)

Evaluating Model: LogisticRegression
Mean F1 Score of LogisticRegression: 47.77/47.53

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Evaluating Model: RandomForestClassifier
Mean F1 Score of RandomForestClassifier: 99.99/52.37

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,



Mean F1 Score of XGBClassifier: 58.06/56.20



# Hyperparameter optimization

In [0]:
custom_scorer = make_scorer(f1_score, pos_label=1)

def gridsearch(model, param_grid, X, y):
  search = GridSearchCV(model, param_grid=param_grid, scoring=custom_scorer,
                    n_jobs=-1, cv=4, refit=True, error_score=0).fit(X, y)

  print(f'Optimal parameters: {search.best_params_}')
  print(f'Best Score: {search.best_score_}')
  return search.best_params_

def randsearch(model, param_distr, X, y):
  search = RandomizedSearchCV(model, param_distributions=param_distr, 
                              n_iter=10, scoring=custom_scorer, n_jobs=-1, 
                              cv=10, random_state=1909).fit(X, y)

  print(f'Optimal parameters: {search.best_params_}')

  print(f'Best Score: {search.best_score_}')
  return search.best_params_

In [0]:
# basis hyperparams
hyperparams = {
    'scale_pos_weight': 9,
    'booster': 'gbtree', 
    'objective': 'binary:logistic',
    'silent': True, 
    'missing': None,
    'nthread': -1,
}

In [13]:
from scipy.stats import randint, uniform

custom_scorer = make_scorer(f1_score, pos_label=1)

n_estimators = randint(100, 150)
max_depth = randint(10, 50)
learning_rate = uniform(0.005, 0.3)
colsample_bynode = uniform(0.25, 1)
colsample_bylevel = uniform(0.25, 1)
colsample_bytree = uniform(0.25, 1)
subsample = uniform(0.25, 1)
gamma = randint(0, 5)
base_score = uniform(0.2, 0.8)
min_child_weight = uniform(0.2, 1)

param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'subsample': subsample,
    'colsample_bynode': colsample_bynode,
    'colsample_bylevel': colsample_bylevel,
    'colsample_bytree': colsample_bytree,
    'base_score': base_score,
    'gamma': gamma,
    'min_child_weight': min_child_weight,
}

best_params = randsearch(XGBClassifier(**hyperparams), 
                         param_distr=param_grid, X=X, y=y)

Optimal parameters: {'base_score': 0.30070711686028756, 'colsample_bylevel': 0.950693101530163, 'colsample_bynode': 0.40027174935474574, 'colsample_bytree': 0.7309167840661879, 'gamma': 0, 'learning_rate': 0.012986666228874295, 'max_depth': 49, 'min_child_weight': 0.4641502506772979, 'n_estimators': 140, 'subsample': 0.3930716248652387}
Best Score: 0.636426809104316


In [14]:
# with best parameters from gridsearch
f_hyperparams = {**hyperparams, **{
    'learning_rate': best_params['learning_rate'],
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'colsample_bylevel': best_params['colsample_bylevel'],
    'colsample_bynode': best_params['colsample_bynode'],
    'colsample_bytree': best_params['colsample_bytree'],
    'subsample': best_params['subsample'],
    'gamma': best_params['gamma'],
    'base_score': best_params['base_score'],
    'min_child_weight': best_params['min_child_weight'],
    }
}

model = XGBClassifier(**f_hyperparams)
print(model)
validate_model(model, X, y)

XGBClassifier(base_score=0.30070711686028756, booster='gbtree',
              colsample_bylevel=0.950693101530163,
              colsample_bynode=0.40027174935474574,
              colsample_bytree=0.7309167840661879, gamma=0,
              learning_rate=0.012986666228874295, max_delta_step=0,
              max_depth=49, min_child_weight=0.4641502506772979, missing=None,
              n_estimators=140, n_jobs=1, nthread=-1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=9, seed=None, silent=True,
              subsample=0.3930716248652387, verbosity=1)
Mean F1 Score of XGBClassifier: 75.61/63.76



{'test': 63.75749847789783, 'train': 75.6090351291712}

# Evaluating Model with Optimized Parameter


In [0]:
hyperparams = {'seed': 1909, 
               'learning_rate': 0.01, 
               'min_child_weight': 0.8, 
               'scale_pos_weight': 8, 
               'colsample_bylevel': 0.75, 
               'colsample_bytree': 0.65, 
               'colsample_bynode': 0.75, 
               'max_depth': 50, 
               'n_estimators': 100, 
               'gamma': 0.95, 
               'subsample': 0.25, 
               'base_score': 0.5, 
               'reg_lambda': 1, 
               'nthread': -1, 'booster': 'gbtree', 
               'objective': 'binary:logistic', 'silent': True, 
               'missing': None, 'max_delta_step': 0
}

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1909)

In [17]:
model = XGBClassifier(**f_hyperparams)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      9335
           1       0.77      0.54      0.64      1786

    accuracy                           0.90     11121
   macro avg       0.84      0.76      0.79     11121
weighted avg       0.89      0.90      0.89     11121

[[9043  292]
 [ 814  972]]


# Prediction

In [18]:
prediction_dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/prediction-dataset.csv', 
    index_col='identifier', parse_dates=['date'])

prediction_dataset.insert(len(prediction_dataset.columns) -1, "weekday", prediction_dataset.date.dt.weekday)
prediction_dataset.insert(len(prediction_dataset.columns) -1, "day", prediction_dataset.date.dt.day)
prediction_dataset.insert(len(prediction_dataset.columns) -1, "month", prediction_dataset.date.dt.month)
prediction_dataset.insert(len(prediction_dataset.columns) -1, "year", prediction_dataset.date.dt.year)
prediction_dataset.insert(len(prediction_dataset.columns) -1, "quarter", prediction_dataset.date.dt.quarter)

prediction_dataset.insert(len(prediction_dataset.columns)-1, "leitzins", prediction_dataset['date'].apply(get_leitzins))
prediction_dataset.insert(len(prediction_dataset.columns)-1, "euribor", prediction_dataset['date'].apply(get_euribor))
prediction_dataset.insert(len(prediction_dataset.columns)-1, "cpi", prediction_dataset['date'].apply(get_cpi))
prediction_dataset.insert(len(prediction_dataset.columns)-1, "fsi", prediction_dataset['date'].apply(get_fsi))
prediction_dataset.insert(len(prediction_dataset.columns)-1, "eurostoxx", prediction_dataset['date'].apply(get_eurostoxx))

prediction_dataset = prediction_dataset.drop('date', axis=1)
prediction_dataset.columns

Index(['age', 'marital_status', 'education', 'job', 'credit_default',
       'housing_loan', 'personal_loan', 'communication_type',
       'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before',
       'previous_conversion', 'weekday', 'day', 'month', 'year', 'quarter',
       'leitzins', 'euribor', 'cpi', 'fsi', 'eurostoxx', 'duration'],
      dtype='object')

In [19]:
X_pred = prediction_dataset[all_features]

for item in categorical_features:
  try:
    encoded = pd.get_dummies(X_pred[item], prefix=item)
    X_pred.drop(item, axis=1, inplace=True)
    X_pred = X_pred.join(encoded)
  except Exception as e:
    print("Something went wrong?!")
    print(e)
    continue

scaler = MinMaxScaler()
X_pred[numerical_features] = scaler.fit_transform(X_pred[numerical_features])

# uff... Add missing features to the prediction dataset at the position it has
# in the training dataset
for feature in X.columns:
  if not feature in X_pred.columns:
    print(f' X_pred is missing {feature}')
    X_pred.insert(X.columns.tolist().index(feature), feature, 0)

 X_pred is missing credit_default_Yes


In [20]:
mountpath = "/content/drive"
from google.colab import drive
drive.mount(mountpath)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
model = XGBClassifier(**f_hyperparams)
model.fit(X, y)
predictions = model.predict(X_pred)

submission = pd.DataFrame(
    predictions, index=X_pred.index, columns=['prediction'])

matriculation_number = '465527'

submission.to_csv(
    f'{mountpath}/My Drive/seminararbeit/result/submission-{matriculation_number}.csv', index_label='identifier')