<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/seminarpaper-xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# pre processing
from scipy.stats import randint, uniform
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

#import xgboost as xgb
#from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer

In [0]:
# setup
# Load the prediction-challenge dataset; rows are indexed by `identifier`
# and the `date` column is parsed into datetimes.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv',
                   index_col='identifier', parse_dates=['date'])
# To inspect the unique values of every column:
# for name, column in data.items():
#   print(name, column.unique())

# Features: every column except the first and the last one, minus
# `communication_type`. drop() returns a new frame here instead of mutating
# the iloc slice in place, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): presumably the first column is `date` and the last is
# `success` -- confirm against the CSV header.
X = data.iloc[:, 1:-1].drop(columns=['communication_type'])
# Target: the `success` column.
y = data['success']

# Baseline model: random forest with library defaults and a fixed seed.
hyperparams = { 'random_state': 1909
}
model = RandomForestClassifier(**hyperparams)

In [0]:
# pipeline
# Columns grouped by the preprocessing they receive. 'communication_type' is
# not listed because it is dropped from X in the setup cell.
numeric_features = ['age', 'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before']
categorical_features = ['marital_status', 'education', 'job', 'credit_default', 'housing_loan', 'personal_loan', 'previous_conversion']

# Numeric columns are rescaled with MinMaxScaler.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical columns are integer-coded with OrdinalEncoder. The step is
# still called 'onehotencoder'; the name is kept as-is because it is part of
# the pipeline's parameter paths.
categorical_transformer = Pipeline(steps=[('onehotencoder', OrdinalEncoder())])

# Route each column group to its transformer; numeric outputs come first,
# categorical outputs second.
preprocessor = ColumnTransformer(transformers=[
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by the classifier defined earlier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])


In [34]:
# randomized hyperparameter search
# Candidates are scored by F1 with 'Yes' treated as the positive label.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

# scipy.stats.randint(low, high) draws integers from [low, high).
n_estimators = randint(100, 600)
max_depth = randint(50, 400)

# The 'model__' prefix addresses the 'model' step of the pipeline.
param_distributions = { 'model__n_estimators': n_estimators, 
                        'model__max_depth': max_depth
}

# 5 sampled candidates x 10-fold CV, run on all available cores.
# `iid` is intentionally not passed: the parameter was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a
# TypeError. Leaving it out matches the previous iid=False behaviour on 0.22+.
rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                       scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)

rs = rs.fit(X, y)



KeyboardInterrupt: ignored

In [0]:
# run optimized model
# Requires the randomized search cell to have run to completion, since
# `rs.best_params_` only exists on a fitted search.
#
# 'min_impurity_split' and 'max_features': 'auto' are deliberately not listed:
# both were removed from RandomForestClassifier in later scikit-learn releases
# (1.0 and 1.3 respectively) and passing them raises an error there. The
# values previously given here were the classifier defaults, so omitting them
# leaves the model unchanged.
hyperparams = {'n_estimators': rs.best_params_['model__n_estimators'], 
               'criterion': 'gini', 
               'max_depth': rs.best_params_['model__max_depth'],
               'min_samples_split': 2,
               'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0,
               'max_leaf_nodes': None, 'min_impurity_decrease': 0.0,
               'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': 1909,
               'verbose': 0, 'warm_start': False, 'class_weight': None
               }


model = RandomForestClassifier(**hyperparams)

# Rebuild the pipeline around the tuned model, reusing the same preprocessor,
# and fit it on the full data set.
pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('model', model)
])
pipeline.fit(X, y)

In [0]:
# validating
# 10-fold cross-validation of the current pipeline with the custom F1 scorer;
# train-fold scores are requested as well so both can be reported.
res_cv = cross_validate(
    pipeline, X, y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)

# Mean per-fold F1, expressed as a percentage.
res_f1_tr = res_cv['train_score'].mean() * 100
res_f1_te = res_cv['test_score'].mean() * 100

# Report the hyperparameters first, then the summary line (also reused when
# the results are persisted).
print(hyperparams)
result = f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'
print(result)


In [0]:
# persists results for later analysis
import json
# Imported under its own name: the imports cell already binds `dt` to the
# datetime *module*, and aliasing the class to `dt` here would silently
# rebind that global for every cell run afterwards.
from datetime import datetime

# Mount Google Drive (Colab only) so the results file survives the session.
mountpath = "/content/drive"
from google.colab import drive
drive.mount(mountpath)

# One record per run: timestamp, the F1 summary line and the hyperparameters
# that produced it.
resultJSON = {
    "date": datetime.now().strftime("%d.%m.%Y, %H:%M:%S"),
    "result": result,
    "hyperparams": hyperparams
}

# Append the record as one line. Despite the variable name, the record is
# written as the Python repr of the dict (via str()), not as JSON.
with open(mountpath + '/My Drive/seminararbeit/results.txt', 'a') as file:
  file.write(str(resultJSON) + '\n')
