<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/seminarpaper-0.0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# pre processing
from scipy.stats import randint, uniform
import pandas as pd 
pd.options.mode.chained_assignment = None
import numpy as np
from sklearn.pipeline import Pipeline
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

#import xgboost as xgb
#from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer

In [0]:
# shows all unique values for each column
def print_columns_unique(df):
  for (name, data) in df.iteritems():
    print(name, data.unique())

# Preprocessing: Date
def get_info_from_date(df):
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    # 6=Sonntag, 5=Samstag, 4=Freitag (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.dayofweek.html)
    # set True wenn Mo-Fr ist
    #df['is_weekday'] = df['date'].dt.dayofweek < 5
    df.drop(['date'], axis=1, inplace=True)
    return df

In [0]:
# setup
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv',
                   index_col='identifier', parse_dates=['date'])

# print_columns_unique(data)

X, y = data.iloc[1:-1], data['success']

# Date Preprocessing
X = get_info_from_date(X)


hyperparams = {'random_state': 1909}

model = RandomForestClassifier(**hyperparams)

In [110]:
# Drop not needed features to reduce complexity and overfitting
X.drop(['communication_type'], axis=1, inplace=True)
print(X.shape, y.shape)
print_columns_unique(X)

(37067, 16) (37069,)
age [32 68 39 30 48 28 42 35 59 40 58 56 49 41 36 53 34 51 50 44 47 29 45 38
 23 37 33 25 55 26 54 31 27 46 57 72 60 62 43 70 52 22 71 21 61 67 24 95
 73 63 20 83 18 85 81 19 98 82 79 76 80 84 66 64 74 65 88 75 69 86 78 77
 17 92 89 94 87 91]
marital_status ['single' 'married' 'divorced' 'unknown']
education ['University' 'Middle School' 'High School' 'Professional Training'
 'Elementary School' 'Unknown' 'Illiterate']
job ['Student' 'Pensioner' 'Administrator' 'Technician' 'Blue-collar worker'
 'Self-employed' 'Unemployed' 'Manager' 'Service provider' 'Housemaid'
 'Founder' 'Unknown']
credit_default ['No' 'Unknown' 'Yes']
housing_loan ['Yes' 'No' 'Unknown']
personal_loan ['No' 'Unknown' 'Yes']
n_contacts_campaign [ 1  3  2  6  4  7 14  9  8 23  5 26 10 11 12 18 24 13 16 21 31 35 27 22
 19 29 15 20 32 17 43 28 33 40 37 42 30 25 56 41 34 39]
days_since_last_contact [-1  6  7  0  3 14  2  1  4 13  5 11 10  9 12  8 17 16 15 18 26 19 22 27
 25 21]
n_contacts_before [0 

In [0]:
# pipeline
categorical_features = ['marital_status', 'education', 'job', 'credit_default', 'housing_loan', 'personal_loan', 'previous_conversion']
numeric_features = ['age', 'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before', 'duration']

numeric_transformer = Pipeline([
    ('scaler', MinMaxScaler()),
])
categorical_transformer = Pipeline ([
    ('onehotencoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer([
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])


In [112]:
# randomized hyperparameter search
custom_scorer = make_scorer(f1_score, pos_label='Yes')
n_estimators = randint(100, 600)
max_depth = randint(50, 400)

param_distributions = { 'model__n_estimators': n_estimators, 
                        'model__max_depth': max_depth
}

rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=5,
                       scoring=custom_scorer, n_jobs=-1, iid=False, cv=10, random_state=1909)

rs = rs.fit(X, y)

ValueError: ignored

In [0]:
# run optimized model
hyperparams = {'n_estimators': rs.best_params_['model__n_estimators'], 
               'criterion': 'gini', 
               'max_depth': rs.best_params_['model__max_depth'],
               'min_samples_split': 2,
               'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto',
               'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None,
               'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': 1909,
               'verbose': 0, 'warm_start': False, 'class_weight': None
               }


model = RandomForestClassifier(**hyperparams)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('model', model)
])
pipeline.fit(X, y)

In [0]:
# validating
res_cv = cross_validate(pipeline, X, y, scoring=custom_scorer, cv=10, return_train_score=True)
res_f1_tr = np.mean(res_cv['train_score']) * 100
res_f1_te = np.mean(res_cv['test_score']) * 100
print(hyperparams)
result = f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'
print(result)


In [0]:
# persists results for later analysis
import json
from datetime import datetime as dt

mountpath = "/content/drive"
from google.colab import drive
drive.mount(mountpath)

resultJSON = {
    "date": dt.now().strftime("%d.%m.%Y, %H:%M:%S"),
    "result": result,
    "hyperparams": hyperparams
}

with open(mountpath + '/My Drive/seminararbeit/results.txt', 'a') as file:
  file.write(str(resultJSON) + '\n')
