<a href="https://colab.research.google.com/github/rgumi/seminararbeit/blob/master/seminarpaper-sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.stats import randint, uniform
import datetime as dt
import json
import numpy as np
import pandas as pd 
import urllib.request
pd.options.mode.chained_assignment = None


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier


#import xgboost as xgb
#from xgboost.sklearn import XGBClassifier

In [0]:
# Mount Google Drive inside the Colab runtime; `mountpath` is reused later
# when the results file is written.
from google.colab import drive

mountpath = "/content/drive"
drive.mount(mountpath)

In [11]:
# Download the key-interest-rate table (JSON object keyed by 'dd-mm-YYYY' strings).
with urllib.request.urlopen("https://raw.githubusercontent.com/rgumi/seminararbeit/master/leitzinsen_eu.json") as response:
    tmp_ltz = json.loads(response.read().decode())

# Re-key the table by datetime so it can be compared against the dataset's dates.
ltz = {
    dt.datetime.strptime(raw_date, '%d-%m-%Y'): rate
    for raw_date, rate in tmp_ltz.items()
}

# Chronologically ordered copy; get_leitzins relies on this ordering.
sorted_ltz = dict(sorted(ltz.items()))
print(sorted_ltz)

def get_leitzins(date, rates=None):
  """Return the key interest rate that was in effect on `date`.

  Parameters
  ----------
  date : datetime-like
      Point in time to look up; must be comparable with the keys of `rates`.
  rates : dict, optional
      Mapping of effective date -> rate, iterated in ascending date order.
      Defaults to the module-level `sorted_ltz`.

  Returns
  -------
  The rate of the latest entry whose effective date is <= `date`, or
  None when `date` lies before the first entry.
  """
  if rates is None:
    rates = sorted_ltz

  last = None
  for key, val in rates.items():
    if date < key:
      # Entries are ascending, so every later entry is in the future too.
      break
    last = val
  # Returning after the loop also covers dates on/after the final entry,
  # for which the loop never hits the `break`.
  return last

def derive_age(age):
  """Bucket a numeric age into a coarse age-group label.

  <= 18 -> 'underage', <= 30 -> 'younger', <= 60 -> 'older', > 60 -> 'old'.
  Values that satisfy none of the comparisons (e.g. NaN) yield None.
  """
  if age <= 18:
    label = 'underage'
  elif age <= 30:
    label = 'younger'
  elif age <= 60:
    label = 'older'
  elif age > 60:
    label = 'old'
  else:
    label = None
  return label
    

{datetime.datetime(2007, 6, 6, 0, 0): 0.04, datetime.datetime(2008, 7, 3, 0, 0): 0.0425, datetime.datetime(2008, 10, 8, 0, 0): 0.0375, datetime.datetime(2008, 11, 6, 0, 0): 0.0325, datetime.datetime(2008, 12, 4, 0, 0): 0.025, datetime.datetime(2009, 1, 15, 0, 0): 0.02, datetime.datetime(2009, 3, 5, 0, 0): 0.015, datetime.datetime(2009, 4, 2, 0, 0): 0.0125, datetime.datetime(2009, 5, 7, 0, 0): 0.01, datetime.datetime(2011, 4, 7, 0, 0): 0.0125}


In [0]:
# shows all unique values for each column
def print_columns_unique(df):
  """Print every column name of `df` followed by that column's unique values.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect; it is not modified.
  """
  # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
  for (name, data) in df.items():
    print(name, data.unique())

# Preprocessing: Date
def get_info_from_date(df):
  """Add calendar features derived from the 'date' column.

  Adds the columns 'year', 'month', 'day' and 'quarter' to `df` in place
  and returns the same frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a datetime-typed 'date' column.

  Returns
  -------
  pandas.DataFrame
      The frame that was passed in, with the four new columns.
  """
  dates = df.loc[:, 'date']
  df['year'] = dates.dt.year
  df['month'] = dates.dt.month
  df['day'] = dates.dt.day
  # Calendar quarter 1-4 (Jan-Mar = 1, ..., Oct-Dec = 4). The previous
  # `month // 4 + 1` put April-July in 2 and only December in 4.
  df['quarter'] = dates.dt.quarter
  # Disabled feature: weekday flag (dayofweek: 4=Friday, 5=Saturday, 6=Sunday;
  # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.dayofweek.html)
  # True when the date falls on Mon-Fri:
  #df['is_weekday'] = df['date'].dt.dayofweek < 5
  return df

In [0]:
# setup
# Load the dataset: 'identifier' becomes the index, 'date' is parsed as datetime.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/prediction-challenge/dataset.csv',
                   index_col='identifier', parse_dates=['date'])

# print_columns_unique(data)

# Features are all columns but the last; the target is 'success'
# (assumes 'success' is the last column of the CSV -- TODO confirm).
# The explicit copy makes X independent of `data`, so the feature engineering
# below does not assign into a slice of the raw frame.
X, y = data.iloc[:, 0:-1].copy(), data.loc[:, 'success']

# Date Preprocessing
X = get_info_from_date(X)
X['age_group'] = X['age'].apply(derive_age)
X['leitzins'] = X['date'].apply(get_leitzins)


# Baseline model; only the seed is fixed, everything else uses library defaults.
hyperparams = {'random_state': 1909}

model = RandomForestClassifier(**hyperparams)

In [16]:
# Drop features that are not needed, to reduce complexity and overfitting.
# The columns are removed from X in place, one after the other.
for obsolete_column in ('communication_type', 'date'):
    X.drop(columns=obsolete_column, inplace=True)
print(X.shape, y.shape)

# Persists current features to csv-file
# X.to_csv(mountpath + '/My Drive/seminararbeit/tmp.csv')
# print_columns_unique(X)

(37069, 18) (37069,)


In [0]:
# pipeline
# Columns that are ordinal-encoded.
categorical_features = [
    'age_group', 'quarter', 'marital_status', 'education', 'job',
    'credit_default', 'housing_loan', 'personal_loan', 'previous_conversion',
]
# Columns that are min-max scaled.
numeric_features = [
    'leitzins', 'day', 'month', 'year', 'age',
    'n_contacts_campaign', 'days_since_last_contact', 'n_contacts_before', 'duration',
]

numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
# NOTE: the step is called 'onehotencoder' but it applies an OrdinalEncoder.
categorical_transformer = Pipeline(steps=[('onehotencoder', OrdinalEncoder())])

# Route each column group through its transformer; columns listed in neither
# group are not passed on to the model.
preprocessor = ColumnTransformer(transformers=[
    ('n_transformer', numeric_transformer, numeric_features),
    ('c_transformer', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by the classifier defined earlier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])



In [0]:
# randomized hyperparameter search
# F1 with 'Yes' as the positive class is the optimisation target.
custom_scorer = make_scorer(f1_score, pos_label='Yes')

# scipy's randint(low, high) samples integers from [low, high) -- `high` is exclusive.
n_estimators = randint(100, 1000)
max_depth = randint(100, 1500)
min_samples_leaf = randint(1, 100)
# An integer min_samples_split must be >= 2; a lower bound of 1 produces
# invalid candidates whose fits fail.
min_samples_split = randint(2, 20)
# MinMaxScaler and OrdinalEncoder both emit one output column per input column,
# so the forest sees exactly this many features. An integer max_features must
# not exceed that count (the old upper bound of 20 allowed a draw of 19).
n_model_features = len(numeric_features) + len(categorical_features)
max_features = randint(1, n_model_features + 1)

param_distributions = { 'model__n_estimators': n_estimators, 
                        'model__max_depth': max_depth,
                        'model__min_samples_leaf': min_samples_leaf,
                        'model__min_samples_split': min_samples_split,
                        'model__max_features': max_features
                       }

# 10 sampled candidates, each evaluated with 10-fold CV on all available cores.
rs = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=10,
                       scoring=custom_scorer, n_jobs=-1, cv=10, random_state=1909)

rs = rs.fit(X, y)

In [0]:
# run optimized model
# Tuned values come from the randomized search; the remaining entries spell out
# the settings this experiment relies on so they end up in the results log.
# 'min_impurity_split' is intentionally absent: the parameter was removed from
# RandomForestClassifier in scikit-learn 1.0, and passing it raises a TypeError.
hyperparams = {'n_estimators': rs.best_params_['model__n_estimators'], 
               'criterion': 'gini', 
               'max_depth': rs.best_params_['model__max_depth'],
               'min_samples_leaf': rs.best_params_['model__min_samples_leaf'], 
               'min_samples_split': rs.best_params_['model__min_samples_split'],
               'max_features': rs.best_params_['model__max_features'],

               'min_weight_fraction_leaf': 0.0, 
               'max_leaf_nodes': None, 
               'min_impurity_decrease': 0.0, 
               'bootstrap': True, 'oob_score': False, 
               'n_jobs': None, 'random_state': 1909,
               'verbose': 0, 'warm_start': False, 
               'class_weight': None
               }


model = RandomForestClassifier(**hyperparams)

# Rebuild the pipeline around the tuned model and fit it on the full data set.
pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('model', model)
])
pipeline.fit(X, y)

In [0]:
# validating
# 10-fold cross-validation of the tuned pipeline, scored with the custom F1 scorer;
# train scores are kept so the train/test gap can be reported.
res_cv = cross_validate(
    pipeline, X, y,
    scoring=custom_scorer,
    cv=10,
    return_train_score=True,
)

# Mean F1 per split type, expressed in percent.
res_f1_tr = np.mean(res_cv['train_score']) * 100
res_f1_te = np.mean(res_cv['test_score']) * 100

print(hyperparams)
result = f'Average F1 on Training and Test Sets: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'
print(result)


In [0]:
# persists results for later analysis

resultJSON = {
    # `dt` is the datetime *module* (import datetime as dt), so the class has to
    # be addressed explicitly; `dt.now()` raises AttributeError.
    "date": dt.datetime.now().strftime("%d.%m.%Y, %H:%M:%S"),
    "result": result,
    "hyperparams": hyperparams
}

# Append one line per run. Note that the record is written as the dict's
# Python repr (str(...)), not as strict JSON.
with open(mountpath + '/My Drive/seminararbeit/results-sklearn.txt', 'a') as file:
  file.write(str(resultJSON) + '\n')