In [1]:
from functools import partial
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK 
from hyperopt.pyll import scope

%matplotlib inline

# random state
RS = 1 

In [2]:
df = pd.read_csv('adult.data.csv')
df.drop_duplicates(inplace=True, ignore_index=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
X = df.drop(['salary', 'native-country'], axis=1).copy()
y = df['salary'].map({'<=50K':0,'>50K':1}).values

# pipeline for features

num_columns = np.where(X.dtypes != 'object')[0]
cat_columns = np.where(X.dtypes == 'object')[0]

cat_pipe = Pipeline([('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
                     ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
num_pipe = Pipeline([('scaler', StandardScaler())])
transformer = ColumnTransformer(
                           transformers=[('cat', cat_pipe, cat_columns),
                                         ('num', num_pipe, num_columns)], 
                                         remainder='passthrough') 

In [4]:
# search space for algorithms

search_space = {'clf_type': hp.choice('clf_type', 
                    [{'type': 'lr', 
                      'params':{
                              'penalty': hp.choice('penalty', options=['l1', 'l2']),
                              'C': hp.loguniform('C', low=-4*np.log(10), 
                                                      high=2*np.log(10))
                               }
                     }, 
                      
                    {'type': 'dt', 
                      'params':{
                              'max_depth': scope.int(hp.uniform('max_depth', 
                                                                low=4, high=15)),
                              'max_features': hp.uniform('max_features', 
                                                                low=0.25, high=0.9)
                              }
                     },
                     
                    {'type': 'knn', 
                      'params':{
                              'n_neighbors': scope.int(hp.uniform('n_neighbors', 
                                                                  low=10, high=31)),
                              'p' : scope.int(hp.uniform('p', 
                                                         low=1, high=5)),
                              }
                     }  
                 ])}

In [5]:
# build model for target function

def buld_model(parameters, transformer, r_state):
    """
    Формируем модель с указанными гиперпараметрами 
    
    :parameters: параметры
    :transformer: column transformer для модели
    :r_state: random state
    :return: модель
    """ 
    
    if parameters['clf_type']['type'] == 'lr':
        model = Pipeline([('transformer', transformer),
                          ('clf', LogisticRegression(random_state=r_state,
                                                     solver='liblinear', n_jobs=-1, 
                                                **parameters['clf_type']['params']))])
    
    elif parameters['clf_type']['type'] == 'dt':
        model = Pipeline([('transformer', transformer),
                          ('clf', DecisionTreeClassifier(random_state=r_state,
                                                **parameters['clf_type']['params']))])
        
    elif parameters['clf_type']['type'] == 'knn':
        model = Pipeline([('transformer', transformer),
                          ('clf', KNeighborsClassifier(n_jobs=-1,
                                                **parameters['clf_type']['params']))])
    else: 
        raise KeyError('Unknown classifier: {}'.format(parameters['clf_type']['type']))
        
    return model

# target function

def objective(parameters, transformer, X_train, y_train, r_state):
    """
    Кросс-валидация с текущими гиперпараметрами
    
    :param params: гиперпараметры
    :transformer: column transformer для модели
    :X_train: матрица признаков
    :y_train: вектор меток объектов
    :r_state: random state
    :return: средняя точность на кросс-валидации
    """     
    model = buld_model(parameters, transformer, r_state)     
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=r_state)
    score = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc', n_jobs=-1)
    
    return {'loss': score.mean(), 'params': parameters['clf_type']['params'], 
              'clf':parameters['clf_type']['type'], 'status': STATUS_OK} 

In [6]:
# run hyperopt

trials = Trials()
best = fmin(fn=partial(objective, transformer=transformer, X_train=X, y_train=y, r_state=RS),
            space=search_space,
            algo=tpe.suggest,
            max_evals=40,
            trials=trials,
            rstate=np.random.RandomState(RS),
            show_progressbar=True
    )

100%|███████████████████████████████████████████████████████████████| 40/40 [06:11<00:00,  9.29s/trial, best loss: 0.5]


In [7]:
def df_results(hp_results):
    """
    Отображаем результаты hyperopt в формате DataFrame 
    
    :hp_results: результаты hyperop
    :return: pandas DataFrame
    """ 
    
    results = pd.DataFrame([{**x, **x['params']} for x in  hp_results])
    results.drop(labels=['status', 'params'], axis=1, inplace=True)
    results.sort_values(by=['loss'], ascending=False, inplace=True)
    return results

In [8]:
results = df_results(trials.results)
results.head(10)

Unnamed: 0,loss,clf,C,penalty,n_neighbors,p,max_depth,max_features
8,0.905384,lr,8.916779,l2,,,,
0,0.905366,lr,1.155279,l1,,,,
10,0.905342,lr,0.568117,l2,,,,
34,0.905313,lr,95.169858,l1,,,,
28,0.905312,lr,96.962013,l1,,,,
31,0.904961,lr,0.09834,l1,,,,
37,0.904931,lr,0.093308,l1,,,,
1,0.903989,lr,0.016913,l2,,,,
27,0.902545,lr,0.024347,l1,,,,
39,0.901522,lr,0.006829,l2,,,,
