In [1]:
from warnings import filterwarnings
# Install a blanket "ignore" filter so that no warning of any category is
# shown by later cells (presumably to keep the notebook output clean).
filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# The raw file has no header row, so the column labels are supplied by hand.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race',
    'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'wage_class',
]

# Fields are separated by ", "; skipinitialspace strips the blank that
# follows each comma so string values carry no leading space.
data = pd.read_csv(
    'data/adult.data',
    names=column_names,
    sep=",",
    skipinitialspace=True,
)
# Keep every third row only (rows 0, 3, 6, ...) to work on a smaller sample.
data = data.iloc[::3]

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
12,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
# Build the feature matrix: remove the target ("wage_class") together with
# "fnlwgt" and the textual "education" column (which appears to be encoded
# numerically by "education_num" already).
X = data.drop(["fnlwgt", "education", "wage_class"], axis=1)
# Flag is 1 for people whose native country is NOT the United States.
# The previous comparison used ==, which marked US natives as immigrants.
# NOTE(review): rows whose country is a missing-value placeholder (if any)
# are also flagged as 1 - confirm that this is the desired treatment.
X["immigrant"] = (X.native_country != "United-States").astype(int)
X = X.drop(["native_country"], axis=1)
# One-hot encode the remaining string-valued columns.
X = pd.get_dummies(X)
X.shape

(10854, 50)

In [5]:
# Binary target: 1 where the income label equals ">50K", otherwise 0.
y = data["wage_class"].eq(">50K").astype(int)

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

In [7]:
# Hold out 10% of the rows as a test set; the fixed seed makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [8]:
from xgboost.sklearn import XGBClassifier

In [9]:
from sklearn.model_selection import RandomizedSearchCV
# IPython help syntax: shows the RandomizedSearchCV documentation in the
# interactive session. This line is not valid plain Python.
?RandomizedSearchCV

In [10]:
from scipy.stats import uniform, randint

In [11]:
# Search space for the randomized search. Frozen scipy distributions are
# sampled via their rvs() method; plain lists are sampled uniformly.
param_dists = dict(
    # integer drawn from 20..99 (randint excludes the upper bound 100)
    n_estimators=randint(20, 100),
    # uniform(loc, scale) covers [loc, loc + scale], i.e. [0.01, 0.31]
    learning_rate=uniform(0.01, 0.3),
    # one value picked uniformly at random from the list
    max_depth=[5, 6, 7],
    # integer drawn from 1..9
    min_child_weight=randint(1, 10),
    subsample=[0.7, 0.5, 0.3],
    colsample_bytree=[0.7, 0.5, 0.3],
    # both regularisation terms are drawn from [0.01, 0.21]
    gamma=uniform(0.01, 0.2),
    reg_lambda=uniform(0.01, 0.2),
)


In [12]:
# Randomized hyper-parameter search: 20 candidates sampled from param_dists,
# each scored by ROC AUC with 3-fold cross-validation.
# The seeds are pinned (same value as the train/test split) so the sampled
# candidates, and therefore best_params_, are the same on every run.
model = XGBClassifier(random_state=42)
optimizer = RandomizedSearchCV(model, param_dists,
                               n_iter=20,
                               scoring="roc_auc",
                               cv=3,
                               random_state=42)
optimizer.fit(X_train, y_train)





RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa228219310>,
                                        'max_depth':

In [13]:
# Show every hyper-parameter combination the search sampled
# (one dict per candidate).
optimizer.cv_results_["params"]

[{'colsample_bytree': 0.7,
  'gamma': 0.1491700995250253,
  'learning_rate': 0.06879220971662639,
  'max_depth': 7,
  'min_child_weight': 6,
  'n_estimators': 88,
  'reg_lambda': 0.1169789722877478,
  'subsample': 0.7},
 {'colsample_bytree': 0.3,
  'gamma': 0.18290713957656893,
  'learning_rate': 0.05924442053700988,
  'max_depth': 5,
  'min_child_weight': 1,
  'n_estimators': 48,
  'reg_lambda': 0.08992823195482595,
  'subsample': 0.7},
 {'colsample_bytree': 0.7,
  'gamma': 0.023640190542817133,
  'learning_rate': 0.21487878746051778,
  'max_depth': 7,
  'min_child_weight': 4,
  'n_estimators': 36,
  'reg_lambda': 0.12072941987718716,
  'subsample': 0.3},
 {'colsample_bytree': 0.3,
  'gamma': 0.04708443127386561,
  'learning_rate': 0.06227904243308058,
  'max_depth': 6,
  'min_child_weight': 6,
  'n_estimators': 94,
  'reg_lambda': 0.17454354654949725,
  'subsample': 0.5},
 {'colsample_bytree': 0.5,
  'gamma': 0.03652695447453246,
  'learning_rate': 0.0291260696132888,
  'max_depth': 

In [14]:
# The sampled combination that the search selected as best.
optimizer.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.1630383035454983,
 'learning_rate': 0.15322024868067557,
 'max_depth': 5,
 'min_child_weight': 4,
 'n_estimators': 89,
 'reg_lambda': 0.02282428549388657,
 'subsample': 0.7}

Pojedynczy podział na część treningową i walidacyjną (ShuffleSplit) zamiast kroswalidacji.

In [15]:
from sklearn.model_selection import ShuffleSplit

# Toy demonstration: a single random split of five items, 40% of which
# (two items) go to the held-out part. split() yields index arrays.
x = [1, 2, 3, 4, 5]
cv = ShuffleSplit(n_splits=1, test_size=0.4)
[index_pair for index_pair in cv.split(x)]

[(array([3, 1, 0]), array([2, 4]))]

In [16]:
# Search space for the second, quicker search: far fewer trees
# (n_estimators drawn from 10..19, upper bound excluded).
param_dists = dict(
    n_estimators=randint(10, 20),
    # uniform(loc, scale) covers [loc, loc + scale], i.e. [0.01, 0.31]
    learning_rate=uniform(0.01, 0.3),
    max_depth=[5, 6, 7],
    # integer drawn from 1..9
    min_child_weight=randint(1, 10),
    subsample=[0.7, 0.5, 0.3],
    colsample_bytree=[0.7, 0.5, 0.3],
    # both regularisation terms are drawn from [0.01, 0.21]
    gamma=uniform(0.01, 0.2),
    reg_lambda=uniform(0.01, 0.2),
)
# New classifier instance with default settings, used as the base estimator.
model = XGBClassifier()

In [None]:
# Use one random 90/10 train/validation split instead of k-fold
# cross-validation; each candidate is therefore fitted only once.
# Both the split and the candidate sampling are seeded so that the search
# gives the same result on every run.
cv = ShuffleSplit(1, test_size=0.1, random_state=42)

optimizer = RandomizedSearchCV(model, param_dists,
                               n_iter=2, scoring="roc_auc",
                               cv=cv,
                               random_state=42)
optimizer.fit(X_train, y_train)