In [4]:
from data_preprocessing import *
from main import xgb_classifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [5]:
# Hyper-parameter search space handed to GridSearchCV. Every combination is
# evaluated: 5 * 3 * 3 * 5 * 3 * 3 * 3 = 6075 candidates per CV fold.
param_grid = dict(
    max_depth=[1, 3, 5, 7, 10],         # maximum tree depth
    learning_rate=[0.1, 0.01, 0.05],    # boosting step size
    gamma=[0.25, 0.5, 1],               # minimum loss reduction to split
    reg_lambda=[0.5, 1, 1.5, 2, 2.5],   # L2 regularisation strength
    scale_pos_weight=[3, 4, 5],         # weight of the positive class
    subsample=[0.4, 0.8, 1],            # row sampling ratio per tree
    colsample_bytree=[0.1, 0.5, 0.7],   # column sampling ratio per tree
)

In [6]:
def find_best_params(X, y, param_grid, cv=3, scoring="accuracy", n_jobs=-1):
    """Grid-search XGBClassifier hyper-parameters with cross-validation.

    Parameters
    ----------
    X, y
        Training features and labels; forwarded unchanged to
        ``GridSearchCV.fit``.
    param_grid : dict
        Maps XGBClassifier parameter names to the candidate values to try.
    cv : default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : default "accuracy"
        Model-selection metric forwarded to ``GridSearchCV``.
        NOTE(review): the grid used in this notebook tunes
        ``scale_pos_weight``, which suggests imbalanced classes; accuracy can
        be a misleading selection metric there -- consider "f1" or "roc_auc".
    n_jobs : default -1
        Parallelism setting forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.

    Side effects
    ------------
    Prints ``best_score_`` of the fitted search to stdout.
    """
    search = GridSearchCV(
        xgb.XGBClassifier(),
        param_grid,
        n_jobs=n_jobs,
        cv=cv,
        scoring=scoring,
    )
    search.fit(X, y)
    print(search.best_score_)
    return search.best_params_


In [1]:
def run(data, label_col):
    """Tune, train and evaluate an XGBoost classifier on one dataset.

    Steps: load the dataset identified by ``data``, run the preprocessing
    chain, grid-search hyper-parameters on the training split using the
    module-level ``param_grid``, train a fresh classifier with the winning
    parameters, then print train and test accuracy.

    Parameters
    ----------
    data
        Dataset identifier passed to ``load_datasets`` and echoed in the
        printed report.
    label_col
        Passed to ``split_to_X_and_y``; presumably the name of the label
        column -- verify against the preprocessing module.

    Returns
    -------
    None. Results are only printed.
    """
    # Preprocessing pipeline, one stage per line.
    # NOTE(review): transform_X_and_y() runs before split_train_test(); if it
    # fits anything on the data (scaling, encoding), that is a potential
    # train/test leak -- confirm in data_preprocessing.
    pipeline = data_preprocessing(load_datasets(data))
    pipeline = pipeline.replace_missing_values()
    pipeline = pipeline.split_to_X_and_y(label_col)
    pipeline = pipeline.transform_X_and_y()
    X_train, X_test, y_train, y_test = pipeline.split_train_test()

    # Hyper-parameter search uses the training split only.
    tuned_params = find_best_params(X_train, y_train, param_grid)

    # Train a fresh model configured with the selected parameters.
    classifier = xgb_classifier(xgb.XGBClassifier(**tuned_params))
    classifier.train(X_train, y_train)

    preds_test = classifier.test(X_test)
    preds_train = classifier.test(X_train)

    train_acc = classifier.accuracy(y_train, preds_train)
    print("train acc for ", data, ":", train_acc)
    test_acc = classifier.accuracy(y_test, preds_test)
    print("test acc for ", data, ":", test_acc)

In [7]:
# Hepatitis dataset: tune, train and report accuracy, passing "0" as the label column.
run(data="hepatitis", label_col="0")



0.8630229419703105
train acc for  hepatitis : 0.9310344827586207
test acc for  hepatitis : 0.7692307692307693


In [8]:
# Diabetes dataset: tune, train and report accuracy, passing "Class" as the label column.
run(data="diabetes", label_col="Class")

0.681358078461737
train acc for  diabetes : 1.0
test acc for  diabetes : 0.65625
