In [1]:
import os
import h5py
import json
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
def getFiles(path):
    files = os.listdir(path)
    size = len(files)
    return [files, size]

In [3]:
def model(dataX, dataY, param):
    #xGboost training
    dtrain = xgb.DMatrix(dataX, label=dataY)

    num_round = 10
    bst = xgb.train(param, dtrain, num_round)
    
    return bst

In [4]:
def accuracy(model, dataX, labels):
    pre = model.predict(xgb.DMatrix(dataX))
    j = 0
    for i in range(len(labels)):
        if labels[i] == pre[i]:
            j+=1
    return 100 * j / len(labels)

In [5]:
def grid_search(dataX, dataY, validation_train_index, validation_test_index, num_class):
    param = dict()
    MAX_ACC = 0
    best_params = None
    
    numOffold_inner = validation_train_index.shape[1]
    
    trainAcc = np.zeros(numOffold_inner)
    testAcc = np.zeros(numOffold_inner)
    
    trainAcc_best = 0
    testAcc_best = 0
    
    for eta in options["eta"]:
        for gamma in options["gamma"]:
            for max_depth in options["max_depth"]:
                for min_child_weight in options["min_child_weight"]:
                    for subsample in options["subsample"]:
                        param["eta"] = eta
                        param["gamma"] = gamma
                        param["max_depth"] = max_depth
                        param["min_child_weight"] = min_child_weight
                        param["subsample"] = subsample
                        param['nthread'] = 4
                        param['eval_metric'] = 'auc'
                        param['num_class'] = num_class
                        
                        #Inner Cross Validation
                        for num_CV_inner in range(numOffold_inner):
                            dataX_train = dataX[validation_train_index[:, num_CV_inner]]
                            dataY_train = dataY[validation_train_index[:, num_CV_inner]]
                            dataX_test = dataX[validation_test_index[:, num_CV_inner]]
                            dataY_test = dataY[validation_test_index[:, num_CV_inner]]
                            
                            bst = model(dataX_train, dataY_train, param)
                            
                            trainAcc[num_CV_inner] = accuracy(bst, dataX_train, dataY_train)
                            testAcc[num_CV_inner] = accuracy(bst, dataX_test, dataY_test)

                        TMP_ACC = np.mean(testAcc)

                        if TMP_ACC > MAX_ACC:
                            MAX_ACC = TMP_ACC
                            best_params = param
                            trainAcc_best = np.mean(trainAcc)
                            testAcc_best = np.mean(testAcc)
    return [best_params, trainAcc_best, testAcc_best]

In [6]:
method = 'xgboost'
dataset_path = "D:/project-btp/codes/Shared_Datasets/"
path1 = "D:\\project-btp\\results\\" + method + "\\"

if not os.path.isdir(path1):
    os.mkdir(path1)

[name, num] = getFiles(dataset_path)

In [8]:
"""
booster: gbtree, gblinear, dart
eta: [0;1, 0.1 stp]
gamma = [0:0.5, 0.1 step]
max_depth = [ 3, 4, 5, 6, 8, 10, 12, 15]
"min_child_weight" : [ 1, 3, 5, 7 ],
subsample = [0:1, 0.5]
objective is also decidable
"""
options = {
    "eta" : np.arange(0.1, 0.4, 0.1),
    "gamma" : np.arange(0.1, 0.4, 0.1),
    "max_depth" : [ 5, 10],
    "min_child_weight" : [ 1, 5 ],
    "subsample" : [0.5, 1]
}

df = pd.DataFrame(columns = ["dataset_name", "training_accuracy", "testing_accuracy"])

for dataset_num in range(num):
    dataset_name = name[dataset_num]
    print(dataset_name)
    
    #Check for repitition
    """to do"""
    
    dataset_path_name = dataset_path + dataset_name + '\\'
    
    #loading dataset
    folds = None
    labels = None
    validation_train = None
    validation_test = None
    
    with h5py.File(dataset_path_name + "folds.mat", "r") as f:
        folds = np.array(f.get("folds")).T
    with h5py.File(dataset_path_name + "labels.mat", "r") as f:
        labels = np.array(f.get("labels")).T
    with h5py.File(dataset_path_name + "validation_train.mat", "r") as f:
        validation_train = np.array(f.get("validation_train")).T
    with h5py.File(dataset_path_name + "validation_test.mat", "r") as f:
        validation_test = np.array(f.get("validation_test")).T
    
    numOffold = None
    try:
        with h5py.File(dataset_path_name + "numOffold.mat", "r") as f:
            numOffold = int(np.array(f.get("numOffold")).item())
    except:
        numOffold = folds.shape[1] #4-fold CV
    
    with h5py.File(dataset_path_name + dataset_name + ".mat", "r") as f:
        dataX = np.array(f.get(dataset_name)).T
    dataY = labels
    
    folds = (folds != 0)
    
    validation_test = (validation_test != 0)
    validation_train = (validation_train != 0)
    
    U_dataY = np.unique(dataY)
    nclass = len(U_dataY)
    
    
    #results which will be saved
    trainAcc = [0] * numOffold
    testAcc = [0] * numOffold
    
    options_saved = [0] * numOffold
    val_trainAcc_All = [0] * numOffold
    val_testAcc_All = [0] * numOffold
    
    #Outer Cross-validation
    for num_CV in range(numOffold):
        print(num_CV+1, "th fold - ", end = "")
        
        test_index = folds[:, num_CV]
        train_index = [not x for x in test_index]
        
        validation_train_index = validation_train[:, num_CV * numOffold: (num_CV + 1) * numOffold]
        validation_test_index = validation_test[:, num_CV * numOffold: (num_CV + 1) * numOffold]
        
        [params, val_trainAcc, val_testAcc] = grid_search(dataX, dataY, validation_train_index, validation_test_index, nclass)
        
        bst = model(dataX[train_index], dataY[train_index], params)
        
        #saving results
        val_trainAcc_All[num_CV] = val_trainAcc
        val_testAcc_All[num_CV] = val_testAcc
        options_saved[num_CV] = params
        trainAcc[num_CV] = accuracy(bst, dataX[train_index], labels[train_index])
        testAcc[num_CV] = accuracy(bst, dataX[test_index], labels[test_index])
        print("trainAcc: ", trainAcc[num_CV], "testAcc: ", testAcc[num_CV])
    
    print("Final Training data accuracy: ", np.mean(trainAcc))
    print("Final Testing data accuracy: ", np.mean(testAcc))

    results = {
        "validation_trainAcc" : val_trainAcc_All,
        "validation_testAcc" : val_testAcc_All,
        "options_saved" : options_saved,
        "trainAcc" : trainAcc,
        "testAcc" : testAcc
    }
    
    #Saving results in json
    filename = path1 + "Res_" + dataset_name + ".json"
    with open(filename, "w") as f:
        json.dump(results, f)
    
    df = df.append({"dataset_name" : dataset_name,
               "training_accuracy" : round(np.mean(trainAcc), 2),
               "testing_accuracy" :  round(np.mean(testAcc), 2)},
              ignore_index = True)

#Final results in Excel Sheet
df.to_excel(path1 + "fixed_all_results.xlsx")

abalone
1 th fold - trainAcc:  87.99872326843281 testAcc:  64.55938697318008
2 th fold - trainAcc:  88.66900734120651 testAcc:  63.69731800766284
3 th fold - trainAcc:  88.73284391956591 testAcc:  62.93103448275862
4 th fold - trainAcc:  88.19023300351101 testAcc:  65.51724137931035
Final Training data accuracy:  88.39770188317905
Final Testing data accuracy:  64.17624521072797
adult
1 th fold - trainAcc:  88.98989588771843 testAcc:  86.46889011731466
Final Training data accuracy:  88.98989588771843
Final Testing data accuracy:  86.46889011731466
car
1 th fold - trainAcc:  98.8425925925926 testAcc:  97.91666666666667
2 th fold - trainAcc:  98.99691358024691 testAcc:  96.99074074074075
3 th fold - trainAcc:  99.45987654320987 testAcc:  97.68518518518519
4 th fold - trainAcc:  98.99691358024691 testAcc:  97.45370370370371
Final Training data accuracy:  99.07407407407408
Final Testing data accuracy:  97.51157407407408
iris
1 th fold - trainAcc:  97.34513274336283 testAcc:  97.297297297297

In [70]:
 """
     Commments for sanity check
     To do or not to do
     
    #0-1 encoding for target
    for i in range(nclass):
        idx = (dataY==U_dataY[i])
        for j in range(len(idx)):
            if idx[j]:
                dataY_temp[j][i] = 1
    
    dataset_path = "D:\\project-btp\\codes\\data_csv\\"
    path1 = "D:\\project-btp\\" + method + "\\"
    
    folds = np.genfromtxt(dataset_path_name + 'folds.csv', delimiter=',')
    labels = np.genfromtxt(dataset_path_name + 'labels.csv', delimiter=',')
    validation_train = np.genfromtxt(dataset_path_name + 'validation_train.csv', delimiter=',')
    validation_test = np.genfromtxt(dataset_path_name + 'validation_test.csv', delimiter=',')
    numOffold = np.genfromtxt(dataset_path_name + 'numOffolds.csv', delimiter=',')
    dataX = np.genfromtxt(dataset_path_name + dataset_name + ".csv", delimiter=',')
    
    eta = np.arange(0.1,    gamma = np.arange(0, 0.3, 0.1)
    max_depth = [ 5, 10]
    min_child_weight = [ 1, 5 ]
    subsample = [0.5, 1]
    i = 0
    for e in eta:
        for g in gamma:
            for depth in max_depth:
                for mcw in min_child_weight:
                    for ss in subsample:
                        i+=1 0.3, 0.1)

    """

array([0., 0., 1., ..., 1., 1., 2.])