In [1]:
import numpy as np
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from xgboost import XGBClassifier

In [2]:
def data_split(x, y, ratio):
    rows_number = y.shape[0]
    
    bound = int(round(rows_number*ratio**2))

    return x[0:bound,:] , x[bound:,:] , y[0:bound], y[bound:]

In [3]:
def performance_metrics(pred_labels, true_labels):

    #Positive Cases
    PC = np.sum(true_labels == 1) 
    
    #Negative Cases
    NC = np.sum(true_labels == 0)
    
    # True Positive 
    TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1))
 
    # True Negative 
    TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0))
 
    # False Positive 
    FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0))
 
    # False Negative 
    FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1))
    
    #Accuracy
    Accuracy = (TP+TN)/float(TP+FP+TN+FN)
    
    #Precision
    Precision = (TP)/float(TP+FP)
    
    #Recall
    Recall = (TP)/float(TP+FN)
    
    fpr, tpr, thresholds = metrics.roc_curve(true_labels, pred_labels)
    AUC = metrics.auc(fpr, tpr)
    
    print 'Positive Cases: %d, True Positive: %d, False Positive: %d' % (PC, TP, FP)
    print 'Negative Cases: %d, True Negative: %d, False Negative: %d' % (NC, TN, FN)
    print 'Accuracy: %.3f, Precision: %.3f, Recall: %.3f, AUC: %.3f' % (Accuracy, Precision, Recall, AUC)

In [4]:
ratio = 0.85


x = np.load('data/features_pca.npy').T
y = np.load('data/labels.npy')

x_train, x_test, y_train, y_test = data_split(x,y,ratio)                  


In [6]:
xgb = XGBClassifier( learning_rate =0.01,n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=1, reg_alpha=0.005)
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)

In [7]:
performance_metrics(y_pred, y_test)

Positive Cases: 3198, True Positive: 119, False Positive: 129
Negative Cases: 16227, True Negative: 16098, False Negative: 3079
Accuracy: 0.835, Precision: 0.480, Recall: 0.037, AUC: 0.515


In [None]:
params = {
 'min_child_weight':range(1,6,2),
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
 

}

gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch.fit(x_train, y_train, eval_metric="logloss", eval_set=[(x_val, y_val)])
y_pred = gsearch.predict(x_test)

print "Best Parameters: " + str(gsearch.best_params_)

In [None]:
performance_metrics(y_pred, y_test)