In [None]:
#Load datasets

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pandas as pd

# Read the four competition tables in a fixed order: training sample,
# agreement-check sample, correlation-check sample and the test sample.
train, check_agreement, check_correlation, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/flavours-of-physics/training.csv.zip',
        '../input/flavours-of-physics/check_agreement.csv.zip',
        '../input/flavours-of-physics/check_correlation.csv.zip',
        '../input/flavours-of-physics/test.csv.zip',
    )
)

#Import libraries for timing and XGBoost

import time
import xgboost as xgb

In [None]:
#The next cell writes the helper module evaluation.py (KS and CvM tests) to disk so that it can be imported afterwards

In [None]:
%%writefile evaluation.py

import numpy
from sklearn.metrics import roc_curve, auc


def __rolling_window(data, window_size):
    """
    Slide a fixed-size window along the last axis of an array.

    The result is built with strides only, so it is a view onto ``data``
    rather than a copy of it.

    :param data: numpy array
    :param window_size: number of consecutive elements in each window
    :return: array with one extra trailing axis holding the windows

    Example: data = array(1, 2, 3, 4, 5, 6), window_size = 4
        gives array(array(1, 2, 3, 4), array(2, 3, 4, 5), array(3, 4, 5, 6))
    """
    leading_dims = data.shape[:-1]
    last_dim = data.shape[-1]
    n_windows = last_dim - window_size + 1
    window_shape = leading_dims + (n_windows, window_size)
    # Stepping to the next window and stepping inside a window both move by one element
    window_strides = data.strides + data.strides[-1:]
    return numpy.lib.stride_tricks.as_strided(data, shape=window_shape, strides=window_strides)


def __cvm(subindices, total_events):
    """
    Cramer-von Mises distance between a subset and the full ordered sample.

    The full sample is represented by the uniform cdf over ``total_events``
    positions; the subset is described by the positions of its members.

    :param subindices: positions (non-negative integers) of the subset events within the full sample
    :param total_events: number of events in the full sample
    :return: mean squared difference between the two cdfs
    """
    full_cdf = numpy.arange(1, total_events + 1, dtype='float') / total_events
    hits_per_position = numpy.bincount(subindices, minlength=total_events)
    subset_cdf = numpy.cumsum(hits_per_position, dtype='float')
    # Normalise by the subset size (last cumulative count) so the cdf ends at 1
    subset_cdf /= 1.0 * subset_cdf[-1]
    squared_gap = (full_cdf - subset_cdf) ** 2
    return numpy.mean(squared_gap)


def compute_cvm(predictions, masses, n_neighbours=200, step=50):
    """
    Average Cramer-von Mises (cvm) metric over sliding mass bins.

    Events are ordered by mass; every bin is a run of ``n_neighbours``
    consecutive events, and the cdf of the predictions inside the bin is
    compared with the cdf of all predictions.

    :param predictions: array-like, predictions
    :param masses: array-like, in case of Kaggle tau23mu this is reconstructed mass
    :param n_neighbours: number of consecutive (mass-ordered) events forming one bin
    :param step: offset, in events, between the starts of successive bins
    :return: average cvm value
    """
    predictions = numpy.array(predictions)
    masses = numpy.array(masses)
    assert len(predictions) == len(masses)

    # Order the predictions by increasing mass
    mass_order = numpy.argsort(masses)
    by_mass = predictions[mass_order]

    # Replace every prediction by its rank among all predictions (double argsort)
    ranks = numpy.argsort(numpy.argsort(by_mass, kind='mergesort'), kind='mergesort')
    n_events = len(ranks)

    # Every step-th window of ranks is one mass bin contributing one cvm value
    mass_bins = __rolling_window(ranks, window_size=n_neighbours)[::step]
    per_bin = [__cvm(subindices=mass_bin, total_events=n_events) for mass_bin in mass_bins]
    return numpy.mean(per_bin)


def __roc_curve_splitted(data_zero, data_one, sample_weights_zero, sample_weights_one):
    """
    Weighted ROC curve for two samples supplied separately.

    :param data_zero: scores of the 0-labeled sample
    :param data_one:  scores of the 1-labeled sample
    :param sample_weights_zero: weights for the 0-labeled sample
    :param sample_weights_one:  weights for the 1-labeled sample
    :return: tuple (fpr, tpr) as returned by sklearn's roc_curve
    """
    n_zero, n_one = len(data_zero), len(data_one)
    # Stack both samples, zeros first, keeping scores and weights aligned with the labels
    scores = numpy.concatenate([data_zero, data_one])
    weights = numpy.concatenate([sample_weights_zero, sample_weights_one])
    labels = [0] * n_zero + [1] * n_one
    fpr, tpr, _ = roc_curve(labels, scores, sample_weight=weights)
    return fpr, tpr


def compute_ks(data_prediction, mc_prediction, weights_data, weights_mc):
    """
    Compute Kolmogorov-Smirnov (ks) distance between real data predictions cdf and Monte Carlo one.

    The distance is the largest absolute gap between the weighted fpr and tpr
    of a ROC curve built with data as class 0 and Monte Carlo as class 1.

    :param data_prediction: array-like, real data predictions in [0, 1]
    :param mc_prediction: array-like, Monte Carlo data predictions in [0, 1]
    :param weights_data: array-like, real data weights (any numeric dtype)
    :param weights_mc: array-like, Monte Carlo weights (any numeric dtype)
    :return: ks value
    :raises AssertionError: if lengths do not match or predictions are outside [0, 1]
    """
    assert len(data_prediction) == len(weights_data), 'Data length and weight one must be the same'
    assert len(mc_prediction) == len(weights_mc), 'MC length and weight one must be the same'

    data_prediction, mc_prediction = numpy.array(data_prediction), numpy.array(mc_prediction)
    # Copy the weights as floats: the in-place normalisation below would raise a
    # casting error on integer weights, and the caller's arrays must stay untouched.
    weights_data = numpy.array(weights_data, dtype=float)
    weights_mc = numpy.array(weights_mc, dtype=float)

    assert numpy.all(data_prediction >= 0.) and numpy.all(data_prediction <= 1.), 'Data predictions are out of range [0, 1]'
    assert numpy.all(mc_prediction >= 0.) and numpy.all(mc_prediction <= 1.), 'MC predictions are out of range [0, 1]'

    # Normalise each sample to unit total weight so both cdfs end at 1
    weights_data /= numpy.sum(weights_data)
    weights_mc /= numpy.sum(weights_mc)

    fpr, tpr = __roc_curve_splitted(data_prediction, mc_prediction, weights_data, weights_mc)

    Dnm = numpy.max(numpy.abs(fpr - tpr))
    return Dnm


def roc_auc_truncated(labels, predictions, tpr_thresholds=(0.2, 0.4, 0.6, 0.8),
                      roc_weights=(4, 3, 2, 1, 0)):
    """
    Compute weighted area under ROC curve.

    The ROC curve is cut into horizontal bands of true positive rate delimited
    by ``tpr_thresholds``; the area of every band is multiplied by the matching
    entry of ``roc_weights`` and the sum is normalised so that an ideal
    classifier scores 1.

    :param labels: array-like, true labels
    :param predictions: array-like, predictions in [0, 1]
    :param tpr_thresholds: array-like, true positive rate thresholds delimiting the ROC segments
    :param roc_weights: array-like, weights for true positive rate segments
        (one more entry than ``tpr_thresholds``)
    :return: weighted AUC
    :raises AssertionError: if predictions are outside [0, 1] or the lengths of
        thresholds and weights are incompatible
    """
    # Accept plain Python sequences as well as arrays for the range check below
    predictions = numpy.asarray(predictions)
    assert numpy.all(predictions >= 0.) and numpy.all(predictions <= 1.), 'Data predictions are out of range [0, 1]'
    assert len(tpr_thresholds) + 1 == len(roc_weights), 'Incompatible lengths of thresholds and weights'
    fpr, tpr, _ = roc_curve(labels, predictions)
    area = 0.
    tpr_thresholds = [0.] + list(tpr_thresholds) + [1.]
    for index in range(1, len(tpr_thresholds)):
        tpr_cut = numpy.minimum(tpr, tpr_thresholds[index])
        tpr_previous = numpy.minimum(tpr, tpr_thresholds[index - 1])
        # fpr returned by roc_curve is already non-decreasing, so auc() needs no
        # reordering; the former `reorder` keyword no longer exists in scikit-learn.
        area += roc_weights[index - 1] * (auc(fpr, tpr_cut) - auc(fpr, tpr_previous))
    tpr_thresholds = numpy.array(tpr_thresholds)
    # roc auc normalization to be 1 for an ideal classifier
    area /= numpy.sum((tpr_thresholds[1:] - tpr_thresholds[:-1]) * numpy.array(roc_weights))
    return area

In [None]:
import evaluation

In [None]:
#Data preselection

# Remove the rows whose min_ANNmuon is at or below the 0.4 cut
rejected_rows = train.loc[train['min_ANNmuon'] <= 0.4].index
train_pre = train.drop(index=rejected_rows)

In [None]:
#Create features and labels tables

# Columns removed from the training inputs (this list includes the target itself)
non_feature_columns = ['min_ANNmuon', 'mass', 'production', 'signal', 'id', 'SPDhits']
train_labels = train_pre['signal']
train_features = train_pre.drop(columns=non_feature_columns)

In [None]:
# Lift the column display limit so the summary of every feature is shown
pd.options.display.max_columns = None
train_features.describe()

In [None]:
#Create model and fit to data

# Start the clock before any set-up so the reported time covers the whole cell
t0 = time.perf_counter()

params = {
    "objective": "binary:logistic",
    "eta": 0.4,
    "max_depth": 6,
    "min_child_weight": 3,
    "subsample": 0.5,
    "colsample_bytree": 0.7,
    "seed": 2,
}
n_trees = 300
train_matrix = xgb.DMatrix(train_features, label=train_labels)
model = xgb.train(params, train_matrix, num_boost_round=n_trees)

# Elapsed wall-clock time in seconds
t1 = time.perf_counter() - t0
print('Timing: ', t1, ' s')

In [None]:
#Create datasets for agreement and correlation check

# Agreement sample: model inputs plus the 'signal' and 'weight' columns kept aside
check_agreement_labels = check_agreement['signal']
check_agreement_weigths = check_agreement['weight']
check_agreement_features = check_agreement.drop(columns=['signal', 'id', 'weight', 'SPDhits'])

# Correlation sample: model inputs plus the 'mass' column kept aside
check_correlation_labels = check_correlation['mass']
check_correlation_features = check_correlation.drop(columns=['id', 'mass', 'SPDhits'])

In [None]:
#Predictions on agreement and correlation datasets

agreement_matrix = xgb.DMatrix(check_agreement_features)
correlation_matrix = xgb.DMatrix(check_correlation_features)

# Flatten the model output to one value per row
agreement_probs = model.predict(agreement_matrix).reshape(-1)
correlation_probs = model.predict(correlation_matrix).reshape(-1)

In [None]:
#Evaluate KS score, it has to be less than 0.09

# Split the agreement sample by its 'signal' flag; predictions and weights share the row order
agreement_flag = check_agreement['signal'].values
agreement_weight = check_agreement['weight'].values
flag_is_zero = agreement_flag == 0
flag_is_one = agreement_flag == 1

ks = evaluation.compute_ks(
    agreement_probs[flag_is_zero],
    agreement_probs[flag_is_one],
    agreement_weight[flag_is_zero],
    agreement_weight[flag_is_one])
print ('KS metric', ks, ks < 0.09)

In [None]:
#Evaluate CvM score, it has to be less than 0.002

# Compare the prediction distribution inside sliding mass bins with the overall one
cvm = evaluation.compute_cvm(predictions=correlation_probs,
                             masses=check_correlation['mass'])
print ('CvM metric', cvm, cvm < 0.002)

In [None]:
#If the two tests have been passed, create the feature table from the test dataset,
#make predictions on it and export them as .csv file
#Not every run gives True conditions both for cvm and ks

if cvm < 0.002 and ks < 0.09:
    test_features = test.drop(['id', 'SPDhits'], axis = 1)
    test_probs = model.predict(xgb.DMatrix(test_features)).reshape(-1,)
    submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
    submission.to_csv("submission.csv", index=False)
else:
    # Report the skipped export instead of silently writing nothing
    print('Submission not written: KS =', ks, '(limit 0.09), CvM =', cvm, '(limit 0.002)')

In [None]:
#Plot ROC curve

import sklearn.metrics as metrics

# NOTE(review): the labels used here are the 'signal' column of check_agreement;
# confirm that this column separates signal from background, as the axis labels
# below assume, rather than simulated from real events.

# calculate the fpr and tpr for all thresholds of the classification
preds = model.predict(xgb.DMatrix(check_agreement_features))

fpr, tpr, threshold = metrics.roc_curve(check_agreement_labels, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.style.use('default')
fig, ax = plt.subplots()
# The curve is drawn as background rejection (1 - FPR) against signal efficiency (TPR)
ax.plot(tpr, 1 - fpr, 'b', label='AUC = %0.2f' % roc_auc)
# Anti-diagonal: the line TPR == FPR in these axes
ax.plot([0, 1], [1, 0], 'r--', label='TPR = FPR')
ax.set_title('ROC curve')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('Signal efficiency (TPR)')
ax.set_ylabel('Background rejection (1-FPR)')
ax.legend(loc='lower right')
plt.show()

In [None]:
#Create signal/background tables for histograms + predictions

# Same column list that is dropped when the training features are built
dropped_columns = ['min_ANNmuon', 'mass', 'production', 'signal', 'id', 'SPDhits']

# Rows of the preselected training sample with signal == 1 and signal == 0
signal_rows = train_pre.loc[train_pre['signal'] == 1]
background_rows = train_pre.loc[train_pre['signal'] == 0]
test_1 = signal_rows.drop(columns=dropped_columns)
test_0 = background_rows.drop(columns=dropped_columns)

# Model output for each of the two subsets, flattened to one value per row
test_1_probs = model.predict(xgb.DMatrix(test_1)).reshape(-1)
test_0_probs = model.predict(xgb.DMatrix(test_0)).reshape(-1)

In [None]:
#Histograms for signal and background

fig, ax = plt.subplots()
# Filled histogram: predictions for signal == 1 rows; outline: signal == 0 rows
ax.hist(test_1_probs, bins=15, label='Signal')
ax.hist(test_0_probs, bins=15, histtype='step', label='Background')
# Symmetric-log y axis with fixed limits of 10 and 30000 events
ax.set_yscale('symlog')
ax.set_ylim(10, 30_000)
ax.set_title('Model prediction on the preselected training sample')
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Events')
ax.legend(loc='upper center')
plt.show()

In [None]:
# Copy of the full training table without these five columns ('signal' is kept)
graph = train.drop(columns=['min_ANNmuon', 'mass', 'production', 'id', 'SPDhits'])

In [None]:
# Display the column names that remain in `graph`
graph.columns

In [None]:
#Histograms for signal and background variables

for column in test_1.columns:
    # A new figure per feature: plt.show() finishes the current figure, so the
    # scale and limits have to be applied to every figure, not once before the loop.
    fig, ax = plt.subplots()
    ax.hist(test_1[column], bins=15, label='Signal ' + column)
    ax.hist(test_0[column], bins=15, histtype='step', label='Background ' + column)
    ax.set_yscale('symlog')
    ax.set_ylim(10, 30_000)
    ax.set_xlabel(column)
    ax.set_ylabel('Events')
    ax.legend(loc="upper left")
    plt.show()
    # Release the figure so a long feature list does not accumulate open figures
    plt.close(fig)