# Outline

**Objective** - To search for physics beyond the Standard Model. <br><br>
The Standard Model was completed by the discovery of the Higgs boson, but it is still an incomplete theory: it is unable to explain certain phenomena such as dark matter and gravity. <br><br>

**Lepton flavor** (e<sup>-</sup>, μ<sup>-</sup>, τ<sup>-</sup>, and the corresponding neutrinos) is a conserved quantity in the Standard Model. But the rare decay addressed in this particular problem, τ → 3μ, violates it.<br><br>

This problem is focused on finding this rare decay, which could pave the way for physics beyond the Standard Model.

The dataset is divided among following files-

* **training.csv** - It is the dataset consisting of real data and simulated data
* **test.csv** - Unseen data which is to be used to check our results
* **check_agreement.csv** - Data for a control channel, Ds → φ(→ μμ)π, which has a topology similar to that of τ → 3μ. This test is necessary to avoid the classifier being biased towards the simulated data
* **check_correlation.csv** - Used to check that the classifier is not too correlated with the mass, so as to avoid false peaks. As shown later, the mass alone separates background and signal almost perfectly

# Importing Libraries

In [None]:
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from hep_ml.gradientboosting import UGradientBoostingClassifier
from hep_ml.losses import BinFlatnessLossFunction

**Basic Utility functions**

In [None]:
from sklearn.metrics import roc_curve, auc


def __rolling_window(data, window_size):
    """
    Build every contiguous window of a fixed length along the last axis.

    The result is a strided view created from ``data``'s own shape and strides.
    :param data: numpy array
    :param window_size: number of consecutive elements in each window
    :return: array with one extra trailing axis of length ``window_size``
    Example: data = array(1, 2, 3, 4, 5, 6), window_size = 4
        gives array(array(1, 2, 3, 4), array(2, 3, 4, 5), array(3, 4, 5, 6))
    """
    last_axis_length = data.shape[-1]
    last_axis_stride = data.strides[-1]
    # A window can start at every position that still leaves room for a full window.
    n_windows = last_axis_length - window_size + 1
    windowed_shape = data.shape[:-1] + (n_windows, window_size)
    # Stepping to the next window and to the next element inside a window both
    # advance by one element of the last axis, hence the repeated stride.
    windowed_strides = data.strides + (last_axis_stride,)
    return numpy.lib.stride_tricks.as_strided(data, shape=windowed_shape, strides=windowed_strides)


def __cvm(subindices, total_events):
    """
    Cramer-von Mises distance between a subset and the full, rank-ordered sample.

    The full sample is represented by its ranks 0 .. total_events - 1, so its cdf
    is the uniform staircase; the subset is given by the ranks of its members.
    :param subindices: ranks of the events that form the subset distribution
    :param total_events: number of events in the full distribution
    :return: mean squared difference between the two cdfs
    """
    reference_cdf = numpy.arange(1, total_events + 1, dtype='float') / total_events
    # Count subset members per rank, then accumulate into an (unnormalised) cdf.
    counts_per_rank = numpy.bincount(subindices, minlength=total_events)
    subset_cdf = numpy.cumsum(counts_per_rank, dtype='float')
    subset_cdf /= 1.0 * subset_cdf[-1]
    squared_gap = (reference_cdf - subset_cdf) ** 2
    return numpy.mean(squared_gap)


def compute_cvm(predictions, masses, n_neighbours=200, step=50):
    """
    Average Cramer-von Mises (cvm) metric over sliding mass bins.

    Events are ordered by mass; every window of ``n_neighbours`` consecutive events
    is one mass bin, and the cdf of predictions inside the bin is compared with the
    cdf of all predictions.
    :param predictions: array-like, predictions
    :param masses: array-like, in case of Kaggle tau23mu this is reconstructed mass
    :param n_neighbours: count of neighbours for event to define mass bin
    :param step: step through sorted mass-array to define next center of bin
    :return: average cvm value
    """
    predictions = numpy.array(predictions)
    masses = numpy.array(masses)
    assert len(predictions) == len(masses)

    # Order the predictions by the mass of their event.
    mass_order = numpy.argsort(masses)
    predictions_by_mass = predictions[mass_order]

    # Replace each prediction by its rank among all predictions (stable sort, so
    # ties are resolved deterministically).
    ranks = numpy.argsort(numpy.argsort(predictions_by_mass, kind='mergesort'), kind='mergesort')
    n_events = len(ranks)

    # Every `step`-th window of neighbouring masses contributes one cvm value.
    mass_bins = __rolling_window(ranks, window_size=n_neighbours)[::step]
    per_bin_cvm = [__cvm(subindices=mass_bin, total_events=n_events) for mass_bin in mass_bins]
    return numpy.mean(per_bin_cvm)


def __roc_curve_splitted(data_zero, data_one, sample_weights_zero, sample_weights_one):
    """
    Weighted ROC curve for two samples that are supplied separately.
    :param data_zero: 0-labeled data
    :param data_one:  1-labeled data
    :param sample_weights_zero: weights for 0-labeled data
    :param sample_weights_one:  weights for 1-labeled data
    :return: (fpr, tpr) arrays of the roc curve
    """
    n_zero, n_one = len(data_zero), len(data_one)
    # Labels follow the concatenation order used below: all zeros, then all ones.
    labels = [0] * n_zero + [1] * n_one
    scores = numpy.concatenate([data_zero, data_one])
    weights = numpy.concatenate([sample_weights_zero, sample_weights_one])
    false_positive_rate, true_positive_rate, _ = roc_curve(labels, scores, sample_weight=weights)
    return false_positive_rate, true_positive_rate


def compute_ks(data_prediction, mc_prediction, weights_data, weights_mc):
    """
    Compute Kolmogorov-Smirnov (ks) distance between real data predictions cdf and Monte Carlo one.

    The weights of each sample are normalised to sum to one on private float copies,
    so the caller's arrays are never modified and integer weights are accepted.
    :param data_prediction: array-like, real data predictions
    :param mc_prediction: array-like, Monte Carlo data predictions
    :param weights_data: array-like, real data weights
    :param weights_mc: array-like, Monte Carlo weights
    :return: ks value
    :raises AssertionError: if lengths differ or predictions fall outside [0, 1]
    """
    assert len(data_prediction) == len(weights_data), 'Data length and weight one must be the same'
    assert len(mc_prediction) == len(weights_mc), 'Data length and weight one must be the same'

    data_prediction, mc_prediction = numpy.array(data_prediction), numpy.array(mc_prediction)
    # Force a float dtype: normalising integer weights in place would fail because
    # the floating-point quotient cannot be written back into an integer array.
    weights_data = numpy.array(weights_data, dtype=float)
    weights_mc = numpy.array(weights_mc, dtype=float)

    assert numpy.all(data_prediction >= 0.) and numpy.all(data_prediction <= 1.), 'Data predictions are out of range [0, 1]'
    assert numpy.all(mc_prediction >= 0.) and numpy.all(mc_prediction <= 1.), 'MC predictions are out of range [0, 1]'

    weights_data = weights_data / numpy.sum(weights_data)
    weights_mc = weights_mc / numpy.sum(weights_mc)

    # With data labelled 0 and MC labelled 1, fpr and tpr are the two weighted
    # cdf-like curves; their largest gap is the KS distance.
    fpr, tpr = __roc_curve_splitted(data_prediction, mc_prediction, weights_data, weights_mc)

    Dnm = numpy.max(numpy.abs(fpr - tpr))
    return Dnm


def roc_auc_truncated(labels, predictions, tpr_thresholds=(0.2, 0.4, 0.6, 0.8),
                      roc_weights=(4, 3, 2, 1, 0)):
    """
    Compute weighted area under ROC curve.

    The true-positive-rate axis is cut into segments at ``tpr_thresholds``; the area
    inside each segment is multiplied by the matching entry of ``roc_weights`` and the
    total is normalised so that an ideal classifier scores 1.
    :param labels: array-like, true labels
    :param predictions: array-like, predictions
    :param tpr_thresholds: array-like, true positive rate thresholds delimiting the ROC segments
    :param roc_weights: array-like, weights for true positive rate segments
    :return: weighted AUC
    :raises AssertionError: if predictions fall outside [0, 1] or the lengths of
        thresholds and weights are incompatible
    """
    # Accept plain lists as well as arrays/Series for the range check below.
    predictions = numpy.asarray(predictions)
    assert numpy.all(predictions >= 0.) and numpy.all(predictions <= 1.), 'Data predictions are out of range [0, 1]'
    assert len(tpr_thresholds) + 1 == len(roc_weights), 'Incompatible lengths of thresholds and weights'
    fpr, tpr, _ = roc_curve(labels, predictions)
    area = 0.
    tpr_thresholds = [0.] + list(tpr_thresholds) + [1.]
    segments = zip(roc_weights, tpr_thresholds[:-1], tpr_thresholds[1:])
    for weight, lower, upper in segments:
        tpr_cut = numpy.minimum(tpr, upper)
        tpr_previous = numpy.minimum(tpr, lower)
        # `auc` is called without the `reorder` keyword, which newer scikit-learn
        # releases reject; the fpr returned by roc_curve is already non-decreasing.
        area += weight * (auc(fpr, tpr_cut) - auc(fpr, tpr_previous))
    tpr_thresholds = numpy.array(tpr_thresholds)
    # roc auc normalization to be 1 for an ideal classifier
    area /= numpy.sum((tpr_thresholds[1:] - tpr_thresholds[:-1]) * numpy.array(roc_weights))
    return area

# M3Body Event Selection

**Event Selection** <br><br>

Tau particles are the heavier siblings of the electron, with a mass of around 1.78 GeV/c<sup>2</sup>. The M3Body selection uses the geometric and kinematic properties of the 3-track system, which help in identifying the rare decay. Some good tips regarding event selection are mentioned in the [paper](https://arxiv.org/pdf/1409.8548.pdf)

1. Tau leptons usually originate from a heavy quark decay, which are formed via proton-proton collision.
2. Candidates are selected via 3 tracks which form a vertex with an invariant mass around 1780 MeV/c<sup>2</sup> (the mass of the tau particle), and which is displaced significantly from the PV

The point where proton-proton collision occurs is called Primary Vertex(PV) and the point where tau particle will decay into 3 particles is called secondary vertex. Since the 3 particle decay happens at Secondary vertex, there must be a significant distance between tracks and the PV.

3. Due to the low Q-value, the angle between the momentum vector and the line joining the collision and decay vertices is small, i.e. the two are almost collinear. So dira will have quite a high value

The paper uses 2 classifiers: M3Body (using the geometric properties to form a vertex of 3 tracks) and MPID (particle identification of the muons). But here on Kaggle we are focused on the M3Body classification only. Instead of using MPID, we are given the feature 'min_ANNmuon', which provides us with the probability of a particle being a muon




# Data Analysis

In [None]:
# Load the labelled sample and keep one view per class for the comparison plots.
train = pd.read_csv("../input/flavours-of-physics-kernels-only/training.csv")
signal = train.loc[train["signal"] == 1]
background = train.loc[train["signal"] == 0]
# Default figure size for the single-panel histograms that follow.
plt.rcParams["figure.figsize"] = (8, 5)

Dira measures the angle (stored as its cosine, hence the values close to 1) between the momentum vector of the 3-particle system and the line joining the PV (p-p collision vertex) and the DV (decay vertex). The two directions are almost collinear due to the low loss during decay

In [None]:
# `density=True` normalises each histogram to unit area. It replaces the `normed`
# keyword, which current Matplotlib releases no longer accept.
plt.hist(signal.dira, range=(0.997, 1), bins=100, label='Signal', alpha=0.7, density=True)
plt.hist(background.dira, range=(0.997, 1), bins=100, label='Background', alpha=0.7, density=True)
plt.xlabel('Dira')
# Same y-axis label as the other normalised histograms in this notebook.
plt.ylabel('Normalized Fraction')
plt.legend()
plt.show()


VertexChi2 signifies the quality of the vertex fit of the three-body decay. The lower the value, the better the fit. Since the τ → 3μ decay has a better vertex fit, it is a highly valued feature in our classifier

In [None]:
# `density=True` normalises each histogram to unit area. It replaces the `normed`
# keyword, which current Matplotlib releases no longer accept.
plt.hist(signal.VertexChi2, range=(0, 16), bins=100, label='Signal', alpha=0.7, density=True)
plt.hist(background.VertexChi2, range=(0, 16), bins=100, label='Background', alpha=0.7, density=True)
plt.xlabel(r'Vertex $\chi^2$')
plt.ylabel('Normalized Fraction')
plt.legend()
plt.show()

IPSig seems to be a highly valued feature. It signifies the closest distance between PV and the track

In [None]:
# `density=True` normalises each histogram to unit area. It replaces the `normed`
# keyword, which current Matplotlib releases no longer accept.
plt.hist(signal.IPSig, range=(0, 25), bins=100, label='Signal', alpha=0.7, density=True)
plt.hist(background.IPSig, range=(0, 25), bins=100, label='Background', alpha=0.7, density=True)
plt.xlabel('IPSig')
plt.ylabel('Normalized Fraction')
plt.legend()
plt.show()

Mass is a feature which can almost perfectly distinguish between signal and background. Signal is a peak concentrated around the mass of the tau particle (1780 MeV/c<sup>2</sup>).<br><br>
The background is constructed by removing the actual data around that mass region. These observations are clearly visible in the graphs below

In [None]:
# Side-by-side mass histograms for background (left) and signal (right).
plt.rcParams["figure.figsize"] = (10, 3)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False)

# Label each histogram when it is drawn and build the legend afterwards: a legend
# requested before anything is plotted has no artist to describe, and passing a
# bare string as the label list is read as a sequence of single characters.
train.loc[train['signal'] == 0, 'mass'].hist(bins=100, ax=ax1, label='Background')
ax1.legend()
train.loc[train['signal'] == 1, 'mass'].hist(bins=100, ax=ax2, label='Signal')
ax2.legend()
plt.show()

*min_ANNmuon* - This feature is the result of using an ANN for particle identification. It is clearly visible here that a high value of this feature corresponds to a high fraction of signal. <br><br>
This feature is not to be used in the classifier training

In [None]:
# Share of signal events on each side of the min_ANNmuon = 0.4 cut.
low_muon_mask = train['min_ANNmuon'] <= 0.4
high_muon_mask = train['min_ANNmuon'] > 0.4
lowMuonSignal = train.loc[low_muon_mask, 'signal'].sum() / low_muon_mask.sum()
highMuonSignal = train.loc[high_muon_mask, 'signal'].sum() / high_muon_mask.sum()

# The value is signal / (signal + background), i.e. a signal fraction, so the
# message says so instead of calling it a signal-to-background ratio.
print(f"The signal fraction among events with muon probability of at most 0.4 is {lowMuonSignal*100}%")
print(f"The signal fraction among events with muon probability higher than 0.4 is {highMuonSignal*100}%")

# Validation Set

As recommended, *min_ANNmuon>0.4* is to be used for evaluating score. So we have created validation set satisfying the test set criteria 

In [None]:
def getTrnValPair(train_data):
    """
    Split the labelled data into a training part and a validation part.

    Only rows with ``min_ANNmuon > 0.4`` can end up in the validation part (10% of
    them, fixed seed); all remaining rows are used for training.
    :param train_data: DataFrame with a 'min_ANNmuon' column
    :return: (training_set, validation_set)
    """
    passes_cut = train_data[train_data['min_ANNmuon'] > 0.4]
    fails_cut = train_data[train_data['min_ANNmuon'] <= 0.4]
    kept_for_training, validation_set = train_test_split(passes_cut, test_size=0.10, random_state=42)
    training_set = pd.concat([fails_cut, kept_for_training])
    return training_set, validation_set

# Kinematic Features

In [None]:
def addKinematicFeatures(df):
    """
    Add momentum-based columns to ``df`` in place and return it.

    New columns: 'pz', 'p', 'NEW_FD_SUMP' and 'vel'.
    :param df: DataFrame with the per-track 'pN_p' / 'pN_pt' columns plus 'pt',
        'FlightDistance' and 'LifeTime'
    :return: the same DataFrame object, with the new columns added
    """
    def third_component(track):
        # sqrt(p^2 - pt^2) for one track
        return (df[track + '_p'] ** 2 - df[track + '_pt'] ** 2) ** 0.5

    df['pz'] = third_component('p0') + third_component('p1') + third_component('p2')
    df['p'] = (df['pt'] ** 2 + df['pz'] ** 2) ** 0.5

    summed_track_momentum = df['p0_p'] + df['p1_p'] + df['p2_p']
    df['NEW_FD_SUMP'] = df['FlightDistance'] / summed_track_momentum
    df['vel'] = df['FlightDistance'] / df['LifeTime']
    return df

# Geometric Features

In [None]:
def addGemetricFeatures(df):
    """
    Add vertex/isolation summary columns to ``df`` in place and return it.

    New columns: 'Flight_Dist_Sig', 'iso_diff', 'DOC_max', 'iso_bdt_p_min',
    'p_track_Chi2Dof_MAX', 'CDF_sum' and 'NEW5_lt'.
    :param df: DataFrame holding the raw columns read below
    :return: the same DataFrame object, with the new columns added
    """
    df['Flight_Dist_Sig'] = df['FlightDistance'] / df['FlightDistanceError']

    # Average of the two isolation-triplet means, relative to 'iso'.
    first_triplet_mean = (df['isolationa'] + df['isolationb'] + df['isolationc']) / 3
    second_triplet_mean = (df['isolationd'] + df['isolatione'] + df['isolationf']) / 3
    df['iso_diff'] = (first_triplet_mean + second_triplet_mean) / 2 - df['iso']

    # Row-wise extremes over the three per-track / per-pair columns.
    df['DOC_max'] = df[['DOCAone', 'DOCAtwo', 'DOCAthree']].max(axis=1)
    df['iso_bdt_p_min'] = df[['p0_IsoBDT', 'p1_IsoBDT', 'p2_IsoBDT']].min(axis=1)
    df['p_track_Chi2Dof_MAX'] = df[['p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof']].max(axis=1)

    df['CDF_sum'] = df['CDF1'] + df['CDF2'] + df['CDF3']

    summed_track_ip = df['p0_IP'] + df['p1_IP'] + df['p2_IP']
    df['NEW5_lt'] = df['LifeTime'] * summed_track_ip / 3
    return df
    

In [None]:
# Columns removed from a frame before it is handed to a classifier.
drop_cols = [
    # label / bookkeeping columns and raw columns used only as inputs to derived ones
    'min_ANNmuon', 'production', 'signal', 'mass', 'SPDhits', 'FlightDistance', 'id',
    # raw isolation columns (summarised by 'iso_diff')
    'isolationa', 'isolationb', 'isolationc', 'isolationd', 'isolatione', 'isolationf',
    # raw DOCA columns (summarised by 'DOC_max')
    'DOCAone', 'DOCAtwo', 'DOCAthree',
    # raw per-track IsoBDT columns (summarised by 'iso_bdt_p_min')
    'p0_IsoBDT', 'p1_IsoBDT', 'p2_IsoBDT',
    # raw per-track chi2/dof columns (summarised by 'p_track_Chi2Dof_MAX')
    'p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof',
    'FlightDistanceError',
    # raw CDF columns (summarised by 'CDF_sum')
    'CDF2', 'CDF3', 'CDF1',
    'p0_eta', 'p1_eta', 'p2_eta',
    'LifeTime',
]

# XGBoost

In [None]:
def fitXGBClassifier(training_set):
    """
    Fit an XGBoost classifier on ``training_set``.

    The columns listed in ``drop_cols`` are removed from the inputs; the 'signal'
    column is the target.
    :param training_set: DataFrame containing the features and the 'signal' column
    :return: the fitted XGBClassifier
    """
    features = training_set.drop(drop_cols, axis=1)
    target = training_set['signal']
    model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=3,
                              random_state=42, n_jobs=-1)
    model.fit(features, target)
    return model

def xgbImportance(clfXGB):
    """
    Show the XGBoost feature-importance plot for a fitted model.

    Note: this changes the global default figure size.
    :param clfXGB: fitted XGBoost model
    """
    plt.rcParams.update({"figure.figsize": (15, 30)})
    plot_importance(clfXGB)
    plt.show()

def getXGBValScore(clfXGB,validation_set):
    """
    Score a fitted XGBoost model on the validation set.
    :param clfXGB: fitted classifier exposing ``predict_proba``
    :param validation_set: DataFrame with the features and the 'signal' column
    :return: truncated, weighted ROC AUC of the class-1 probabilities
    """
    validation_features = validation_set.drop(drop_cols, axis=1)
    # Second column of predict_proba is the probability of the class-1 label.
    signal_probability = clfXGB.predict_proba(validation_features)[:, 1]
    return roc_auc_truncated(validation_set['signal'], signal_probability)

In [None]:
%%time
# Hold out a validation split, then fit the XGBoost model on the remaining rows.
trn,val= getTrnValPair(train)
clfXGB= fitXGBClassifier(trn)

In [None]:
# Show the feature-importance plot of the XGBoost model fitted above.
xgbImportance(clfXGB)

# Uniform boost

This is a boosting algorithm based on this [paper](https://arxiv.org/pdf/1410.4140.pdf), where we use a customized loss function which maintains a trade-off between classifier accuracy and mass correlation<br><br>



In [None]:
# Extra columns to exclude from the uniform-boosting model on top of `drop_cols`.
removeUBoostFeatures = []


def fitUBoostClassifier(training_set):
    """
    Fit a uniform gradient-boosting classifier with a flatness loss in 'mass'.

    The whole frame is passed to ``fit`` (it still contains the 'mass' column the
    loss is configured with); ``train_features`` names the columns left after
    removing ``drop_cols`` and ``removeUBoostFeatures``.
    :param training_set: DataFrame with the features, 'mass' and 'signal'
    :return: the fitted UGradientBoostingClassifier
    """
    feature_names = training_set.drop(drop_cols + removeUBoostFeatures, axis=1).columns
    flatness_loss = BinFlatnessLossFunction(['mass'], n_bins=15, fl_coefficient=0.1, uniform_label=0)
    uBoost = UGradientBoostingClassifier(
        loss=flatness_loss,
        n_estimators=40,
        subsample=0.1,
        max_depth=7,
        min_samples_leaf=10,
        learning_rate=0.1,
        train_features=feature_names,
        random_state=11,
    )
    uBoost.fit(training_set, training_set['signal'])
    return uBoost

def plotUBoostFeatureImportance(baseline,cols):
    """
    Draw a bar chart of feature importances in ascending order.

    Note: this changes the global default figure size.
    :param baseline: fitted model exposing ``feature_importances_``
    :param cols: feature names, indexable by an integer array and with a ``shape``
    """
    plt.rcParams["figure.figsize"] = (15, 30)
    importances = baseline.feature_importances_
    # Indices that order the importances from smallest to largest.
    ascending = numpy.argsort(importances)
    bar_positions = range(cols.shape[0])

    plt.figure()
    plt.title("Feature Importance")
    plt.bar(bar_positions, importances[ascending])
    # Label each bar with the matching feature name, rotated to stay readable.
    plt.xticks(bar_positions, cols[ascending], rotation=90)
    plt.show()
    
def getValUBoostScore(uBoost,validation_set):
    """
    Score a fitted uniform-boosting model on the validation set.
    :param uBoost: fitted classifier exposing ``predict_proba``
    :param validation_set: DataFrame with the features and the 'signal' column
    :return: truncated, weighted ROC AUC of the class-1 probabilities
    """
    excluded_columns = drop_cols + removeUBoostFeatures
    validation_features = validation_set.drop(excluded_columns, axis=1)
    # Second column of predict_proba is the probability of the class-1 label.
    signal_probability = uBoost.predict_proba(validation_features)[:, 1]
    return roc_auc_truncated(validation_set['signal'], signal_probability)

In [None]:
%%time
# Add the engineered columns, redo the train/validation split on the enriched
# frame, fit the uniform-boosting model and report its validation score.
train = addGemetricFeatures(train)
train = addKinematicFeatures(train)
trn, val = getTrnValPair(train)
uBoost = fitUBoostClassifier(trn)
getValUBoostScore(uBoost, val)

In [None]:
# Names of the columns the uniform-boosting model was trained on.
uboost_feature_names = trn.drop(drop_cols + removeUBoostFeatures, axis=1).columns
plotUBoostFeatureImportance(uBoost, uboost_feature_names)

# Agreement Test

In [None]:
# check_agreement.csv is read with 'id' as the index and is scored without the
# 'min_ANNmuon', 'production' and 'mass' columns, so those names are left out of
# the drop list.
ks_drop_cols=[d for d in drop_cols if d not in ['min_ANNmuon','production','mass','id']]
check_agreement = pd.read_csv('../input/flavours-of-physics-kernels-only/check_agreement.csv', index_col='id')

# The classifier was trained on the engineered columns, so they have to be built
# here as well before predicting; otherwise the model's inputs are missing.
check_agreement = addKinematicFeatures(addGemetricFeatures(check_agreement))
agreement_probs = uBoost.predict_proba(check_agreement.drop(ks_drop_cols,axis=1))[:, 1]

# KS distance between the predictions for signal == 0 rows and signal == 1 rows,
# each weighted by its 'weight' column.
ks = compute_ks(
    agreement_probs[check_agreement['signal'].values == 0],
    agreement_probs[check_agreement['signal'].values == 1],
    check_agreement[check_agreement['signal'] == 0]['weight'].values,
    check_agreement[check_agreement['signal'] == 1]['weight'].values)
print(ks,ks < 0.09)

# Correlation Check

In [None]:
# check_correlation.csv is read with 'id' as the index; 'mass' is kept because
# the CvM metric below needs it.
cvm_drop_cols=[d for d in drop_cols if d not in ['min_ANNmuon','production','signal','id','mass']]
check_correlation = pd.read_csv('../input/flavours-of-physics-kernels-only/check_correlation.csv', index_col='id')

# The classifier was trained on the engineered columns, so they have to be built
# here as well before predicting; otherwise the model's inputs are missing.
check_correlation = addKinematicFeatures(addGemetricFeatures(check_correlation))
correlation_probs = uBoost.predict_proba(check_correlation.drop(cvm_drop_cols,axis=1))[:, 1]
cvm = compute_cvm(correlation_probs, check_correlation['mass'])
print('CvM metric', cvm, cvm < 0.002)

# Test Submission

In [None]:
test= pd.read_csv('../input/flavours-of-physics-kernels-only/test.csv')

# The classifier was trained on the engineered columns, so they have to be built
# for the test set too before predicting; otherwise the model's inputs are missing.
test = addKinematicFeatures(addGemetricFeatures(test))
test_probs= uBoost.predict_proba(test)[:, 1]
test['prediction']= test_probs
# The submission file holds only the event id and the predicted signal probability.
test[['id','prediction']].to_csv("submission.csv",index=False)