In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import xgboost

In [2]:
# Load the gzip-compressed CSV inputs from ./data.
# The track tables use their first CSV column as the DataFrame index (index_col=0);
# the electron tables keep the default index, so their first CSV column is kept as
# an ordinary column.
tracks = pd.read_csv('./data/DS_1_train.gz', index_col=0, compression='gzip')
electrons = pd.read_csv('./data/DS_1_electron_train.gz', compression='gzip')
test_tracks = pd.read_csv('./data/DS_1_test.gz', index_col=0, compression='gzip')
test_electrons = pd.read_csv('./data/DS_1_electron_test.gz', compression='gzip')

  mask |= (ar1 == a)


In [3]:
# Preview the first rows of the training track table.
tracks.head()

Unnamed: 0_level_0,event_id,X,Y,Z,TX,TY,chi2,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-999,42720.238281,75152.40625,32325.0,-0.475539,0.405506,2.652586,0.0
1,-999,46023.816406,37043.359375,21981.0,0.330822,-0.013988,2.497527,0.0
2,-999,52706.117188,33872.730469,16809.0,0.342634,-0.058724,2.120421,0.0
3,-999,47334.101562,9698.53125,60771.0,0.007868,-0.632822,0.618535,0.0
4,-999,67380.835938,19137.265625,37497.0,-0.212537,-0.192243,2.799303,0.0


In [4]:
# Preview the tracks whose event_id differs from -999.
# NOTE(review): in the rendered previews the -999 rows have signal 0.0 and the other
# rows have signal 1.0 -- presumably -999 marks tracks not attached to any event;
# confirm against the dataset description.
tracks[tracks["event_id"] !=-999].head()

Unnamed: 0_level_0,event_id,X,Y,Z,TX,TY,chi2,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29,411,53290.710938,22941.925781,63357.0,-0.258045,0.710882,1.846556,1.0
32,27354,36921.367188,21285.517578,72408.0,-0.924607,0.81938,0.849372,1.0
37,5586,43593.214844,56103.648438,45255.0,-0.330734,-0.46748,0.646188,1.0
39,3009,54343.628906,49445.316406,42669.0,0.242685,-0.080539,0.413605,1.0
41,739,41051.804688,34390.445312,60771.0,0.035552,-0.04268,0.670655,1.0


In [5]:
# Preview the first rows of the training electron table.
electrons.head()

Unnamed: 0,event_id,P,X,Y,Z,TX,TY
0,43311,6.059411,50445.277344,18868.576172,32163.242188,-0.145684,-0.044486
1,44301,4.829437,44404.660156,24444.984375,10518.998047,-0.044544,0.132461
2,41829,4.929091,49269.140625,45781.617188,37460.796875,0.06094,0.10149
3,38613,1.587992,44798.660156,43921.625,35979.109375,-0.332825,0.115194
4,18087,7.594307,45228.773438,43913.035156,28341.851562,0.021005,0.058128


In [6]:
def _unit_directions(frame):
    """Return an (n_rows, 3) array of unit vectors along (TX, TY, 1) for each row."""
    slopes = frame[["TX", "TY"]].values
    vectors = np.column_stack((slopes, np.ones(len(slopes))))
    return vectors / np.linalg.norm(vectors, axis=1).reshape(-1, 1)


def find_near_electrons(tracks, electrons, n_ele_to_store):
    """Describe, for every track, its `n_ele_to_store` closest electrons.

    "Closest" is measured by the perpendicular distance from the track position
    (X, Y, Z) to the straight line that passes through the electron position
    along the electron direction (TX, TY, 1).

    Parameters
    ----------
    tracks : pandas.DataFrame
        Needs the columns X, Y, Z, TX, TY.
    electrons : pandas.DataFrame
        Needs the columns X, Y, Z, TX, TY. Rows are addressed by position, so
        the frame's index labels are irrelevant.
    n_ele_to_store : int
        How many nearest electrons to describe per track.

    Returns
    -------
    res : numpy structured array of length len(tracks)
        For k = 1..n_ele_to_store (k = 1 is the closest electron) the fields
        ``ele{k}_idx``  (int32)   row position of the electron in `electrons`,
        ``ele{k}_dist`` (float64) track-to-electron-line distance,
        ``ele{k}_z``    (float64) electron Z,
        ``ele{k}_r``    (float64) electron sqrt(X**2 + Y**2),
        ``ele{k}_phi``  (float64) electron arctan2(Y, X),
        ``ele{k}_dir``  (float64) norm of the cross product of the track and
                                  electron unit directions.
    var_names : list of str
        The field names of `res`, in field order.

    Raises
    ------
    IndexError
        If `n_ele_to_store` exceeds the number of electrons.
    """
    n_ele = len(electrons)
    n_tracks = len(tracks)
    if n_ele_to_store > n_ele:
        raise IndexError(
            "n_ele_to_store={} exceeds the number of available electrons ({})".format(
                n_ele_to_store, n_ele))

    # Broadcast tracks along axis 0 and electrons along axis 1.
    tr_pos = tracks[["X", "Y", "Z"]].values.reshape(-1, 1, 3)
    ele_pos = electrons[["X", "Y", "Z"]].values.reshape(1, -1, 3)
    ele_dir = _unit_directions(electrons).reshape(1, -1, 3)
    tr_dir = _unit_directions(tracks).reshape(-1, 1, 3)

    # |(electron - track) x unit_direction| is the point-to-line distance.
    dist = np.linalg.norm(np.cross(ele_pos - tr_pos, ele_dir), axis=2)
    dir_norm = np.linalg.norm(np.cross(tr_dir, ele_dir), axis=2)

    # Rank electrons per track in one vectorised call; mergesort is stable, so
    # electrons at equal distance keep their original row order.
    order = np.argsort(dist, axis=1, kind='mergesort')[:, :n_ele_to_store]
    rows = np.arange(n_tracks).reshape(-1, 1)
    near_dist = dist[rows, order]
    near_dir = dir_norm[rows, order]

    # Plain arrays, so the lookups below are positional rather than label based.
    ele_x = electrons["X"].values
    ele_y = electrons["Y"].values
    e_z = electrons["Z"].values
    e_r = np.sqrt(ele_x ** 2 + ele_y ** 2)
    e_phi = np.arctan2(ele_y, ele_x)

    field_specs = [('idx', 'int32'), ('dist', 'float64'), ('z', 'float64'),
                   ('r', 'float64'), ('phi', 'float64'), ('dir', 'float64')]
    var_names = []
    dtype_array = []
    for rank in range(n_ele_to_store):
        for suffix, field_type in field_specs:
            var_name = 'ele{}_{}'.format(rank + 1, suffix)
            var_names.append(var_name)
            dtype_array.append((var_name, field_type))

    res = np.zeros(n_tracks, dtype=dtype_array)
    for rank in range(n_ele_to_store):
        prefix = 'ele{}_'.format(rank + 1)
        nearest = order[:, rank]
        res[prefix + 'idx'] = nearest
        res[prefix + 'dist'] = near_dist[:, rank]
        res[prefix + 'z'] = e_z[nearest]
        res[prefix + 'r'] = e_r[nearest]
        res[prefix + 'phi'] = e_phi[nearest]
        res[prefix + 'dir'] = near_dir[:, rank]

    return res, var_names

In [7]:
def update_tracks(tracks, electrons, n_ele_to_store):
    """Add cylindrical coordinates and nearest-electron features to `tracks` in place.

    New columns written to `tracks`:
      * "R"   -- sqrt(X**2 + Y**2)
      * "phi" -- arctan2(Y, X)
      * one column per field returned by find_near_electrons
        (ele{k}_idx, ele{k}_dist, ele{k}_z, ele{k}_r, ele{k}_phi, ele{k}_dir
        for k = 1..n_ele_to_store).

    Parameters
    ----------
    tracks : pandas.DataFrame
        Track table; modified in place.
    electrons : pandas.DataFrame
        Electron table passed through to find_near_electrons.
    n_ele_to_store : int
        Number of nearest electrons to describe per track.

    Returns
    -------
    list of str
        Names of the nearest-electron columns that were added ("R" and "phi"
        are not included).
    """
    tracks["R"] = np.sqrt(tracks["X"] ** 2 + tracks["Y"] ** 2)
    tracks["phi"] = np.arctan2(tracks["Y"], tracks["X"])

    ele_match, var_names = find_near_electrons(tracks, electrons, n_ele_to_store)
    for name in var_names:
        # Each field is a plain array with one entry per track row, assigned by position.
        tracks[name] = ele_match[name]
    return var_names

In [8]:
# Number of nearest electrons described per track.
n_ele_to_store = 4
# update_tracks adds the new columns to `tracks` in place and returns the names of
# the nearest-electron columns it created.
type_names = update_tracks(tracks, electrons,n_ele_to_store)
tracks.head()

Unnamed: 0_level_0,event_id,X,Y,Z,TX,TY,chi2,signal,R,phi,...,ele3_z,ele3_r,ele3_phi,ele3_dir,ele4_idx,ele4_dist,ele4_z,ele4_r,ele4_phi,ele4_dir
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-999,42720.238281,75152.40625,32325.0,-0.475539,0.405506,2.652586,0.0,86445.953774,1.0539,...,28343.957031,73712.367346,1.054133,0.494994,60,13169.542192,36184.230469,84592.017016,0.896431,0.608639
1,-999,46023.816406,37043.359375,21981.0,0.330822,-0.013988,2.497527,0.0,59079.62551,0.677706,...,4156.058594,57881.604298,0.678611,0.272449,43,4222.349577,28342.835938,53970.999352,0.663548,0.54566
2,-999,52706.117188,33872.730469,16809.0,0.342634,-0.058724,2.120421,0.0,62652.187978,0.571206,...,10524.460938,64564.185547,0.619916,0.252169,80,4665.237803,35980.671875,66103.912132,0.520914,0.325657
3,-999,47334.101562,9698.53125,60771.0,0.007868,-0.632822,0.618535,0.0,48317.477988,0.202098,...,16884.892578,55278.95807,0.281771,0.523441,78,6771.537654,27066.84375,63920.578342,0.530128,0.175096
4,-999,67380.835938,19137.265625,37497.0,-0.212537,-0.192243,2.799303,0.0,70045.784936,0.276729,...,25792.894531,71572.769721,0.363856,0.177727,74,7026.170707,28342.261719,63806.854656,0.300988,0.309728


In [9]:
# Model inputs: the raw track quantities plus, for every stored electron, its
# distance and direction-mismatch features (ele1_dist, ele1_dir, ele2_dist, ...).
names = ["ele{}_dist", "ele{}_dir"]
features = ["X", "Y", "Z", "TX", "TY", "chi2"]
features.extend(
    pattern.format(rank)
    for rank in range(1, n_ele_to_store + 1)
    for pattern in names
)
# Only the closest electron contributes its z position.
features.append("ele1_z")

# Hold out part of the labelled tracks (train_test_split default size) with a fixed seed.
train, test = train_test_split(tracks, random_state=12526)
X_train, X_test = train[features], test[features]
Y_train, Y_test = train['signal'], test['signal']

In [10]:
%%time
# Fit a 1000-tree boosted classifier (learning rate 0.1) on the training split,
# using all available cores (n_jobs=-1). The recorded run took about 11 minutes
# of wall time.
xgb = xgboost.XGBClassifier(n_jobs=-1, n_estimators=1000, learning_rate=0.1).fit(X_train,Y_train)

CPU times: user 33min 12s, sys: 9.27 s, total: 33min 21s
Wall time: 10min 52s


In [11]:
# ROC AUC of the predicted probability (column 1 of predict_proba) on both splits.
train_proba = xgb.predict_proba(X_train)[:, 1]
test_proba = xgb.predict_proba(X_test)[:, 1]
train_score = roc_auc_score(train['signal'], train_proba)
test_score = roc_auc_score(test['signal'], test_proba)
print("train score = {}".format(train_score))
print("test score = {}".format(test_score))

train score = 0.986973204195
test score = 0.983712879899


In [12]:
# List the features from most to least important: rank, name, importance.
importances = xgb.feature_importances_
indices = importances.argsort()
N = len(indices)
for rank, position in enumerate(indices[::-1], 1):
    print("{}\t{}\t{}".format(rank, features[position], importances[position]))

1	TY	0.120845459402
2	TX	0.10246592015
3	Y	0.0972583889961
4	ele1_dist	0.0880686193705
5	chi2	0.0874559655786
6	ele1_z	0.080104149878
7	X	0.0748966187239
8	Z	0.0738244727254
9	ele1_dir	0.0537601485848
10	ele4_dist	0.0506968908012
11	ele2_dist	0.040588144213
12	ele3_dist	0.0350742824376
13	ele2_dir	0.034155305475
14	ele3_dir	0.0321641899645
15	ele4_dir	0.0286414455622


In [13]:
# Build the same derived columns on the test tracks (added in place); the returned
# list of column names is not needed here.
update_tracks(test_tracks, test_electrons, n_ele_to_store)
test_tracks.head()

Unnamed: 0_level_0,X,Y,Z,TX,TY,chi2,R,phi,ele1_idx,ele1_dist,...,ele3_z,ele3_r,ele3_phi,ele3_dir,ele4_idx,ele4_dist,ele4_z,ele4_r,ele4_phi,ele4_dir
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,37259.390625,62516.367188,63357.0,-0.475465,-0.250186,1.405402,72777.457747,1.033327,20,1656.943547,...,18156.632812,72557.808854,1.068318,0.442773,12,4921.861039,30890.998047,65011.432313,0.991393,0.508955
1,41426.3125,57520.265625,25860.0,-0.213876,0.155208,2.541856,70885.26169,0.946639,8,2108.729626,...,25796.195312,68638.749399,0.9266,0.27336,48,3521.911935,27072.75,74425.274799,0.954485,0.119747
2,40752.921875,54653.265625,46548.0,0.044215,-0.333426,2.389629,68174.629334,0.930077,24,1831.317158,...,23250.978516,58783.896002,0.801217,0.572825,39,5908.756495,25796.195312,68638.749399,0.9266,0.557436
3,29293.980469,23582.425781,14223.0,-0.265671,-0.137444,2.402328,37606.756008,0.677798,6,4323.351308,...,23251.027344,46875.813489,0.483417,0.482759,1,11123.404153,13067.279297,46436.67404,0.514676,0.286588
4,49538.128906,63314.617188,31032.0,-0.063467,-0.550298,1.22346,80391.336381,0.906872,64,2218.556673,...,37248.894531,77353.65359,0.912641,0.683651,0,4452.876518,20703.253906,75853.900799,0.908343,0.509889


In [14]:
# Select the same feature columns, in the same order, that the model was trained on.
X_final_test = test_tracks[features]

In [15]:
# Keep column 1 of predict_proba for every test track.
# NOTE(review): presumably this is the probability of signal == 1 -- confirm via xgb.classes_.
prediction = xgb.predict_proba(X_final_test)[:, 1]

In [16]:
# Write the predictions as a gzip-compressed CSV with an 'Id' index column
# (0..n-1 row positions) and a single 'Prediction' column.
baseline = pd.DataFrame({'Prediction': prediction}).rename_axis('Id')
baseline.to_csv('results_v5.gz', header=True, compression='gzip')