In [18]:
import numpy as np
import pandas as pd
import os
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import preprocessing
from tqdm import tqdm

In [19]:
# Directory holding the training events, and the single event that is loaded
# into the notebook-level tables below.
path_to_train = "train_1"
event_prefix = "event000001000"

# load_event returns four tables for the event; `truth` is reused further down
# when the clustering is scored.
hits, cells, particles, truth = load_event(
    os.path.join(path_to_train, event_prefix)
)


def get_training_sample(path_to_data, event_names):
    """Load several events and stack their labelled hits into one DataFrame.

    For every name in ``event_names`` the event is read with ``load_event``,
    the hit coordinates are joined to the truth table on ``hit_id``, and
    ``particle_id`` is replaced by an integer ``track_id`` produced by a
    ``LabelEncoder``. The name of each event is printed once it is processed.

    Parameters
    ----------
    path_to_data : str
        Directory that contains the event files.
    event_names : iterable of str
        Event prefixes, e.g. ``"event000001000"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``hit_id, x, y, z, track_id`` with a fresh ``0..n-1`` index.

    Notes
    -----
    A new encoder is fitted for every event, so ``track_id`` values start
    again from 0 in each event and are not unique across events.
    ``pd.concat`` raises ``ValueError`` when ``event_names`` is empty.
    """
    per_event_frames = []

    for name in event_names:
        # Only the hits and truth tables are needed from each event.
        hits, _cells, _particles, truth = load_event(os.path.join(path_to_data, name))

        labelled = hits[['hit_id', 'x', 'y', 'z']].merge(
            truth[['hit_id', 'particle_id']], on='hit_id'
        )

        # Map the particle ids of this event onto consecutive integers.
        encoder = preprocessing.LabelEncoder()
        labelled['track_id'] = encoder.fit_transform(labelled['particle_id'])

        per_event_frames.append(labelled.drop(columns=['particle_id']))
        print(name)

    return pd.concat(per_event_frames, axis=0).reset_index(drop=True)

# Build the training sample from `n_train_samples` consecutive events,
# starting at `start_event_id` (currently a single event).
start_event_id = 1000
n_train_samples = 1
train_event_names = [
    "event0000{:05d}".format(event_number)
    for event_number in range(start_event_id, start_event_id + n_train_samples)
]
train_data = get_training_sample(path_to_train, train_event_names)

train_data.head()

event000001000


Unnamed: 0,hit_id,x,y,z,track_id
0,1,-64.409897,-7.1637,-1502.5,0
1,2,-55.336102,0.635342,-1502.5,477
2,3,-83.830498,-1.14301,-1502.5,0
3,4,-96.1091,-8.24103,-1502.5,3556
4,5,-62.673599,-9.3712,-1502.5,4811


In [20]:
# Truncate the train data: add the 3-D distance of every hit from the origin
# as column `r`, then keep only the hits with r < 80.
train_data['r'] = np.linalg.norm(train_data[['x', 'y', 'z']], axis=1)
#np.percentile(train_data.r,5)
inside_cut = train_data['r'] < 80
train_data = train_data.loc[inside_cut]
train_data.shape

(5727, 6)

In [21]:
def create_one_event_submission(event_id, hits, labels):
    """Assemble the submission table for a single event.

    Parameters
    ----------
    event_id : int
        Identifier written into every row of the ``event_id`` column.
    hits : pandas.DataFrame
        Hits of the event; only its ``hit_id`` column and its length are used.
    labels : array-like
        One cluster label per row of ``hits``, in the same order.

    Returns
    -------
    pandas.DataFrame
        Integer-typed frame with columns ``event_id``, ``hit_id``, ``track_id``.
    """
    column_names = ["event_id", "hit_id", "track_id"]
    # Repeat the event id once per hit so the three columns have equal length.
    event_column = [event_id] * len(hits)
    table = np.column_stack((event_column, hits.hit_id.values, labels))
    return pd.DataFrame(data=table, columns=column_names).astype(int)

#Coordinate transformation
def cart2sphere(cart):
    """Turn Cartesian hit coordinates into four direction features.

    Parameters
    ----------
    cart : numpy.ndarray
        2-D array whose rows 0, 1, 2 hold x, y, z and whose columns are hits.

    Returns
    -------
    numpy.ndarray
        Array with four rows, one column per hit:
        ``x / r``, ``y / r``, ``z / r`` and ``z / rp``, where ``r`` is the
        norm of each column of ``cart`` and ``rp`` the norm of its first two
        rows (the transverse radius).

    Notes
    -----
    No guard against ``r == 0`` or ``rp == 0`` is applied; such columns
    produce non-finite values.
    """
    x, y, z = cart[0], cart[1], cart[2]
    radius = np.linalg.norm(cart, axis=0)
    radius_xy = np.linalg.norm(cart[:2, :], axis=0)

    features = [
        x / radius,     # sin(theta) * cos(phi)
        y / radius,     # sin(theta) * sin(phi)
        z / radius,     # cos(theta)
        z / radius_xy,  # cot(theta)
    ]
    return np.vstack(features)


#DBSCAN clustering approach
class DBSCANClusterer(object):
    """Cluster the hits of one event with DBSCAN on scaled direction features.

    Parameters
    ----------
    eps : float
        Value passed to ``DBSCAN`` as its ``eps`` argument.
    """

    def __init__(self, eps):
        self.eps = eps

    def _preprocess(self, hits):
        """Return the standardised feature matrix for ``hits``.

        The frame is modified in place: its index is reset and the columns
        ``x1``..``x4`` (the rows returned by ``cart2sphere``) are added.
        ``predict`` therefore passes a copy.
        """
        # A 0..n-1 index is needed so the Series assigned below align row by row.
        hits.reset_index(drop=True, inplace=True)

        coords = hits.loc[:, ['x', 'y', 'z']].values.transpose()
        sphere = cart2sphere(coords)

        feature_columns = ['x1', 'x2', 'x3', 'x4']
        for column, values in zip(feature_columns, sphere):
            hits[column] = pd.Series(values)

        scaler = StandardScaler()
        return scaler.fit_transform(hits[feature_columns].values)

    def predict(self, hits, z_shift=0):
        """Return one DBSCAN cluster label per row of ``hits``.

        Parameters
        ----------
        hits : pandas.DataFrame
            Must provide the columns ``x``, ``y`` and ``z``; it is not modified.
        z_shift : float, optional
            Offset added to ``z`` before the features are computed.
        """
        # Work on a copy so the shift and the feature columns stay local.
        shifted_hits = hits.copy()
        shifted_hits.z = shifted_hits.z + z_shift

        features = self._preprocess(shifted_hits)

        clusterer = DBSCAN(eps=self.eps, min_samples=1, algorithm='kd_tree')
        return clusterer.fit_predict(features)



In [22]:
# Step 1: scan the z-shift with eps fixed at 0.01 and remember the best one.
max_shift = 0
max_score = 0

for shift in tqdm(np.linspace(-5, 5, num=100)):
    model = DBSCANClusterer(eps=0.01)
    labels = model.predict(train_data, z_shift=shift)
    submission = create_one_event_submission(0, train_data, labels)

    #This is the score of the algorithm on the training set itself.
    # The inner merge keeps only the truth rows whose hit_id is in the submission.
    truth_truncated = truth.merge(submission, on='hit_id', how='inner')
    score = score_event(truth_truncated, submission)

    # Strict comparison: on a tie the earliest shift is kept.
    if score > max_score:
        max_score, max_shift = score, shift

print('The best shift is {}, with a score of {}.'.format(max_shift, max_score))

# Step 2: with the best shift fixed, scan eps and print the score of each value.
for eps in np.linspace(0.001, 0.05, num=10):
    model = DBSCANClusterer(eps=eps)
    labels = model.predict(train_data, z_shift=max_shift)
    submission = create_one_event_submission(0, train_data, labels)

    #This is the score of the algorithm on the training set itself
    truth_truncated = truth.merge(submission, on='hit_id', how='inner')
    score = score_event(truth_truncated, submission)
    print("Your score for eps = {:.4f}: {:.5f}".format(eps,score))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.28it/s]


The best shift is -0.050505050505050164, with a score of 0.5775446873238336.
Your score for eps = 0.0010: 0.37539
Your score for eps = 0.0064: 0.54203
Your score for eps = 0.0119: 0.59083
Your score for eps = 0.0173: 0.60477
Your score for eps = 0.0228: 0.59046
Your score for eps = 0.0282: 0.56876
Your score for eps = 0.0337: 0.52964
Your score for eps = 0.0391: 0.48825
Your score for eps = 0.0446: 0.43360
Your score for eps = 0.0500: 0.38528
