# ATLAS Basic Classifier for RPV vs. QCD

Let's put together a simple (shallow) classifier for the RPV signal.

We can try a BDT from sk-learn.

In [69]:
# System imports
from __future__ import print_function
import os

# External imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Magic
%matplotlib notebook

## Prepare the data

Let's specify which data files we'll use and prepare some helper code for reading that data.

In [3]:
# Cori scratch space
#input_dir = os.path.join(os.getenv('SCRATCH'), 'atlasdl', 'prod004')

# Project space (for PDSF)
input_dir = os.path.join(os.getenv('PROJECT'), 'atlas_dl', 'numpy', 'prod004')

In [4]:
ls $input_dir

qcd_JZ10.npz  qcd_JZ3.npz  qcd_JZ6.npz  qcd_JZ9.npz
qcd_JZ11.npz  qcd_JZ4.npz  qcd_JZ7.npz  rpv_1400_850.npz
qcd_JZ12.npz  qcd_JZ5.npz  qcd_JZ8.npz


In [5]:
#samples = ['qcd_JZ3', 'qcd_JZ4', 'qcd_JZ5', 'qcd_JZ6', 'qcd_JZ7',
#           'qcd_JZ8', 'qcd_JZ9', 'qcd_JZ10', 'qcd_JZ11', 'qcd_JZ12',
#           'rpv_1400_850']
samples = ['qcd_JZ3', 'qcd_JZ4',]# 'rpv_1400_850',] 
sample_files = map(lambda s: os.path.join(input_dir, s + '.npz'), samples)
sample_files

['/project/projectdirs/atlas/sfarrell/atlas_dl/numpy/prod004/qcd_JZ3.npz',
 '/project/projectdirs/atlas/sfarrell/atlas_dl/numpy/prod004/qcd_JZ4.npz',
 '/project/projectdirs/atlas/sfarrell/atlas_dl/numpy/prod004/rpv_1400_850.npz']

In [6]:
def get_file_keys(file_name):
    """Retrieves the list of keys from an npz file"""
    with np.load(file_name) as f:
        keys = f.keys()
    return keys

def retrieve_data(file_name, *keys):
    """
    A helper function for retrieving some specified arrays from one npz file.
    Returns a list of arrays corresponding to the requested key name list.
    """
    with np.load(file_name) as f:
        try:
            data = [f[key] for key in keys]
        except KeyError as err:
            print('Requested key not found. Available keys:', f.keys())
            raise
    return data

### Features

Which features do we want to use as input to the classifier?
- jetxPt, jetxEta, jetxPhi, jetxM with x = [1, 2, 3, 4]
  * or should I use the leading 5 jets?
  * this might be a good enough start

Seems like my jets are already sorted by decreasing pt, so that's useful for extracting the features.

In [12]:
def parse_object_features(array, num_objects, default_val=0.):
    """
    Takes an array of object arrays and returns a fixed rank-2 array.
    Clips and pads each element as necessary.
    Output shape is (array.shape[0], num_objects).
    """
    # Create the output first
    length = array.shape[0]
    output_array = np.full((length, num_objects), default_val)
    # Fill the output
    for i in xrange(length):
        k = min(num_objects, array[i].size)
        output_array[i,:k] = array[i][:k]
    return output_array

def prepare_sample_features(sample_file, num_jets=3, max_events=None):
    data = retrieve_data(
        sample_file, 'fatJetPt', 'fatJetEta', 'fatJetPhi', 'fatJetM')
    num_events = data[0].shape[0]
    if max_events is not None and max_events < num_events:
        data = [d[:max_events] for d in data]
    return np.hstack(parse_object_features(a, num_jets) for a in data)

In [None]:
ev_per_sample = 1000
sample_features = [prepare_sample_features(f, max_events=ev_per_sample)
                   for f in sample_files]
sample_labels = [s == 'rpv_1400_850' for s in samples]
sample_events = [sf.shape[0] for sf in sample_features]

In [33]:
X = np.concatenate(sample_features)
sy = [z*np.ones(nevt) for (z, nevt) in zip(sample_labels, sample_events)]
y = np.concatenate(sy)

In [34]:
X.shape, y.shape

((3000, 12), (3000,))

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [72]:
clf = make_pipeline(StandardScaler(), LogisticRegression())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

Train set accuracy: 0.942222222222
Test set accuracy: 0.941333333333


In [73]:
clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

Train set accuracy: 0.960444444444
Test set accuracy: 0.953333333333


In [75]:
from sklearn.tree import DecisionTreeClassifier
clf = make_pipeline(StandardScaler(), DecisionTreeClassifier())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

Train set accuracy: 1.0
Test set accuracy: 0.917333333333


In [74]:
from sklearn.ensemble import RandomForestClassifier
clf = make_pipeline(StandardScaler(), RandomForestClassifier())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

Train set accuracy: 0.994666666667
Test set accuracy: 0.946666666667


In [76]:
from sklearn.ensemble import GradientBoostingClassifier
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

Train set accuracy: 0.981777777778
Test set accuracy: 0.945333333333


In [77]:
from sklearn.neural_network import MLPClassifier
clf = make_pipeline(StandardScaler(), MLPClassifier())
clf.fit(X_train, y_train)
print('Train set accuracy:', clf.score(X_train, y_train))
print('Test set accuracy:', clf.score(X_test, y_test))

ImportError: cannot import name MLPClassifier