In [2]:
import os

import numpy as np

def binary_append(data, isSig):
    """
    Tag every row of an N x m predictor array with its class label.

    One extra column is appended on the right-hand side: 1.0 when `isSig`
    is truthy (signal) and 0.0 otherwise (background). A new N x (m+1)
    array is returned, ready to be used for training.
    """
    label_value = 1.0 if isSig else 0.0
    # One label per row, shaped (N, 1) so it can be joined along axis 1.
    label_column = np.full((len(data), 1), label_value)
    return np.concatenate((data, label_column), axis=1)

In [3]:
from sklearn.model_selection import train_test_split

class DataAccess:
    """
    Converts, loads and combines the signal and background datasets.

    Each dataset listed in `names` is expected as `<datpath>/<name>.dat`
    (whitespace-separated text) and is cached as `<npypath>/<name>.npy`.
    The last three entries of `names` are the ones `getbg` treats as
    background; `cols` holds one label per feature column.
    """

    def __init__(self, datpath='../data/dat/data50/', npypath='../data/npy/data50/'):
        """
        Parameters
        ----------
        datpath : str
            Directory holding the source `.dat` text files.
        npypath : str
            Directory the `.npy` files are written to and read from.

        Both default to the locations that were previously hard-coded.
        """
        self.datpath = datpath
        self.npypath = npypath
        self.names = ['sig350G', 'sig500G', 'sig1T', 'sig2T', 'sig4T', 'bgh', 'bg4t', 'bgnoh']
        self.cols = [
            'pT b1', 'pT b2', 'pT b3', 'pT b4',
            'sdEta b1 b2', 'sdEta b1 b3', 'sdEta b1 b4', 'sdEta b2 b3', 'sdEta b2 b4', 'sdEta b3 b4',
            'sdPhi b1 b2', 'sdPhi b1 b3', 'sdPhi b1 b4', 'sdPhi b2 b3', 'sdPhi b2 b4', 'sdPhi b3 b4',
            'dR b1 b2', 'dR b1 b3', 'dR b1 b4', 'dR b2 b3', 'dR b2 b4', 'dR b3 b4',
            'MET', 'pT l', 'MT l MET', 
            'M b1 b2', 'M b1 b3', 'M b1 b4', 'M b2 b3', 'M b2 b4', 'M b3 b4',
            'MT b1 l MET', 'MT b2 l MET', 'MT b3 l MET', 'MT b4 l MET',
            'M j1 j2', 'pT j1', 'pT j2', 'dR j1 j2', 
            'dR b1 l', 'dR b2 l', 'dR b3 l', 'dR b4 l',
            'sdPhi b1 l', 'sdPhi b2 l', 'sdPhi b3 l', 'sdPhi b4 l']

    def _npyfile(self, name):
        """Return the path of the `.npy` file for dataset `name`."""
        # os.path.join tolerates a directory given with or without a
        # trailing separator, unlike plain string concatenation.
        return os.path.join(self.npypath, name + '.npy')

    def dat2npy(self):
        """
        Convert every dataset in `names` from `.dat` text to `.npy`.

        Each file is parsed with `np.loadtxt`, reshaped to one row per
        `len(self.cols)` values, and saved under `npypath`. Progress is
        printed for each dataset.

        Raises
        ------
        ValueError
            From the reshape, if a file's value count is not a multiple of
            the number of columns.
        """
        # Row width follows the column list rather than a separate constant.
        n_cols = len(self.cols)
        for name in self.names:
            print('beginning read of', name + '.npy')
            data = np.loadtxt(os.path.join(self.datpath, name + '.dat'))
            print('finished read, now saving')
            data = np.reshape(data, (-1, n_cols))
            np.save(self._npyfile(name), data)
            print('finished saving')
            print()

    def getnpy(self, index):
        """Load and return the `.npy` array of the dataset at `names[index]`."""
        return np.load(self._npyfile(self.names[index]))

    def getbg(self, amt=1000000):
        """
        Return background rows drawn equally from the background datasets.

        The first `int(amt / k)` rows of each of the `k` background files
        (the last three entries of `names`) are stacked in the order they
        appear in `names`. The result can therefore have slightly fewer
        than `amt` rows when `amt` is not a multiple of `k`, or when a file
        holds fewer rows than its share.
        """
        bg_names = self.names[-3:]
        # Divisor is taken from the slice so the two cannot drift apart.
        per_source = int(amt / len(bg_names))
        return np.concatenate([np.load(self._npyfile(name))[:per_source] for name in bg_names])

    def gettraintest(self, index, test_size=0.25, random_state=0):
        """
        Build a labelled signal-plus-background set and split it.

        The dataset at `names[index]` is labelled 1 and the output of
        `getbg()` is labelled 0 (the label is the last column, added by
        `binary_append`). The stacked array is passed to
        `train_test_split` with the given `test_size` and `random_state`,
        and that call's result (the train and test arrays) is returned.
        """
        labelled = np.concatenate([
            binary_append(self.getnpy(index), True),
            binary_append(self.getbg(), False)])
        return train_test_split(
            labelled,
            test_size=test_size,
            random_state=random_state)

In [4]:
# Shared accessor instance used by the cells below.
DA = DataAccess()

In [5]:
# Sanity check: count the feature column names declared on DataAccess.
print(len(DA.cols))

47


In [60]:
# Convert each dataset named in DA.names from text (.dat) to NumPy binary (.npy).
DA.dat2npy()

beginning read of sig350G.npy
finished read, now saving
finished saving

beginning read of sig500G.npy
finished read, now saving
finished saving

beginning read of sig1T.npy
finished read, now saving
finished saving

beginning read of sig2T.npy
finished read, now saving
finished saving

beginning read of sig4T.npy
finished read, now saving
finished saving

beginning read of bgh.npy
finished read, now saving
finished saving

beginning read of bg4t.npy
finished read, now saving
finished saving

beginning read of bgnoh.npy
finished read, now saving
finished saving

