### Version:
2021/02/09
- choose 4 features: pt, mass, deltaeta, deltaphi

# Get 4 features from a jet

This code selects four features (pt, mass, deltaeta, deltaphi) from each jet's data.

In [None]:
import os, gzip
import numpy as np

def read_data(filename):
    """Read every ``<jet_data>`` paragraph from a gzip-compressed text file.

    The file is looked up as ``jet_data/<filename>`` next to this source
    file. When ``__file__`` is not defined (e.g. in a notebook or an
    interactive session) the current working directory is used instead.

    Args:
        filename: Name of the ``.txt.gz`` file inside the ``jet_data``
            directory.

    Returns:
        A list with one entry per paragraph, each entry being the list of
        decoded lines from the ``<jet_data>`` line through the
        ``</jet_data>`` line (line endings kept). The list always ends with
        the string ``"NULL"``, which marks the point where no further
        ``<jet_data>`` opening line was found; callers are expected to
        drop it.

    Raises:
        ValueError: If the file ends inside a paragraph, i.e. before the
            closing ``</jet_data>`` line was seen.
    """
    # `__file__` does not exist in interactive sessions / notebooks
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        base_dir = os.getcwd()
    path = os.path.join(base_dir, "jet_data/" + filename)

    data = []
    # the context manager closes the file even when reading fails midway
    with gzip.open(path) as fin:
        while True:
            # a paragraph must start with the opening tag
            line = fin.readline().decode()
            if '<jet_data>' not in line:
                # end-of-data marker, kept for backward compatibility
                data.append("NULL")
                break

            paragraph = [line]
            while True:
                line = fin.readline().decode()
                if not line:
                    # readline() returns an empty result only at EOF; without
                    # this check a truncated file would loop forever
                    raise ValueError(
                        "unexpected end of file inside a <jet_data> "
                        "paragraph: " + path
                    )
                paragraph.append(line)
                if '</jet_data>' in line:
                    break
            data.append(paragraph)

    return data

def select_data(input_data):
    """Extract pt, mass, deltaeta and deltaphi from each paragraph.

    Args:
        input_data: Iterable of paragraphs, each indexable by line number,
            where line 1 is a whitespace-separated record of jet values.

    Returns:
        A list with one ``[pt, mass, deltaeta, deltaphi]`` list of floats
        per input paragraph, in input order.
    """
    # which line of a paragraph holds which kind of record
    line_of = {"jet_kinematics":1}
    # column position of every value on the kinematics line
    column_of = {
        "index":0, # int
        "pt":1, # float
        "eta":2, # float
        "phi":3, # float
        "mass":4, # float
        "deltaeta":5, # float
        "deltaphi":6, # float
        "charge":7, # int
        "ehadovereem":8, # float
        "ncharged":9, # int
        "nneutrals":10 # int
        }

    # resolve the lookups once instead of once per paragraph
    kinematics_line = line_of["jet_kinematics"]
    columns = tuple(
        column_of[key] for key in ("pt", "mass", "deltaeta", "deltaphi")
    )

    def _extract(paragraph):
        # split the kinematics record and convert the wanted columns
        tokens = paragraph[kinematics_line].split()
        return [float(tokens[col]) for col in columns]

    return [_extract(paragraph) for paragraph in input_data]

## Save to npy

In [None]:
# input files to convert (SRC_SIG / SRC_BKG)
SRC_SIG = dict(filename='fatjet_w_match_vz_to_ww.txt.gz')
SRC_BKG = dict(filename='fatjet_q_match_vz_to_qq.txt.gz')

# load every paragraph of both files
print("start reading data ...")
w_data = read_data(SRC_SIG['filename'])
q_data = read_data(SRC_BKG['filename'])
# read_data ends its result with one "NULL" marker (original author's note:
# 100,001 entries including that marker) -- drop it before selecting
w_data, q_data = w_data[:-1], q_data[:-1]

# keep only the four chosen features, as float64 arrays
print("data selecting ...")
w_data = np.asarray(select_data(w_data), dtype=np.float64)
q_data = np.asarray(select_data(q_data), dtype=np.float64)

# write both arrays under <current working directory>/jet_data
print("data saving ...")
np.save(file=os.path.join(os.getcwd(), "jet_data/4_var_w.npy"), arr=w_data)
np.save(file=os.path.join(os.getcwd(), "jet_data/4_var_q.npy"), arr=q_data)
print("successfully saved ...")