**Task:-** 

EF-2: Try to build a generative model of jet images, using the AUC of a discriminator trained to distinguish generated data from real data as the metric.

From https://github.com/makagan/SSI_Projects/blob/main/jet_notebooks/1.LHCJetDatasetExploration.ipynb

In [5]:
## Reading the files
import h5py
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
 # Data already downloaded 
# Open the HDF5 file explicitly read-only: the cells shown here only read
# from it, and spelling out the mode avoids depending on the default mode
# of whichever h5py version is installed.
fileIN = '../jet_notebooks/Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5'
f = h5py.File(fileIN, 'r')
# List the top-level datasets stored in the file
print(list(f.keys()))

['jetConstituentList', 'jetFeatureNames', 'jetImage', 'jetImageECAL', 'jetImageHCAL', 'jets', 'particleFeatureNames']


In [7]:
# Names of the per-jet quantities stored in the file
featurenames = f.get('jetFeatureNames')
# The names are byte strings, hence the b'' prefix in the printed output;
# they are decoded to text only where a label is needed.
print(featurenames[...])

[b'j_ptfrac' b'j_pt' b'j_eta' b'j_mass' b'j_tau1_b1' b'j_tau2_b1'
 b'j_tau3_b1' b'j_tau1_b2' b'j_tau2_b2' b'j_tau3_b2' b'j_tau32_b1'
 b'j_tau32_b2' b'j_zlogz' b'j_c1_b0' b'j_c1_b1' b'j_c1_b2' b'j_c2_b1'
 b'j_c2_b2' b'j_d2_b1' b'j_d2_b2' b'j_d2_a1_b1' b'j_d2_a1_b2' b'j_m2_b1'
 b'j_m2_b2' b'j_n2_b1' b'j_n2_b2' b'j_tau1_b1_mmdt' b'j_tau2_b1_mmdt'
 b'j_tau3_b1_mmdt' b'j_tau1_b2_mmdt' b'j_tau2_b2_mmdt' b'j_tau3_b2_mmdt'
 b'j_tau32_b1_mmdt' b'j_tau32_b2_mmdt' b'j_c1_b0_mmdt' b'j_c1_b1_mmdt'
 b'j_c1_b2_mmdt' b'j_c2_b1_mmdt' b'j_c2_b2_mmdt' b'j_d2_b1_mmdt'
 b'j_d2_b2_mmdt' b'j_d2_a1_b1_mmdt' b'j_d2_a1_b2_mmdt' b'j_m2_b1_mmdt'
 b'j_m2_b2_mmdt' b'j_n2_b1_mmdt' b'j_n2_b2_mmdt' b'j_mass_trim'
 b'j_mass_mmdt' b'j_mass_prun' b'j_mass_sdb2' b'j_mass_sdm1'
 b'j_multiplicity' b'j_g' b'j_q' b'j_w' b'j_z' b'j_t' b'j_undef']


In [8]:
# Show the dataset descriptor (name, shape, dtype) of the jet images
jet_image_ds = f.get('jetImage')
print(jet_image_ds)

<HDF5 dataset "jetImage": shape (10000, 100, 100), type "<f8">


In [9]:
# Pull the whole per-jet table into memory as a NumPy array
jet_data = np.array(f.get('jets'))
# Columns -6..-2 carry the one-hot class flags (j_g, j_q, j_w, j_z, j_t in
# the feature-name list); the very last column (j_undef) is left out.
target = jet_data[:, -6:-1]

print("Dataset shape:")
print(target.shape)

print("First five entries:")
for row_idx in range(5):
    print(target[row_idx])

print("Last 5 entries:")
for row_idx in range(-5, 0):
    print(target[row_idx])

Dataset shape:
(10000, 5)
First five entries:
[1. 0. 0. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 1. 0.]
Last 5 entries:
[0. 0. 1. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0.]
[1. 0. 0. 0. 0.]


In [10]:
# Everything before the final six (class-flag) columns is kept as input features
feature_columns = slice(None, -6)
data = np.array(jet_data[:, feature_columns])
print(data.shape)

(10000, 53)


In [11]:
labelCat = ["gluon", "quark", "W", "Z", "top"]


def makePlot(feature_index, input_data, input_featurenames):
    """Overlay one normalised histogram per jet class for a single feature.

    Parameters
    ----------
    feature_index : int
        Column of ``input_data`` (and entry of ``input_featurenames``) to plot.
    input_data : array indexed as [jet, feature]
        Rows must line up with the rows of the global ``target`` array,
        which supplies the class of each jet.
    input_featurenames : sequence of bytes
        Feature names; the selected one is decoded as UTF-8 for the x label.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots()
    # Class id of every jet = position of the largest entry in its label row
    jet_class = np.argmax(target, axis=1)
    for class_id, _ in enumerate(labelCat):
        # Boolean mask keeps only the jets belonging to this class
        selected = input_data[jet_class == class_id]
        ax.hist(selected[:, feature_index], 50, density=True, histtype='step', fill=False, linewidth=1.5)
    ax.set_yscale('log')
    ax.legend(labelCat, fontsize=12, frameon=False)
    ax.set_xlabel(str(input_featurenames[feature_index], "utf-8"), fontsize=15)
    ax.set_ylabel('Prob. Density (a.u.)', fontsize=15)
    plt.show()

# The particle-list dataset
In this case, we look at the particle-related features stored for each jet constituent. The structure of the dataset is similar to that of the physics-motivated features, except that we now have a doubly-indexed dataset: (jet index, particle index). The list is cut at 100 constituents per jet. If fewer are found, the remaining entries are filled with 0s (zero padding).

In [12]:
# Names of the per-particle (jet constituent) features
p_featurenames = f.get("particleFeatureNames")
print(p_featurenames[...])

[b'j1_px' b'j1_py' b'j1_pz' b'j1_e' b'j1_erel' b'j1_pt' b'j1_ptrel'
 b'j1_eta' b'j1_etarel' b'j1_etarot' b'j1_phi' b'j1_phirel' b'j1_phirot'
 b'j1_deltaR' b'j1_costheta' b'j1_costhetarel' b'j1_pdgid']


In [13]:
# Keep a handle to the per-constituent dataset and report its dimensions
p_data = f.get("jetConstituentList")
constituent_shape = p_data.shape
print(constituent_shape)

(10000, 100, 16)


In [14]:
labelCat = ["gluon", "quark", "W", "Z", "top"]


def makePlot_p(feature_index, input_data, input_featurenames):
    """Overlay one normalised histogram per jet class for a particle feature.

    Parameters
    ----------
    feature_index : int
        Index of the particle feature along the last axis of ``input_data``
        (and of its name in ``input_featurenames``).
    input_data : array or dataset indexed as [jet, particle, feature]
        The first axis must line up with the rows of the global ``target``
        array, which supplies the class of each jet.
    input_featurenames : sequence of bytes
        Feature names; the selected one is decoded as UTF-8 for the x label.

    All particle slots of the selected jets enter the histogram, so any
    zero-padded slots are counted as well. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    plt.subplots()
    # Read the requested feature once for all jets: result is [jet, particle].
    feature_values = np.asarray(input_data[:, :, feature_index])
    # Class id of every jet = position of the largest entry in its label row
    jet_class = np.argmax(target, axis=1)
    for i in range(len(labelCat)):
        # Boolean mask keeps only the jets belonging to class i
        class_values = feature_values[jet_class == i]
        # The feature axis is already selected above, so flatten over all
        # particles; indexing [:, feature_index] again would pick a single
        # particle position instead of the feature.
        plt.hist(class_values.flatten(), 50, density=True, histtype='step', fill=False, linewidth=1.5)
    plt.yscale('log')
    plt.legend(labelCat, fontsize=12, frameon=False)
    plt.xlabel(str(input_featurenames[feature_index], "utf-8"), fontsize=15)
    plt.ylabel('Prob. Density (a.u.)', fontsize=15)
    plt.show()

## Generative model of jet images with AE

In [15]:
import keras
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

# Importing keras and related modules
import tensorflow as tf
from keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from keras.layers import Input,Dense,Flatten,Dropout,Reshape,Conv2D,MaxPooling2D,UpSampling2D,Conv2DTranspose
from tensorflow.keras.layers import BatchNormalization
from keras.models import Model,Sequential
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adadelta, RMSprop,SGD,Adam
from keras import regularizers
from keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, Lambda
from tensorflow.keras.losses import mse
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Concatenate



# Data Preprocessing



Using the jet image dataset, you can adapt the Graph GAN architecture to generate graph-structured data. Since the jet image dataset isn't inherently graph-structured, we can represent each jet image as a graph using a simplified approach. Here's how you could do it:

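1. **Convert Jet Images to Graphs:** Each jet image can be represented as a graph whose nodes are the pixels. In the simplified approach used below, the graph is fully connected rather than restricted to neighbouring pixels: the adjacency matrix of an image is the outer product of the flattened image with itself, so the weight of the edge between pixels i and j is the product of their intensities.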

2. **Graph GAN Architecture:** The architecture remains similar, but instead of nodes and features, you're dealing with pixels and intensities. The generator takes random noise as input and generates adjacency matrices for the graphs. The discriminator evaluates whether a given adjacency matrix represents a real or generated image.

In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LeakyReLU, Flatten, Reshape
from tensorflow.keras.models import Model

In [17]:
# Prepare the data
# Load every jet image as float32 and rescale the pixel values into [0, 1].
# The images are stored as floating-point numbers (dtype float64 in the file),
# not as 8-bit intensity levels, so dividing by 255 would not bound them by 1.
# Divide by the largest pixel value in the dataset instead.
# NOTE(review): assumes pixel values are non-negative — confirm.
jet_images = np.array(f.get('jetImage')).astype('float32')
max_pixel = jet_images.max() if jet_images.size else 0.0
if max_pixel > 0:  # leave an empty or all-zero dataset untouched
    jet_images = jet_images / max_pixel
num_images, image_size, _ = jet_images.shape


In [None]:
# Turn each jet image into a dense "adjacency matrix": the outer product of
# the flattened image with itself, so entry (i, j) is pixel_i * pixel_j.
# NOTE(review): for the (10000, 100, 100) dataset shown in this notebook each
# matrix is 10000 x 10000, i.e. 10**8 float32 values per image and about
# 4 TB for all images — this cell is unlikely to fit in memory as written.
adj_matrices = np.array([
    np.outer(flat_image, flat_image)
    for flat_image in (jet_images[i].reshape(-1) for i in range(num_images))
])


In [17]:
# Generator architecture
def build_generator(latent_dim, num_nodes):
    """Build the generator network.

    Maps a latent vector of length ``latent_dim`` through one 128-unit ReLU
    layer to a flat vector of ``num_nodes * num_nodes`` sigmoid outputs
    (one value per adjacency-matrix entry, each in (0, 1)).

    Returns the uncompiled Keras ``Model``.
    """
    noise = Input(shape=(latent_dim,))
    hidden = Dense(128, activation='relu')(noise)
    # One output unit per entry of the num_nodes x num_nodes adjacency matrix
    flat_adjacency = Dense(num_nodes * num_nodes, activation='sigmoid')(hidden)
    return Model(noise, flat_adjacency)

# Discriminator architecture
def build_discriminator(num_nodes):
    """Build the discriminator network.

    Takes a flattened adjacency matrix of length ``num_nodes * num_nodes``
    and passes it through two dense layers (128 and 64 units, each followed
    by LeakyReLU with slope 0.2) to a single sigmoid output.

    Returns the uncompiled Keras ``Model``.
    """
    flat_adjacency = Input(shape=(num_nodes * num_nodes,))
    hidden = flat_adjacency
    for units in (128, 64):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(0.2)(hidden)
    score = Dense(1, activation='sigmoid')(hidden)
    return Model(flat_adjacency, score)


Shape of the dataset: (10000, 100, 100)
image_size is, 100


In [18]:

# Instantiate the generator and the discriminator
latent_dim = 32
# One graph node per pixel of an image_size x image_size image.
# NOTE(review): with image_size = 100 this gives num_nodes = 10_000, so the
# generator output and discriminator input are num_nodes**2 = 10**8 wide —
# likely too large to build or train; confirm the intended graph size.
num_nodes = image_size * image_size
generator = build_generator(latent_dim, num_nodes)
discriminator = build_discriminator(num_nodes)

# Both networks use Adam with a binary cross-entropy loss
for network in (generator, discriminator):
    network.compile(optimizer='adam', loss='binary_crossentropy')

# Combined model: latent vector -> generator -> discriminator score.
# The discriminator is marked non-trainable before the combined model is
# compiled, with the intent that training `gan` only updates the generator.
discriminator.trainable = False
gan_input = Input(shape=(latent_dim,))
generated_adj_matrices = generator(gan_input)
gan_output = discriminator(generated_adj_matrices)
gan = Model(inputs=gan_input, outputs=gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Training loop: alternate one discriminator update (real batch + generated
# batch) with one generator update through the combined `gan` model.
num_epochs = 100
batch_size = 32
# Always run at least one step per epoch so the per-epoch report below has
# losses to print even when the dataset is smaller than one batch.
steps_per_epoch = max(1, num_images // batch_size)
flat_dim = num_nodes * num_nodes
# The label arrays are the same on every step, so build them once.
real_labels = np.ones((batch_size, 1))
fake_labels = np.zeros((batch_size, 1))

for epoch in range(num_epochs):
    for _ in range(steps_per_epoch):
        # Sample a random batch of real matrices (with replacement)
        idx = np.random.randint(0, num_images, batch_size)
        real_adj_matrices = adj_matrices[idx]
        latent_space_samples = np.random.normal(0, 1, (batch_size, latent_dim))
        # predict_on_batch does a single forward pass on this one batch;
        # calling predict() here would print a progress bar on every step.
        generated_adj_matrices = generator.predict_on_batch(latent_space_samples)

        # Discriminator update: real samples labelled 1, generated samples 0
        d_loss_real = discriminator.train_on_batch(real_adj_matrices.reshape((batch_size, flat_dim)), real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_adj_matrices, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Generator update: fresh noise, labelled 1 so the generator is
        # pushed towards outputs the discriminator scores as real
        latent_space_samples = np.random.normal(0, 1, (batch_size, latent_dim))
        g_loss = gan.train_on_batch(latent_space_samples, real_labels)

    # Report the losses of the last step of the epoch
    print(f"Epoch {epoch+1}, D Loss: {float(d_loss):.4f}, G Loss: {float(g_loss):.4f}")