<a href="https://colab.research.google.com/github/sidhu2690/E2E-CMS/blob/main/E2E_CMS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Importing all required libraries

In [None]:
pip install pyarrow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Downloading the data

In [None]:
!wget https://cernbox.cern.ch/remote.php/dav/public-files/ZUHveJKajnZNwTA/QCDToGGQQ_IMGjet_RH1all_jet0_run0_n215556.train.snappy.parquet

#### Converting parquet to image files

In [None]:
import time
import numpy as np
import cv2
import pyarrow.parquet as pq
import os

# Wall-clock start of the conversion cell; the elapsed time is computed and
# printed after runner() finishes at the bottom of this cell.
start_time = time.time()
def generate(pf, path, ab):
    """Stream a parquet file in record batches and convert each batch to images.

    Args:
        pf: Object exposing ``iter_batches(batch_size=...)`` (the caller passes
            a ``pyarrow.parquet.ParquetFile``).
        path: Output root directory, forwarded unchanged to ``transform``.
        ab: Two-element ``[alpha, beta]`` pair of running file counters; the
            value returned by each ``transform`` call is fed into the next.

    Returns:
        The counter pair returned by the last ``transform`` call, or ``ab``
        unchanged when the file yields no batches.
    """
    # A plain for-loop ends on iterator exhaustion only; the previous manual
    # next()/except StopIteration form also swallowed a StopIteration raised
    # from inside transform(), silently truncating the conversion.
    for batch in pf.iter_batches(batch_size=4 * 1024):
        ab = transform(batch, path, ab)
    return ab

def transform(batch, path, ab):
    """Turn one record batch into an image array plus labels and save them.

    The first column of the batch is read as the (nested) image data and the
    fourth column as the per-image value that ``saver`` uses to pick the
    output folder.

    Args:
        batch: Record batch exposing ``to_pandas()``.
        path: Output root directory, forwarded to ``saver``.
        ab: Running ``[alpha, beta]`` file counters, forwarded to ``saver``.

    Returns:
        Whatever ``saver`` returns (the updated counter pair).
    """
    frame = batch.to_pandas()

    # Column 0 comes back as nested Python/NumPy objects; repeating the
    # array -> list round-trip flattens the nesting into one dense ndarray.
    nested = frame.iloc[:, 0].tolist()
    for _ in range(2):
        nested = np.array(nested).tolist()
    images = np.array(nested)

    labels = np.array(frame.iloc[:, 3])
    return saver(images, labels, path, ab)

def saver(im, meta, path, ab):
    """Normalise a batch of images and write each one as a PNG in its label folder.

    Args:
        im: Float array indexed as ``im[image, channel, :, :]`` with at least
            three channels. It is modified in place.
        meta: 1-D array with one label per image. Label 0 is written to
            ``<path>/0`` and label 1 to ``<path>/1``; any other value is
            skipped.
        path: Output root directory; the ``0`` and ``1`` sub-folders are
            expected to exist already.
        ab: ``[alpha, beta]`` - the next file index for label 0 and label 1.

    Returns:
        ``[alpha, beta]`` advanced by the number of images written per label.
    """
    alpha, beta = ab

    # Empty batch: nothing to normalise (mean/std of an empty array is NaN).
    if meta.shape[0] == 0:
        return [alpha, beta]

    im[im < 1.e-3] = 0  # Zero suppression

    # Standardise each of the first three channels over the whole batch.
    # A constant channel has zero spread; only centre it instead of dividing
    # by zero and filling the batch with NaN.
    for c in range(3):
        plane = im[:, c, :, :]
        spread = plane.std()
        centred = plane - plane.mean()
        im[:, c, :, :] = centred / spread if spread > 0 else centred

    for i in range(meta.shape[0]):
        label = meta[i]
        if label == 0:
            folder, index = "0", alpha
        elif label == 1:
            folder, index = "1", beta
        else:
            # Unknown label: there is no target folder for it. Previously this
            # either crashed on an unbound path or overwrote the last image.
            continue

        img = im[i, :, :, :]  # view into ``im``; written back in place
        for c in range(3):
            # Drop negative values, cap at 500 standard deviations of the
            # unclipped channel, then rescale to the 0..255 range.
            channel = np.clip(img[c, :, :], 0, 500 * img[c, :, :].std())
            peak = channel.max()
            # An all-zero channel stays zero rather than becoming 0/0 = NaN.
            img[c, :, :] = 255 * channel / peak if peak > 0 else 0

        # Transpose reverses the axes so the channel axis comes last, which is
        # the layout cv2.imwrite expects.
        cv2.imwrite(
            os.path.join(path, folder, f"{index}.png"),
            img.astype(np.uint8).T,
        )

        if label == 0:
            alpha += 1
        else:
            beta += 1

    return [alpha, beta]

def runner(source, target):
    """Convert the parquet file at ``source`` into PNG files under ``target``.

    Creates the ``1`` and ``0`` class sub-folders if they are missing, streams
    the parquet file through ``generate`` starting both file counters at zero,
    and prints a confirmation message when done.

    Args:
        source: Path of the parquet file to read.
        target: Root directory that receives the class sub-folders.
    """
    counters = [0, 0]

    for class_dir in ("1", "0"):
        os.makedirs(os.path.join(target, class_dir), exist_ok=True)

    parquet = pq.ParquetFile(source)
    counters = generate(parquet, target, counters)

    print("The files were successfully generated")

parquet_file_name = 'QCDToGGQQ_IMGjet_RH1all_jet0_run0_n215556.train.snappy.parquet'
# The download cell saves the file into the current working directory, so
# resolve it from there instead of assuming /kaggle/working (this is the same
# location on Kaggle, and also works on Colab or a local run).
parquet_file_path = os.path.abspath(parquet_file_name)
# Kept as-is: the training cells read the generated images from this path.
output_directory = '/kaggle/working/output/'

runner(source=parquet_file_path, target=output_directory)
# Report the wall-clock time elapsed since start_time was taken at the top
# of this cell.
end_time = time.time()
Running_time = end_time - start_time
print(f"Time: {Running_time} seconds")

#### Simple CNN model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout

image_size = (128, 128)
num_classes = 1  # Binary classification

cnn_model_bn_with_dropout = Sequential()

# Convolutional stages: Conv(3x3, ReLU) -> BatchNorm -> MaxPool -> Dropout(0.25),
# with 32, 64, 128 and 256 filters. Only the first convolution declares the
# input shape (height, width, 3 channels).
for _stage, _filters in enumerate((32, 64, 128, 256)):
    if _stage == 0:
        _conv = Conv2D(_filters, (3, 3), activation='relu', input_shape=(*image_size, 3))
    else:
        _conv = Conv2D(_filters, (3, 3), activation='relu')
    cnn_model_bn_with_dropout.add(_conv)
    cnn_model_bn_with_dropout.add(BatchNormalization())
    cnn_model_bn_with_dropout.add(MaxPooling2D(2, 2))
    cnn_model_bn_with_dropout.add(Dropout(0.25))

cnn_model_bn_with_dropout.add(Flatten())

# Fully connected head: Dense(ReLU) -> BatchNorm -> Dropout(0.5), twice.
for _units in (512, 256):
    cnn_model_bn_with_dropout.add(Dense(_units, activation='relu'))
    cnn_model_bn_with_dropout.add(BatchNormalization())
    cnn_model_bn_with_dropout.add(Dropout(0.5))

# Single sigmoid output for the binary label.
cnn_model_bn_with_dropout.add(Dense(num_classes, activation='sigmoid'))

cnn_model_bn_with_dropout.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
#Callbacks
# Module-level lists that LossCallback appends to after every epoch.
train_loss = []
val_loss = []
class LossCallback(tf.keras.callbacks.Callback):
    """Keras callback that records the per-epoch training and validation loss."""

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's losses to ``train_loss`` / ``val_loss``.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; may be ``None``.
        """
        # ``logs`` defaults to None, and 'val_loss' is only present when the
        # model is fitted with validation data - record only what is there
        # instead of raising TypeError/KeyError in the middle of training.
        logs = logs or {}
        if 'loss' in logs:
            train_loss.append(logs['loss'])
        if 'val_loss' in logs:
            val_loss.append(logs['val_loss'])
loss_callback = LossCallback()

#### Train the model for 60 epochs

In [None]:
# Split the generated images into disjoint training and validation subsets.
# Both calls must use the same seed and validation_split so the two subsets
# do not overlap. Previously the full directory was used for both roles, so
# the "validation" loss was measured on the training images.
_dataset_options = dict(
    labels='inferred',
    label_mode='binary',
    image_size=image_size,
    batch_size=32,
    seed=42,
    validation_split=0.2,
)
train_ds = tf.keras.utils.image_dataset_from_directory(
    '/kaggle/working/output/',
    subset='training',
    **_dataset_options,
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    '/kaggle/working/output/',
    subset='validation',
    **_dataset_options,
)

# The model built above is named cnn_model_bn_with_dropout; the name
# cnn_model_bn used here before is not defined anywhere in the notebook.
history = cnn_model_bn_with_dropout.fit(
    x=train_ds,
    epochs=60,
    validation_data=val_ds,
    callbacks=[loss_callback]
)

#### Training curves

In [None]:
# Per-epoch losses recorded by Keras for the run stored in ``history``.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Draw both curves on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_loss, label='Training Loss', marker='o')
ax.plot(val_loss, label='Validation Loss', marker='o', color='orange')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Epochs')
ax.legend()

plt.show()

#### Model-2 (same architecture, trained with learning rate 1e-4)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout

image_size = (128, 128)
num_classes = 1  # Binary classification

# Four Conv -> BatchNorm -> MaxPool -> Dropout(0.25) stages followed by two
# Dense -> BatchNorm -> Dropout(0.5) stages and a single sigmoid output.
cnn_model_2 = Sequential([
    # Stage 1 (declares the input shape: height, width, 3 channels)
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    # Stage 2
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    # Stage 3
    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    # Stage 4
    Conv2D(256, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    # Classifier head
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='sigmoid'),
])

cnn_model_2.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

In [None]:
# Split the generated images into disjoint training and validation subsets.
# Both calls must use the same seed and validation_split so the two subsets
# do not overlap. Previously the full directory was used for both roles, so
# the "validation" loss was measured on the training images.
_dataset_options = dict(
    labels='inferred',
    label_mode='binary',
    image_size=image_size,
    batch_size=32,
    seed=42,
    validation_split=0.2,
)
train_ds = tf.keras.utils.image_dataset_from_directory(
    '/kaggle/working/output/',
    subset='training',
    **_dataset_options,
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    '/kaggle/working/output/',
    subset='validation',
    **_dataset_options,
)

# Train the Model-2 network defined in the previous cell; the name
# cnn_model_bn used here before is not defined anywhere in the notebook.
history = cnn_model_2.fit(
    x=train_ds,
    epochs=60,
    validation_data=val_ds,
    callbacks=[loss_callback]
)

In [None]:
# Per-epoch losses recorded by Keras for the run stored in ``history``.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.figure(figsize=(8, 5))

# (series, plot options) for each curve, drawn in this order.
_curves = (
    (train_loss, {'label': 'Training Loss', 'marker': 'o'}),
    (val_loss, {'label': 'Validation Loss', 'marker': 'o', 'color': 'orange'}),
)
for _values, _options in _curves:
    plt.plot(_values, **_options)

plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()