# Importing required libraries

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Loading and Labeling

In [14]:
# Load dataset
df = pd.read_csv(r"C:\Users\vudut\OneDrive\Desktop\Python\MINI Project\preprocessed_data.csv")

# Fill missing values in jet_pt (mean imputation)
imputer = SimpleImputer(strategy='mean')
df['jet_pt'] = imputer.fit_transform(df[['jet_pt']])

# Define features and target
X = df.drop(columns=['lep_type'])  # Features
y = df['lep_type']  # Target (can be changed as needed)

# Label encode target if it's categorical
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


# Defining basic neural network

In [15]:
# Neural Network
nn = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(np.unique(y_train)), activation='softmax')
])
nn.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, epochs=10, batch_size=512, validation_split=0.2, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b0ad0ba470>

# One hot encoding the labels and normalizing in order to fasten the training

In [None]:
# One-hot encode labels (instead of sparse categorical loss)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Test accuracy: 0.630399763584137


# Defining the next iteration of model

In [None]:
# Model architecture
nn = Sequential([
    Dense(256, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(np.unique(y_train)), activation='softmax')
])

nn.compile(optimizer=Adam(learning_rate=0.0005), 
           loss='categorical_crossentropy', 
           metrics=['accuracy'])

# Add early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fitting and evaluating the model

In [None]:
# Fit the model
nn.fit(X_train_scaled, y_train_cat, 
       epochs=30, 
       batch_size=1024, 
       validation_split=0.2, 
       callbacks=[early_stop], 
       verbose=1)

# Evaluate
loss, acc = nn.evaluate(X_test_scaled, y_test_cat)
print("Test accuracy:", acc)


# Saving the initial model

In [4]:
nn.save("structured.h5")

# Again repeating the same steps from labeling

In [None]:
# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert labels to one-hot
y_encoded = to_categorical(y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Test accuracy: 0.6228587627410889
Model saved as 'improved_model.h5'


# Defining even more layers in the model

In [None]:
# ----- Define your model -----
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')  # output units = number of classes
])

# ----- Compile your model -----
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# ----- Callbacks -----
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# Training evaluating and saving this second iteration model

In [None]:
# ----- Train your model -----
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=[early_stop, reduce_lr]
)

# ----- Evaluate on test data -----
loss, accuracy = model.evaluate(X_test, y_test)
print("Test accuracy:", accuracy)

# ----- Save the model -----
model.save("improved_model.h5")
print("Model saved as 'improved_model.h5'")

# Anomaly Detection Using isolation Forest

In [None]:
# Isolation Forest
isoforest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
iso_preds = isoforest.fit_predict(X)

# Convert to 0 (normal) and 1 (anomaly)
iso_anomalies = np.where(iso_preds == -1, 1, 0)
print(f"\n⚠️ Anomaly Detection: Found {np.sum(iso_anomalies)} anomalies out of {len(iso_anomalies)} samples.")



⚠️ Anomaly Detection: Found 149457 anomalies out of 14945674 samples.


## Making test data and evaluating the model

In [22]:
from sklearn.preprocessing import StandardScaler
from joblib import dump

# Initialize and fit the scaler only on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

# Save the fitted scaler for later use
dump(scaler, 'scaler.joblib')
print("Scaler saved as 'scaler.joblib'")


Scaler saved as 'scaler.joblib'


In [None]:
# Make sure you are selecting the target column correctly
y_train = y_train['lep_type'] 
y_test = y_test['lep_type']    

# Now apply LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Transform the test data labels using the fitted encoder
y_test_encoded = label_encoder.transform(y_test)

# Save the label encoder for later use
dump(label_encoder, 'label_encoder.joblib')
print("Label Encoder saved as 'label_encoder.joblib'")


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [4]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model("structured.h5")

In [5]:
print(df.columns)

Index(['lep_pt', 'lep_eta', 'lep_phi', 'lep_E', 'lep_charge', 'lep_type',
       'jet_n', 'jet_pt', 'met_et', 'met_phi'],
      dtype='object')


In [7]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from joblib import load

# Load the data
df = pd.read_csv(r"C:\Users\vudut\OneDrive\Desktop\Python\MINI Project\preprocessed_data.csv")

# Separate features and labels
X = df.drop("lep_type", axis=1)  # replace "label" with your actual label column name
y = df["lep_type"]

# Load the scaler and model
scaler = load("scaler.joblib")
model = load_model("structured.h5")

# Scale the features
X_scaled = scaler.transform(X)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Select 5 random test samples
sample_indices = np.random.choice(len(X_scaled), size=5, replace=False)
X_test_samples = X_scaled[sample_indices]
y_test_samples = y.iloc[sample_indices].values


In [10]:
from joblib import load
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np

# Load everything
df = pd.read_csv(r"C:\Users\vudut\OneDrive\Desktop\Python\MINI Project\preprocessed_data.csv")
X = df.drop("lep_type", axis=1)
y = df["lep_type"]

# Load the scaler, model, and label encoder
scaler = load("scaler.joblib")
model = load_model("structured.h5")
label_encoder = load("label_encoder.joblib")

# Scale features
X_scaled = scaler.transform(X.values)

# Select random test samples
sample_indices = np.random.choice(len(X_scaled), size=5, replace=False)
X_test_samples = X_scaled[sample_indices]
y_test_samples = y.iloc[sample_indices].values  # original labels for comparison

# Predict
predictions = model.predict(X_test_samples)
predicted_classes_encoded = np.argmax(predictions, axis=1)

# Decode predicted class labels
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

# Show predictions vs actual
for i in range(5):
    print(f"Sample {i+1}:")
    print("Predicted Class:", predicted_classes[i])
    print("Actual Class:   ", y_test_samples[i])
    print("---")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Sample 1:
Predicted Class: 13.0
Actual Class:    13.0
---
Sample 2:
Predicted Class: 13.0
Actual Class:    11.0
---
Sample 3:
Predicted Class: 11.0
Actual Class:    13.0
---
Sample 4:
Predicted Class: 11.0
Actual Class:    11.0
---
Sample 5:
Predicted Class: 13.0
Actual Class:    13.0
---


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from joblib import dump, load
from tensorflow.keras.models import load_model

##############################################
# Part 1: Preprocessing & Saving Scaler/Encoder
##############################################

# Load your dataset
df = pd.read_csv(r"C:\Users\vudut\OneDrive\Desktop\Python\MINI Project\preprocessed_data.csv")

# Fill missing values in 'jet_pt' using mean imputation
imputer = SimpleImputer(strategy='mean')
df['jet_pt'] = imputer.fit_transform(df[['jet_pt']])

# Separate features and target
# Here, 'lep_type' is the target column.
X = df.drop("lep_type", axis=1)
y = df["lep_type"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scaling ---
# Fit the StandardScaler only on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
dump(scaler, 'scaler.joblib')
print("Scaler saved as 'scaler.joblib'")

# --- Label Encoding ---
# Fit the LabelEncoder on the training target (ensure y_train is a 1D array)
label_encoder = LabelEncoder()
# If y_train is not already a Series, make sure to extract the column
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Save the LabelEncoder for later use
dump(label_encoder, 'label_encoder.joblib')
print("Label Encoder saved as 'label_encoder.joblib'")

##############################################
# Part 2: Inference / Prediction
##############################################

# For inference we will use the entire CSV (you could also use your test set)
df_inference = pd.read_csv(r"C:\Users\vudut\OneDrive\Desktop\Python\MINI Project\preprocessed_data.csv")
X_inference = df_inference.drop("lep_type", axis=1)
y_inference = df_inference["lep_type"]

# Load the saved scaler, trained model, and label encoder
scaler = load("scaler.joblib")
model = load_model("structured.h5")
label_encoder = load("label_encoder.joblib")

# Scale the features using the loaded scaler (convert DataFrame to NumPy array)
X_scaled = scaler.transform(X_inference.values)

# Select 5 random samples from the data for testing predictions
sample_indices = np.random.choice(len(X_scaled), size=5, replace=False)
X_test_samples = X_scaled[sample_indices]
y_test_samples = y_inference.iloc[sample_indices].values

# Get predictions from the model
predictions = model.predict(X_test_samples)

# Convert predictions to the class indices
predicted_classes_encoded = np.argmax(predictions, axis=1)

# Use the loaded LabelEncoder to decode class indices back to the original labels
predicted_classes = label_encoder.inverse_transform(predicted_classes_encoded)

# Display predicted vs. actual for each sample
for i in range(5):
    print(f"Sample {i+1}:")
    print("Predicted Class:", predicted_classes[i])
    print("Actual Class:   ", y_test_samples[i])
    print("---")


Scaler saved as 'scaler.joblib'
Label Encoder saved as 'label_encoder.joblib'




Sample 1:
Predicted Class: 11.0
Actual Class:    11.0
---
Sample 2:
Predicted Class: 13.0
Actual Class:    13.0
---
Sample 3:
Predicted Class: 13.0
Actual Class:    13.0
---
Sample 4:
Predicted Class: 13.0
Actual Class:    11.0
---
Sample 5:
Predicted Class: 13.0
Actual Class:    13.0
---
