In [None]:
!pip install lightgbm
!pip install optuna
!pip install imblearn



In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [None]:
'''
Will load and flatten the dataset, then turn 3d data into 2d ie sets of particles will be converted to multiple entries of individual
particles with event IDs, then merging the data and using SMOTE
'''
# Load datasets
file1 = np.load('/content/QG_jets.npz')
file2 = np.load('/content/QG_jets_1.npz')
file3 = np.load('/content/QG_jets_14.npz')
X1, y1 = file1['X'], file1['y']
X2, y2 = file2['X'], file2['y']
X3, y3 = file3['X'], file3['y']

# Function to process events
def process_events(X, y):
    num_events, num_particles, num_features = X.shape
    X_reshaped = X.reshape(-1, num_features)
    event_labels = np.repeat(y, num_particles)
    event_ids = np.repeat(np.arange(num_events), num_particles)
    return X_reshaped, event_labels, event_ids

# Process datasets
X1_proc, y1_proc, event_ids1 = process_events(X1, y1)
X2_proc, y2_proc, event_ids2 = process_events(X2, y2)
X3_proc, y3_proc, event_ids3 = process_events(X3, y3)

# Combine datasets
X_combined = np.concatenate([X1_proc, X2_proc, X3_proc], axis=0)
y_combined = np.concatenate([y1_proc, y2_proc, y3_proc], axis=0)
event_ids_combined = np.concatenate([event_ids1, event_ids2, event_ids3], axis=0)

# Create DataFrame
df_particles = pd.DataFrame(X_combined, columns=['pT', 'Rapidity', 'Phi', 'PDGID'])
df_particles['EventID'] = event_ids_combined
df_particles = df_particles[df_particles['pT'] != 0]

# Add event labels
df_particles['EventLabel'] = y_combined[df_particles.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_particles['EventLabel'] = y_combined[df_particles.index]


In [None]:
'''
Cutting down on DataFrame due to computational expense
'''
df_particles = df_particles.sample(frac=1/3, random_state=42).reset_index(drop=True)

In [None]:
'''
Using SMOTE now to identify the minority class, creating synthetic samples (in between two already existing samples)
-> will balance dataset
'''

X_for_Split = df_particles[['pT', 'Rapidity', 'Phi', 'PDGID']]
Y_for_Split = df_particles['EventLabel']
smote = SMOTE(random_state = 3)
X_final, Y_final = smote.fit_resample(X_for_Split, Y_for_Split)

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=0.2, random_state=3)

# Check the training labels
print(X_train)

                pT  Rapidity       Phi  PDGID
1895711   0.197004  0.307349  4.256269 -321.0
3673504   1.044185 -0.297651  1.242695   22.0
2446179   1.936297 -0.928100  0.975068  211.0
2696472   0.716918  0.495493  2.554583  211.0
3718479   0.543804 -0.378393 -0.042480   22.0
...            ...       ...       ...    ...
1008584   2.215158  1.260680  0.440102   11.0
452227   21.193859 -0.360843  3.229057   22.0
4795641   5.484217 -0.440632  0.485342   22.0
1771160   1.845783  1.225236  2.998239 -211.0
71530     0.522798 -0.545662  3.857524   22.0

[4253492 rows x 4 columns]


In [None]:
'''
Configuring basic model and testing
'''

lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, Y_train)

# Evaluate
Y_pred = lgb_model.predict(X_test)
train_accuracy = lgb_model.score(X_train, Y_train)
test_accuracy = lgb_model.score(X_test, Y_test)
print(f'Training accuracy: {train_accuracy:.4f}')
print(f'Testing accuracy: {test_accuracy:.4f}')

Training accuracy: 0.5464
Testing accuracy: 0.5456


In [21]:
def tune(trial):
  p = {
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'random_state': 42,
        'verbosity': -1  }
  model = lgb.LGBMClassifier(**p)
  model.fit(X_train,Y_train,eval_set=[(X_test,Y_test),(X_train,Y_train)],eval_metric='logloss')
  train_accuracy = model.score(X_train, Y_train)
  test_accuracy = model.score(X_test, Y_test)
  print(f'Training accuracy: {train_accuracy:.4f}')
  print(f'Testing accuracy: {test_accuracy:.4f}')

  # Return the testing accuracy for optimization
  return test_accuracy


In [22]:
opt = optuna.create_study(direction = 'maximize')
opt.optimize(tune, n_trials = 15)
print("Best trial:")
trial = opt.best_trial
print(f"  Accuracy: {trial.value}")
print("  Hyperparameter Tuning Result: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-01-27 17:21:55,362] A new study created in memory with name: no-name-a45e944d-66b6-4303-8e91-749005525d2a
[I 2025-01-27 17:24:00,617] Trial 0 finished with value: 0.5466580901921619 and parameters: {'learning_rate': 0.12268572811509736, 'num_leaves': 50, 'max_depth': 10, 'n_estimators': 174}. Best is trial 0 with value: 0.5466580901921619.


Training accuracy: 0.5493
Testing accuracy: 0.5467


[I 2025-01-27 17:25:53,559] Trial 1 finished with value: 0.5470803310970552 and parameters: {'learning_rate': 0.19737567831284208, 'num_leaves': 48, 'max_depth': 9, 'n_estimators': 179}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5511
Testing accuracy: 0.5471


[I 2025-01-27 17:27:31,888] Trial 2 finished with value: 0.5463327107866094 and parameters: {'learning_rate': 0.13354392333099285, 'num_leaves': 41, 'max_depth': 7, 'n_estimators': 133}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5480
Testing accuracy: 0.5463


[I 2025-01-27 17:29:30,884] Trial 3 finished with value: 0.5459621920415583 and parameters: {'learning_rate': 0.10124218984311545, 'num_leaves': 27, 'max_depth': 10, 'n_estimators': 173}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5471
Testing accuracy: 0.5460


[I 2025-01-27 17:31:53,785] Trial 4 finished with value: 0.5469016545448732 and parameters: {'learning_rate': 0.19216272526909622, 'num_leaves': 39, 'max_depth': 8, 'n_estimators': 238}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5512
Testing accuracy: 0.5469


[I 2025-01-27 17:34:22,641] Trial 5 finished with value: 0.5459574900270272 and parameters: {'learning_rate': 0.08601035233411654, 'num_leaves': 20, 'max_depth': 8, 'n_estimators': 224}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5468
Testing accuracy: 0.5460


[I 2025-01-27 17:35:53,324] Trial 6 finished with value: 0.5460327222595248 and parameters: {'learning_rate': 0.14911966329736218, 'num_leaves': 27, 'max_depth': 10, 'n_estimators': 133}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5473
Testing accuracy: 0.5460


[I 2025-01-27 17:38:06,794] Trial 7 finished with value: 0.5462697037918925 and parameters: {'learning_rate': 0.10650021449225841, 'num_leaves': 42, 'max_depth': 8, 'n_estimators': 181}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5480
Testing accuracy: 0.5463


[I 2025-01-27 17:41:11,281] Trial 8 finished with value: 0.546943032272747 and parameters: {'learning_rate': 0.11982307136167332, 'num_leaves': 49, 'max_depth': 9, 'n_estimators': 288}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5514
Testing accuracy: 0.5469


[I 2025-01-27 17:43:49,354] Trial 9 finished with value: 0.547015443296526 and parameters: {'learning_rate': 0.16998919671291174, 'num_leaves': 42, 'max_depth': 9, 'n_estimators': 261}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5520
Testing accuracy: 0.5470


[I 2025-01-27 17:45:09,716] Trial 10 finished with value: 0.5449352720679648 and parameters: {'learning_rate': 0.0513359336235753, 'num_leaves': 33, 'max_depth': 5, 'n_estimators': 104}. Best is trial 1 with value: 0.5470803310970552.


Training accuracy: 0.5456
Testing accuracy: 0.5449


[I 2025-01-27 17:47:53,813] Trial 11 finished with value: 0.5473398822991723 and parameters: {'learning_rate': 0.1988033304666703, 'num_leaves': 45, 'max_depth': 9, 'n_estimators': 284}. Best is trial 11 with value: 0.5473398822991723.


Training accuracy: 0.5539
Testing accuracy: 0.5473


[I 2025-01-27 17:50:24,494] Trial 12 finished with value: 0.5467681173321898 and parameters: {'learning_rate': 0.19645133381488175, 'num_leaves': 47, 'max_depth': 6, 'n_estimators': 211}. Best is trial 11 with value: 0.5473398822991723.


Training accuracy: 0.5512
Testing accuracy: 0.5468


[I 2025-01-27 17:53:05,319] Trial 13 finished with value: 0.5475618173850404 and parameters: {'learning_rate': 0.16830237874701595, 'num_leaves': 45, 'max_depth': 9, 'n_estimators': 289}. Best is trial 13 with value: 0.5475618173850404.


Training accuracy: 0.5529
Testing accuracy: 0.5476


[I 2025-01-27 17:55:58,483] Trial 14 finished with value: 0.5468621576228119 and parameters: {'learning_rate': 0.168583192563389, 'num_leaves': 35, 'max_depth': 7, 'n_estimators': 299}. Best is trial 13 with value: 0.5475618173850404.


Training accuracy: 0.5512
Testing accuracy: 0.5469
Best trial:
  Accuracy: 0.5475618173850404
  Hyperparameter Tuning Result: 
    learning_rate: 0.16830237874701595
    num_leaves: 45
    max_depth: 9
    n_estimators: 289


In [28]:
final_model = lgb.LGBMClassifier(
    learning_rate = 0.16830237874701595,
    num_leaves = 45,
    max_depth = 9,
    n_estimators = 289
)
final_model.fit(X_train, Y_train)
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(Y_test, y_pred))

Final Model Accuracy: 0.5472
Classification Report:
               precision    recall  f1-score   support

         0.0       0.53      0.73      0.62    531504
         1.0       0.58      0.36      0.44    531870

    accuracy                           0.55   1063374
   macro avg       0.55      0.55      0.53   1063374
weighted avg       0.55      0.55      0.53   1063374

