# Neural Network structure

How should we think about the "layers" and the "nodes?"
When we say "1 hidden layer," what does that mean?

Let's build a network model and count the weights and biases we have there.

In [None]:
import torch.nn as nn

# Fully connected network: 2 inputs -> 3 hidden nodes -> 1 output, with a
# sigmoid activation after each linear layer.
layers = [
    nn.Linear(in_features=2, out_features=3),
    nn.Sigmoid(),
    nn.Linear(in_features=3, out_features=1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layers)

In [None]:
# Show the layer-by-layer structure of the network built above.
print(model)

In [None]:
# Pick out the first linear layer's parameters by name.  The entries of an
# nn.Sequential are named by position, so '0.weight' and '0.bias' belong to
# the first layer.
first_layer_params = {
    name: param.data
    for name, param in model.named_parameters()
    if name in ('0.weight', '0.bias')
}
weight_tensor = first_layer_params['0.weight']
bias_tensor = first_layer_params['0.bias']

print(weight_tensor)
print(bias_tensor)

In [None]:
!pip install torchview
from torchview import draw_graph
import torchvision.models as models

# Draw the graph
# input_size=(1, 2) has 2 entries in its last dimension, matching the 2 input
# features of the first nn.Linear layer; the leading 1 is presumably the batch
# size -- TODO confirm against the torchview documentation.
model_graph = draw_graph(model, input_size=(1, 2), expand_nested=False)
# Left as the cell's final expression so the notebook displays the rendered graph.
model_graph.visual_graph

# SUSY classification with Neural Networks and Decision Trees

If decision trees are great with structured data, then we expect them to do very well with the SUSY ML dataset.

Let's compare the performance for the simple DecisionTreeClassifier, XGBoost, and a deep neural network.

First we should develop some standards for the training set and the testing set.
What is a reasonable split of the data between training and testing?

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split

class CSVDataset(Dataset):
    """Map-style dataset backed by a purely numeric CSV file.

    Column 0 of every row is used as the label and all remaining columns as
    the feature vector.  The table is converted to float32 tensors once, at
    construction time, so that item access is a cheap tensor index instead of
    a pandas row lookup on every call.

    Args:
        csv_file: Path, URL or file-like object accepted by
            ``pandas.read_csv``.
        max_samples: If truthy, keep only the first ``max_samples`` rows.
        header: Forwarded to ``pandas.read_csv``.  The default ``"infer"``
            keeps the previous behaviour (the first line is treated as column
            names); pass ``None`` for files that have no header line.

    Attributes:
        data: The loaded ``pandas.DataFrame``.
        features: float32 tensor holding every column except the first.
        labels: float32 tensor holding the first column.
    """

    def __init__(self, csv_file, max_samples=None, header="infer"):
        # Ask pandas to stop parsing after max_samples rows rather than
        # reading the whole file and throwing most of it away.  nrows only
        # accepts positive counts, so other truthy values (e.g. a negative
        # count) fall through to the slice below, as before.
        row_limit = max_samples if max_samples and max_samples > 0 else None
        self.data = pd.read_csv(csv_file, header=header, nrows=row_limit)
        if max_samples:
            self.data = self.data.iloc[:max_samples]

        # One-off conversion; __getitem__ then only indexes these tensors.
        values = self.data.to_numpy(dtype="float32")
        self.features = torch.tensor(values[:, 1:], dtype=torch.float32)
        self.labels = torch.tensor(values[:, 0], dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``features`` is a 1-D float32 tensor and ``label`` a 0-D float32
        tensor.  Both are views into the tensors built in ``__init__``.
        """
        return self.features[idx], self.labels[idx]

In [None]:
# Load the first 10000 rows of the SUSY dataset.
# NOTE(review): the SUSY file appears to have no header line; with pandas'
# default header inference the first event would be consumed as column names
# -- verify against the dataset description.
dataset = CSVDataset(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz',
    10000,
)

# 80% of the events are used for training; whatever is left is held out for testing
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# A fixed seed puts the same events in each split on every run
split_rng = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=split_rng)

print(f"Training samples: {len(train_dataset)}")
print(f"Testing samples: {len(test_dataset)}\n")


In [None]:
# DataLoaders handle batching and, when requested, shuffling.
# Training batches are reshuffled every epoch so the optimizer does not see
# the events in the same order each time.
training_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the test
# results reproducible from run to run.
testing_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

We will make sure the training is done on the training dataset.

In [None]:
import torch.nn as nn
import torch.optim as optim

# One hidden layer: 18 input features -> 30 hidden nodes -> 1 output.
# The final Sigmoid makes the model output a probability in (0, 1).
model = nn.Sequential(
    nn.Linear(18, 30),
    nn.Sigmoid(),
    nn.Linear(30, 1),
    nn.Sigmoid()
)

# BCELoss expects probabilities, which is what the final Sigmoid produces.
# BCEWithLogitsLoss applies a sigmoid of its own and would therefore squash
# the already-squashed output a second time.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    samples_seen = 0
    for batch_features, batch_labels in training_dataloader:
        # Forward pass
        predictions = model(batch_features)
        # Labels arrive with shape (batch,); add a trailing dimension so they
        # match the (batch, 1) model output.
        loss = criterion(predictions, batch_labels.unsqueeze(1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a sample-weighted sum so that the reported value is the
        # mean over the whole epoch, not just the last mini-batch.
        batch_count = batch_labels.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # max(..., 1) guards against an empty dataloader.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / max(samples_seen, 1):.4f}')

But the testing should be done on the testing dataset.
Check to see that the number of events (number of input vectors) is what you expect.

We'll also compare the loss value on the testing data to the last loss value on the training data.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Evaluate on the held-out test set
model.eval()
all_predictions = []
all_labels = []
total_loss = 0.0
total_samples = 0

with torch.no_grad():
    for batch_features, batch_labels in testing_dataloader:
        targets = batch_labels.unsqueeze(1)
        outputs = model(batch_features)

        # Compute the loss from the model outputs, exactly as during
        # training.  The thresholded 0/1 predictions below are only for the
        # confusion matrix; feeding them to the loss would not be comparable
        # with the training loss.
        batch_loss = criterion(outputs, targets)
        total_loss += batch_loss.item() * targets.size(0)
        total_samples += targets.size(0)

        predictions = (outputs > 0.5).float()
        # Flatten (batch, 1) -> (batch,) so predictions line up with the labels.
        all_predictions.extend(predictions.squeeze(1).tolist())
        all_labels.extend(batch_labels.tolist())

# Sample-weighted mean over the whole test set rather than the last batch only.
test_loss = total_loss / max(total_samples, 1)
print(f'Test loss: {test_loss:.4f}')

# Create confusion matrix
cm = confusion_matrix(all_labels, all_predictions)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

Now let's try the decision tree, as implemented in scikit-learn.
We don't need the PyTorch dataloader here, just the training and testing datasets.
Unfortunately the training and testing datasets were made as Torch tensors, and scikit-learn needs simple NumPy arrays, so we convert them first.

In [None]:
# Peek at the first (features, label) pair of each split
print(train_dataset[0], test_dataset[0], sep="\n")

In [None]:
import numpy as np
import torch

# Prepare training data: unzip the (features, label) pairs into two parallel
# lists, then stack each list into a single NumPy array.
train_features_list, train_labels_list = map(list, zip(*train_dataset))
X_train = torch.stack(train_features_list).numpy()
y_train = torch.stack(train_labels_list).numpy()

# Prepare testing data the same way
test_features_list, test_labels_list = map(list, zip(*test_dataset))
X_test = torch.stack(test_features_list).numpy()
y_test = torch.stack(test_labels_list).numpy()

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn import datasets # import inbuild datasets

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

score=[]
# Seed the tree so repeated runs give the same model: scikit-learn permutes
# the candidate features at every split, so an unseeded tree can differ
# between runs even on identical data.  The train/test split is seeded with
# 42 as well.
dtclassifier = DecisionTreeClassifier(random_state=42)
dtclassifier.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split versus the held-out test split
train_accuracy = dtclassifier.score(X_train, y_train)
test_accuracy = dtclassifier.score(X_test, y_test)
train_accuracy, test_accuracy

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Classify the test events with the fitted decision tree and tabulate the
# outcomes against the true labels
y_pred_dt = dtclassifier.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred_dt)

# Draw the matrix as an annotated heat map
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for Decision Tree Classifier')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

Finally, let's try the XGBoost and see if the decision tree can be improved.

Fortunately XGBoost, although it is a separate library, provides a scikit-learn-compatible interface (`XGBClassifier`), so we can use the same training and testing dataset formats as for the DecisionTreeClassifier.

In [None]:
from xgboost import XGBClassifier
import xgboost
# Small boosted ensemble: 2 trees of depth 2 with a logistic objective
xgb_settings = {
    'n_estimators': 2,
    'max_depth': 2,
    'learning_rate': 1,
    'objective': 'binary:logistic',
}
xgbclf = XGBClassifier(**xgb_settings)
# Train on the training split, then classify the held-out test events
xgbclf.fit(X_train, y_train)
preds = xgbclf.predict(X_test)

In [None]:
# Get predictions from the XGBoost classifier on the test set.
# Separate variable names keep the decision-tree results (y_pred_dt, cm_dt)
# available for comparison instead of overwriting them.
y_pred_xgb = xgbclf.predict(X_test)

# Create confusion matrix for XGBoost
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

plt.figure(figsize=(6, 5))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues')
# The title names the model that was actually evaluated.
plt.title('Confusion Matrix for XGBoost Classifier')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Conclusions

Are the confusion matrices detailed enough to help us understand the performance of the various machine learning methods?

They help us calculate a "false positive rate" and a "true positive rate" for each ML algorithm.
We can explore different working points for each algorithm by changing the tree queries.

Here is an example of the "Receiver Operating Characteristic" (ROC) curve, which plots the tradeoff between the "false positive rate" and the "true positive rate" on the same plot.

Can you guess which point on the curve represents the best overall performance? [Hint: the dotted blue line is a random classifier (a coin flip).]

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Column 1 of predict_proba is the score for the positive class
y_pred_proba_xgb = xgbclf.predict_proba(X_test)[:, 1]

# Sweep the decision threshold to obtain the ROC points and the area under them
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_xgb)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve for XGBoost Classifier')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()