# 2 - Learning Pytorch

This tutorial was made in collaboration with Aaron Wang and builds on his work with PyTorch models.

Make sure to install the Pytorch package before going through this tutorial.

In [1]:
#To install Pytorch, type this into your terminal:
pip install torch torchvision

SyntaxError: invalid syntax (<ipython-input-1-c678d52588d3>, line 2)

Import the packages used throughout the tutorial.

In [1]:
import torch
from sklearn.model_selection import train_test_split
import os
import h5py
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import matplotlib.pyplot as plt
import tqdm as tqdm

ModuleNotFoundError: No module named 'torch'

Open the file with the h5py package and use pandas to build a DataFrame that contains all of the features and labels (the types of jets). Then split the data into training and test sets with scikit-learn's `train_test_split`.

In [None]:
# Open the HDF5 file read-only. The context manager closes the file handle as soon as
# the dataset has been read, instead of leaving it open for the life of the kernel.
with h5py.File('processed-pythia82-lhc13-all-pt1-50k-r1_h022_e0175_t220_nonu_withPars_truth_0.z', 'r') as f:
    treeArray = f['t_allpar_new'][()]  # [()] reads the whole dataset into memory

# Input features used to train the model
features = ['j_zlogz', 'j_c1_b0_mmdt', 'j_c1_b1_mmdt', 'j_c1_b2_mmdt', 'j_c2_b1_mmdt',
            'j_c2_b2_mmdt', 'j_d2_b1_mmdt', 'j_d2_b2_mmdt', 'j_d2_a1_b1_mmdt', 'j_d2_a1_b2_mmdt',
            'j_m2_b1_mmdt', 'j_m2_b2_mmdt', 'j_n2_b1_mmdt', 'j_n2_b2_mmdt', 'j_mass_mmdt',
            'j_multiplicity']
# Label columns, one per jet type the model is trained to tell apart
labels = ['j_g', 'j_q', 'j_w', 'j_z', 'j_t']

# Keep only the feature and label columns, and drop repeated rows
features_labels_df = pd.DataFrame(treeArray, columns=features + labels).drop_duplicates()
features_val = features_labels_df[features].values
labels_val = features_labels_df[labels].values

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features_val, labels_val, test_size=0.2, random_state=42)


Check the shape of the training set: the number of columns (features) determines the input size of the model's first layer.

In [None]:
# Display the dimensions of the training feature array as this cell's output
X_train.shape

Create the PyTorch model with `torch.nn.Sequential`: four `Linear` layers, with a ReLU activation after each hidden layer and a Softmax layer on the output.

In [None]:
# Seed PyTorch's RNG so the randomly initialised weights are the same on every run
torch.manual_seed(42)

# Fully connected classifier: 16 input features -> 64 -> 32 -> 16 -> 5 output classes
model = torch.nn.Sequential(
    torch.nn.Linear(16, 64),  # input layer: one input node per feature
    torch.nn.ReLU(),
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 5),  # one output node per jet class
    # Normalise over the class axis. Giving `dim` explicitly avoids the deprecated
    # implicit-dimension behaviour; -1 works for both batched and single inputs.
    torch.nn.Softmax(dim=-1))

After creating the model, set the learning rate and wrap the training features and labels in `DataLoader`s so they can be fed to the model in batches.

In [None]:
# Step size used for each gradient-descent update of the model parameters
learning_rate = 1e-4

# Features and labels are batched by two separate loaders that are iterated together,
# so both must keep the original row order (shuffle=False) to stay aligned.
# Every other DataLoader option is left at its default value.
dataloader = DataLoader(X_train, batch_size=64, shuffle=False)   # input training batches
ydataloader = DataLoader(y_train, batch_size=64, shuffle=False)  # matching expected outputs

Choose the loss function that will be used to train the model and plotted to judge how well it is learning.

In [None]:
# Cross-entropy (also known as log loss) criterion used to score the classifier's
# predictions against the true class indices.
# NOTE(review): per the PyTorch docs, CrossEntropyLoss applies log-softmax to its input
# itself - confirm that the values passed to it are in the form it expects.
loss_fn = torch.nn.CrossEntropyLoss()

Train the model.

In [None]:
n_epochs = 100  # number of full passes over the training set
trainloss = np.zeros(n_epochs)  # mean training loss per epoch
valloss = np.zeros(n_epochs)    # validation loss per epoch

# Smallest probability passed to log(); guards against log(0) = -inf
eps = 1e-12

# Build the validation tensors once. Features are cast to float32 to match the model
# weights (as the training batches are); one-hot labels become class indices.
X_val = torch.from_numpy(X_test).float()
y_val = torch.from_numpy(y_test).argmax(dim=1)

for t in range(n_epochs):
    running_loss = 0.0
    n_seen = 0
    for x, y in zip(dataloader, ydataloader):
        y_pred = model(x.float())

        # The model already ends in Softmax, while CrossEntropyLoss applies log-softmax
        # to whatever it receives. Passing log-probabilities makes that inner log-softmax
        # a no-op (the probabilities sum to 1), so the result is the true cross-entropy
        # rather than a "double softmax" loss.
        loss = loss_fn(torch.log(y_pred.clamp_min(eps)), y.argmax(dim=1))

        model.zero_grad()
        loss.backward()

        # Plain gradient-descent step, done outside autograd tracking
        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad

        # Accumulate a size-weighted sum so the epoch value is the mean over all
        # samples, not just the loss of the final batch
        running_loss += loss.item() * len(x)
        n_seen += len(x)
    trainloss[t] = running_loss / max(n_seen, 1)

    # Validation pass: no gradients are needed, so skip building the autograd graph
    with torch.no_grad():
        val_pred = model(X_val)
        valloss[t] = loss_fn(torch.log(val_pred.clamp_min(eps)), y_val).item()

    print('Epoch' +' ' +str(t) + ' Train Loss:' +str(trainloss[t]))
    print('        ' + 'Val Loss:' + str(valloss[t]))

Define a function that plots the training and validation loss for each epoch.

In [None]:
def learningCurve(trainloss,valloss):
    """Draw the training and validation loss histories on a single figure.

    Parameters
    ----------
    trainloss : sequence of float
        Training loss values, plotted against their position in the sequence.
    valloss : sequence of float
        Validation loss values, plotted the same way.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    # Plot order matters: the legend entries below are matched to the lines in order
    for history in (trainloss, valloss):
        ax.plot(history)
    ax.set_title('Model Loss over Epochs')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['training sample loss', 'validation sample loss'])
    plt.show()
    plt.close()  # release the figure once it has been displayed

Plot the model loss over the epochs using the function you just defined.

In [None]:
# Plot the per-epoch training and validation losses recorded by the training loop
learningCurve(trainloss,valloss)

Finally, plot a ROC curve for each of the five jet taggers and compare their performance.

In [None]:
def makeRoc(features_val, labels_val, labels, model, outputDir='', outputSuffix=''):
    """Plot one ROC curve per label and return the model's predictions.

    Parameters
    ----------
    features_val : array-like
        Input features, one row per sample; converted to a float32 tensor
        before being passed to ``model``.
    labels_val : numpy.ndarray
        True labels, one column per entry of ``labels`` (column ``i`` is the
        truth for ``labels[i]``).
    labels : list of str
        Label names; a leading ``'j_'`` is stripped for the legend text.
    model : callable
        Model mapping the feature tensor to one score column per label.
    outputDir, outputSuffix : str, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    torch.Tensor
        The model output for ``features_val``.
    """
    from sklearn.metrics import roc_curve, auc

    # Cast to float32 so the input dtype matches the model weights, mirroring the
    # `.float()` applied to batches during training.
    inputs = torch.as_tensor(features_val, dtype=torch.float32)
    with torch.no_grad():  # inference only: no autograd graph needed
        labels_pred = model(inputs)
    scores = labels_pred.detach().cpu().numpy()

    fpr = {}
    tpr = {}
    auc1 = {}
    plt.figure(figsize=(10,8))
    for i, label in enumerate(labels):
        # One-vs-rest ROC: column i of the truth against column i of the scores
        fpr[label], tpr[label], _ = roc_curve(labels_val[:, i], scores[:, i])
        auc1[label] = auc(fpr[label], tpr[label])
        plt.plot(fpr[label],tpr[label],label='%s tagger, AUC = %.1f%%'%(label.replace('j_',''),auc1[label]*100.))
    # Diagonal reference line: the ROC of a random classifier
    plt.plot([0, 1], [0, 1], lw=1, color='black', linestyle='--')
    plt.xlabel("Background Efficiency")
    plt.ylabel("Signal Efficiency")
    plt.xlim([-0.05, 1.05])
    plt.ylim(0.001,1.05)
    plt.grid(True)
    plt.legend(loc='lower right')
    plt.figtext(0.25, 0.90,'DNN ROC Curve',fontweight='bold', wrap=True, horizontalalignment='right', fontsize=14)
    plt.show()  # display explicitly, as learningCurve does
    return labels_pred

In [None]:
# Draw the ROC curves on the held-out test set; z keeps the model's predictions for X_test.
# NOTE(review): makeRoc does not read outputSuffix, so 'two-layer' has no effect here.
z = makeRoc(X_test, y_test, labels, model, outputSuffix='two-layer')