In [None]:
#Workaround (safe to ignore): this import is a hack that makes the %%cpp cell magic
#work inside mybinder.org. TObject itself is not used by the cells shown in this notebook.
from ROOT import TObject

In [None]:
%%cpp
//Recall we can read a file, get the TTree from it and draw something
ttBarFile = TFile::Open("https://atlas-opendata.web.cern.ch/Legacy13TeV/2lep/MC/mc_410000.ttbar_lep.2lep.root");
miniTree = static_cast<TTree*>(ttBarFile->Get("mini;1"));
TCanvas can;
miniTree->Draw("jet_pt[0]");
can.Draw()

In [None]:
%%cpp
//Another way to read the same tree uses a RDataFrame, built from the tree name and the file URL
ROOT::RDataFrame myDataFrame("mini;1", "https://atlas-opendata.web.cern.ch/Legacy13TeV/2lep/MC/mc_410000.ttbar_lep.2lep.root");
//Book a histogram of the jet_pt column. Note this is the pt of all jets in each event,
//not only jet_pt[0] as in the TTree::Draw call
auto myHist = myDataFrame.Histo1D("jet_pt");
//Reuse the canvas called can (it has to exist already, it is created in the TTree cell),
//wiping whatever was drawn on it before
can.Clear();
myHist->Draw();
can.Draw()

In [None]:
#Now the same exercise from python instead of c++
from ROOT import RDataFrame, TCanvas
#Location of the ttbar sample, the tree inside it is called mini
ttbarFileUrl = "https://atlas-opendata.web.cern.ch/Legacy13TeV/2lep/MC/mc_410000.ttbar_lep.2lep.root"
myPythonDataFrame = RDataFrame("mini;1", ttbarFileUrl)
#A canvas to draw on - the following cells keep drawing on this same canvas
myPythonCanvas = TCanvas()
#This time histogram the number of jets per event
myPythonHist = myPythonDataFrame.Histo1D("jet_n")
myPythonHist.Draw()
myPythonCanvas.Draw()

In [None]:
#The plot above shows that the number of jets changes from event to event
#Suppose we only want the jet_pt of events that have exactly four jets
#For that we add a Filter before the histogram - its string must be valid c++ code
#(it will crash if it's invalid..)
fourJetEvents = myPythonDataFrame.Filter("4 == jet_n")
myJetPtHist = fourJetEvents.Histo1D("jet_pt")
myJetPtHist.Draw()
myPythonCanvas.Draw()

In [None]:
#Now we want only the leading jet pt instead of the pt of every jet.
#Simply asking to draw jet_pt[0] won't work, so we Define a new variable ourselves.
#Step 1: throw away events without any jets - for those jet_pt[0] is an invalid
#entry in the array and accessing it would cause a crash.
eventsWithJets = myPythonDataFrame.Filter("jet_n > 0")
#Step 2: add the new column holding the first entry of jet_pt
myNewDataFrame = eventsWithJets.Define("leadingJetPt","return jet_pt[0]")
myLeadingJetPtHist = myNewDataFrame.Histo1D("leadingJetPt")
myLeadingJetPtHist.Draw()
myPythonCanvas.Draw()
#We see this is identical to the histogram we made from the TTree at the start of this workbook.

In [None]:
#Task 1 - create a new data frame with a typical semi-leptonic ttbar event selection consisting of
#exactly one lepton, at least four jets and at least one b-tagged jet
#Then plot the HT variable - this is the scalar sum of jet pt and Missing ET
#Hints: Information about the variables in the open data is here:
#http://opendata.atlas.cern/release/2020/documentation/datasets/dataset13.html

In [None]:
%%cpp
//What if we want to filter our data using variables not part of the TTree?
//We use lambda functions with a capture list.
//First define a cut value to use
float ptCut = 20;
//Then define a lambda function - it looks like usual in c++, except for the [&ptCut]
//capture list, which makes ptCut usable inside the lambda (you can mostly ignore it).
//note in standard CLING you don't need the "&ptCut" which is implicit.
auto ptCutFunction = [&ptCut](double pt){return pt > ptCut;};
//Then we first make sure we only use events with at least 2 jets and more than 1 lepton.
//Requiring jet_n >= 2 also guarantees that jet_pt[0] and jet_pt[1] used below both exist.
auto myFilteredData = myDataFrame.Filter("jet_n >= 2 && lep_n > 1");
//Now use the lambda function inside the Filter function - just name the function and tell
//it the name of the variable to pass as the argument (i.e. pt)
//The lambda takes a double and, for a compiled function like this one, the type of the column
//has to match the type of the argument - so the new column is explicitly made a double.
auto myFilteredDataB = myFilteredData.Define("leadingJetPt","return static_cast<double>(jet_pt[0]);").Filter(ptCutFunction,{"leadingJetPt"});
//filter it again using the second jet pt (same explicit double for the same reason)
auto myFilteredDataC = myFilteredDataB.Define("jetPt2","return static_cast<double>(jet_pt[1]);").Filter(ptCutFunction,{"jetPt2"});


In [None]:
#How do we do the same in python?
#Well it would look like this...
#ptCut = 20
#ptCutFunction = lambda pt, ptCut: pt > ptCut
#But RDataFrame does not support that, instead we use a special syntax to define via c++
import ROOT
#The #ifndef guard means that running this cell a second time does not try to declare
#ptCutFunctionB again (declaring the same name twice is a redefinition error).
#Note the cut value (50) is fixed inside the c++ function.
ROOT.gInterpreter.Declare("""
#ifndef PT_CUT_FUNCTION_B_DECLARED
#define PT_CUT_FUNCTION_B_DECLARED
auto ptCutFunctionB = [](double pt){return pt > 50;};
#endif
""")
#As for the leading jet pt plot earlier, first remove events without jets so that jet_pt[0]
#is always a valid entry, then define the new column and call the c++ function in the Filter string
myFilteredData = myPythonDataFrame.Filter("jet_n > 0").Define("leadingJetPt","return jet_pt[0]").Filter("ptCutFunctionB(leadingJetPt)")

In [None]:
#Is there a more pythonic way? Yes - the columns can be converted to python numpy arrays,
#and those can in turn be wrapped in a pandas data frame
import pandas
#Only the two columns we are interested in are converted
myArray = myPythonDataFrame.AsNumpy(columns=["jet_n", "jet_pt"])
df = pandas.DataFrame(myArray)

In [None]:
#This looks familiar from Lecture 4, where we got a pandas data frame via uproot.
#Ending the cell with the bare name displays the data frame.
df

In [None]:
#Lets count how many events we have - Count() returns a RResultPtr
#https://root.cern/doc/master/classROOT_1_1RDF_1_1RResultPtr.html
#which has a method GetValue() which returns the value the smart pointer points to
#Results are only computed when a value is first requested, so we book both counts
#before asking for either value - that way one pass over the file fills both of them
#instead of reading the (remote) file once per count.
allEventsCount = myPythonDataFrame.Count()
eventsWithJetsCount = myPythonDataFrame.Filter("jet_n>0").Count()
print(allEventsCount.GetValue())
print(eventsWithJetsCount.GetValue())

In [None]:
#Task 2 - print out a cutflow (i.e. the number of events passing each cut applied) for a typical semi-leptonic selection.
#Print out the efficiency too (number of events selected/number of events) of each step.