In [1]:
import pandas as pd
import numpy as np
import copy 

In [2]:
df = pd.read_csv("keeping_sorted.csv")

In [3]:
datasets = df["dataset"]
location = df["rse_id"]
tier = df["DTier"]
size_on_tape = df["size_on_tape_[TB]"]

In [4]:
exceptions = pd.read_csv("exceptions_fall2022.txt")

In [5]:
exception_list = exceptions["item"]

## Total size on tape

In [6]:
np.sum(size_on_tape)

144805.72161000004

In [7]:
keep = np.full( np.size(datasets), False )

## Breakdown of campaign to "human readable" format

In [8]:
categories = {

    "Skims":["USER"],
    "Commissioning + Cosmics":[
       "Commissioning08",     
       "Commissioning10",  
       "Commissioning11",  
       "Commissioning12",  
       "Commissioning2015",
       "Commissioning2017",
       "Commissioning2018",
       "Commissioning2021",
       "Commissioning2022",
    ],
    
    "Cosmics MC":[
        "RunIISpring18CosmicDR"
    ],
        
    "Run 3":[
        "Run3Summer19DRPremix",       
        "Run3Summer19DR",  
        "Run3Winter20",
        "Run3Winter20DRPremixMiniAOD",
        "Run3Winter21DRMiniAOD",      
        "Run3Summer21",               
        "Run3Winter21",               
        "Run3Summer21GS",             
        "Run3Summer21PrePremix",      
        "Run3Summer21wmLHEGS",        
        "Run3Summer22",                  
    ],
    
    "Run 2 legacy MC":[
        "Summer19UL",
        "Summer20UL",
        "Spring21UL",
    ],
    
    "Run 2 legacy data":[
        "UL2016",
        "UL2017",
        "UL2018",
    ],
    
    "Run 2 pre-legacy data":[
        "23Sep2016",
        "18Apr2017",
        "07Aug17",
        "12Sep2017",
        "17Nov2017",
        "09May2018",
        "06Jun2018",
        "17Sep2018",
        "22Jan2019",
        "Run2018D-PromptReco",
        "RunIIWinter17DR",
    ],

    "Run 2 pre-legacy MC":[
        "RunIISpring15PrePremix",    
        "RunIIFall15DR76",           
        "RunIISummer16DR80",         
        "RunIISummer16DR80Premix",   
        "RunIIFall17GS",             
        "RunIIFall17DRPremix",       
        "RunIISummer17PrePremix",    
        "RunIIFall17FSPremix",       
        "RunIIFall17FSPrePremix",    
        "RunIIFall18wmLHEGS",        
        "RunIIFall18GS",             
        "RunIIAutumn18DRPremix",     
        "RunIIAutumn18DR",           
        "RunIIAutumn18RECOBParking", 
        "RunIIAutumn18FSPremix",        
        "RunIIWinter19PFCalib17wmLHEGS",
        "RunIISpring18DRPremix",        
        "RunIIFall17DRStdmix",          
        "RunIISummer17DRStdmix",        
        "RunIISpring18DR",
        "RunIISpring18GS",
        "RunIISpring15DR74",            
        "RunIISummer16FSPremix",        
        "RunIISummer15wmLHEGS",         
        "RunIIWinter19CosmicDR",        
        "RunIIFall17wmLHEGS",           
        "RunIIWinter19PFCalib16wmLHEGS",
        "RunIIFall17wmLHEGENOnly",      
        "RunIIWinter15GS",
        "RunIISpring16FSPremix",
        "RunIIWinter19PFCalib16GS",
        "RunIILowPUAutumn18GS",
        "RunIIWinter19PFCalib",
        "RunIISummer15"
    ],
    
    "B-Parking":[
        "ParkingBPH",
        "SkimBPark"
    ],    
    
    "HIN":[
        "HIRun2010", 
        "HIRun2011", 
        "HIRun2013", 
        "HIRun2015", 
        "PARun2016", 
        "HIRun2018", 
        "HINPbPbAutumn18DR",
        "HiWinter13", 
        "pPb816Summer16DR",
        "HINPbPbWinter16DR",
        "HiFall13DR53X",
        "pAWinter13DR53X",
        "XeXeRun2017",
        "HIRun2013A",
        "HINppWinter16DR",
        "pPb816Spring16GS",
        "HiFall13",
    ],

    "ALCA":[
        "StreamExpressAlignment",
        "TkAlCosmicsInCollisions",
        "ALCARECO"
    ],
    
    "DPOA":[
        "LowPU2010",
        "Run2010",
        "Run2010",
        "Run2010",
        "Run2011",
        "Run2011",
        "Run2011",
        "Run2012",
        "Run2012",
        "Summer11",
        "Summer11Leg",
        "Fall11",
        "Summer11LegDR",
        "Summer12DR53X",
        "Summer11dr53X",
        "Summer12_DR53X",
        "Summer12",
        "Summer12",
        "Summer13dr53X",
        "Run2015C_25ns",
        "Run2015",
        "Run2015",
    ],
    
    "Upgrades":[
        "Phase2HLTTDRWinter20DIGI", 
        "Phase2HLTTDRSummer20ReRECOMiniAOD",
        "PhaseIITDRSpring17DR", 
        "PhaseIIFall16DR82",
        "PhaseIISummer17wmLHEGENOnly",
        "GEM2019Upg14DR",
        "PhaseISpring17DR", 
        "PhaseIFall16DR", 
        "Phase2HLTTDRWinter20GS",
        "PhaseIIMTDTDRAutumn18DR",
        "PhaseIITDRSpring19DR",
        "PhaseIITDRFall17GS",
        "PhaseIITDRFall17DR",
        "PhaseIISpr18AODMiniAOD",
        "TP2023HGCAL",
        "SHCAL2023Upg14",
        "TP2023SHCALDR",
        "PhaseIISummer17GenOnly",
        "Phase2HLTTDRWinter20RECOMiniAOD",
        "PhaseIIFall17D",
        "Muon2023Upg14DR",
        "RunIISummer15",
        "RunIISpring16reHLT80",
        "Upg2023SHCAL14"
    ],
    


    
    "Prompt":[
        "Prompt",
    ],
    
    "LHE":[
        "LHE",
    ],
    
    "Data challenges":[
        "CONTAINER"
    ],

    "Miscellaneous":[
        "CMSSW_1_3_2",
    ],
}



### List of all campaigns

In [9]:
campaign_list = []

for vals in categories.values():
    campaign_list = campaign_list + vals

### Map of campaigns to the category they belong to

In [10]:
cats_map = {}
for val in campaign_list: 
    for cat in categories:
        if val in categories[cat]:
            cats_map[val] = cat

### Map of datasets to the category they belong to

In [11]:
dataset_categories = []
for dataset in datasets:
    found = False
    for category in categories: 
        for campaign in categories[category]:
            if campaign in dataset and not found: 
                dataset_categories.append(category)
                found = True
    if not found:
        dataset_categories.append("Miscellaneous")

In [12]:
df['category'] = dataset_categories

In [13]:
for category in categories:
    isum = np.sum( df[ df["category"] == category]["size_on_tape_[TB]"] )
    print( "%30s : %12.1f" % ( category, isum ) )

                         Skims :       4063.3
       Commissioning + Cosmics :        634.3
                    Cosmics MC :          0.0
                         Run 3 :       8562.9
               Run 2 legacy MC :      25203.9
             Run 2 legacy data :      15986.4
         Run 2 pre-legacy data :       9441.4
           Run 2 pre-legacy MC :      46451.0
                     B-Parking :       3765.2
                           HIN :      10318.7
                          ALCA :         62.2
                          DPOA :      11698.7
                      Upgrades :       8055.3
                        Prompt :        321.3
                           LHE :          0.0
               Data challenges :          0.0
                 Miscellaneous :        241.1


## Decide which datasets to delete and keep

#### Specific exceptions

In [14]:
retained1 = np.in1d(datasets, exception_list)

In [15]:
keep = np.where( retained1 == True, True, keep )

#### Global exceptions

In [16]:
global_exceptions = {}

for campaign in campaign_list:
    global_exceptions[campaign] = []

In [17]:
campaign_tiers_to_keep = [
        ["USER", "USER"],                         #
        ["Commissioning08", "*"],                 #    
        ["Commissioning10", "*"],                 #
        ["Commissioning11", "*"],                 #
        ["Commissioning12", "RECO"],              #
        ["Commissioning2015", "*"],               # 
        ["Commissioning2017", "AOD"],             #
        ["Commissioning2018", "RAW-RECO"],        #
        ["Commissioning2021", "*"],                # 
        ["Commissioning2022", "*"],                # 
        ["Run3Summer19DRPremix", "*"],       #
        ["Run3Summer19DR", "*"],             #
        ["Run3Winter20", "*"],
        ["Run3Winter20DRPremixMiniAOD", "AODSIM"],#
        ["Run3Winter21DRMiniAOD", "AODSIM"],       # 
        ["Run3Summer21", "*"],                    #
        ["Run3Winter21", "*"],                    #
        ["Run3Summer21GS", "*"],                  #
        ["Run3Summer21PrePremix", "*"],           #
        ["Run3Summer21wmLHEGS", "GEN-SIM"],       #
        ["Run3Summer22", "*"],                    #    
        ["Summer20UL", "*"],                       # 
        ["Spring21UL", "*"],                       # 
        ["UL2016", "*"],                           # 
        ["UL2017", "*"],                           # 
        ["UL2018", "*"],                           #
        #["23Sep2016", "AOD"], 
        #["18Apr2017", "AOD"], 
        #["07Aug17", "AOD"], 
        #["12Sep2017", "AOD"], 
        #["17Nov2017", "AOD"], 
        #["09May2018", "AOD"], 
        #["06Jun2018", "AOD"], 
        #["17Sept2018", "AOD"], 
        #["22Jan2019", "AOD"], 
        #["Run2018D-PromptReco", "AOD"], 
        #["Run2018D-PromptReco", "AOD"],    
        ["RunIISpring15PrePremix", "PREMIX"],     #
        #["RunIIFall15DR76", "AODSIM"],            #
        #["RunIISummer16DR80", "AODSIM"],          #     
        ["RunIISummer16DR80Premix", "PREMIX"],     #
        #["RunIIFall17GS", "*"],                   #
        ["RunIIFall17DRPremix", "PREMIX"],         # 
        ["RunIISummer17PrePremix", "PREMIX"],      #
        ["RunIIFall17FSPremix", "PREMIX"],             #
        ["RunIIFall17FSPrePremix", "PREMIX"],          #
        #["RunIIFall18wmLHEGS", "*"],              #
        #["RunIIFall18GS", "*"],                   #
        ["RunIIAutumn18DRPremix", "PREMIX"],      # 
        #["RunIIAutumn18DR", "AODSIM"],            #
        ["RunIIAutumn18RECOBParking", "AODSIM"],  #
        #["RunIIAutumn18FSPremix", "*"],      #
        ["RunIIWinter19PFCalib17wmLHEGS", "LHE"], #
        ["RunIISpring18DRPremix", "PREMIX"],      #
        #["RunIIFall17DRStdmix", "*"],        #
        #["RunIISummer17DRStdmix", "*"],      #
        #["RunIISpring18DR", "AODSIM"],            #
        #["RunIISpring15DR74", "AODSIM"],          #
        #["RunIISummer16FSPremix", "*"],      #
        ["RunIISummer15wmLHEGS", "LHE"],          #
        #["RunIIWinter19CosmicDR", "*"],           #
        ["RunIIFall17wmLHEGS", "LHE"],            #    
        #["RunIIWinter19PFCalib16wmLHEGS", "*"],   #
        ["RunIIFall17wmLHEGENOnly", "LHE"],       #
        ["RunIIWinter15GS", "GEN-SIM"],           #
        ["SkimBPark", "AOD"],
        ["ParkingBPH", "AOD"],                     # 
        ["HIRun2010", "RECO"],                    # 
        ["HIRun2011", "AOD"],                     #
        ["HIRun2013", "AOD"],                     #
        ["HIRun2015", "AOD"],                     #
        ["PARun2016", "AOD"],                     #
        ["HIRun2018", "AOD"],                     #
        ["HINPbPbAutumn18DR", "AODSIM"],          # 
        ["HiWinter13", "AODSIM"],                 #
        ["pPb816Summer16DR", "AODSIM"],           #
        ["HINPbPbWinter16DR", "AODSIM"],          #
        ["HiFall13DR53X", "GEN-SIM-RECO"],        #    
        ["pAWinter13DR53X", "GEN-SIM-RECO"],      #
        ["XeXeRun2017", "*"],                     #
        ["HIRun2013A", "RECO"],                   #
        ["HINppWinter16DR", "AODSIM"],            #
        ["LowPU2010", "GEN-SIM"],                 #
        ["Run2010", "AOD"],                       # DPOA
        ["Run2010", "RAW-RECO"],                  # Keep cosmics
        ["Run2010", "RECO"],                      # Keep cosmics
        ["Run2011", "AOD"],                       #
        ["Run2011", "RAW-RECO"],                  #
        ["Run2011", "RECO"],                      #    
        ["Run2012", "AOD"],                       # DPOA
        ["Run2012", "RAW-RECO"],                  # Cosmics
        ["Summer11", "AODSIM"],                   #
        ["Summer11Leg", "GEN-SIM"],               #
        ["Fall11", "AODSIM"],                     #    
        ["Summer11LegDR", "AODSIM"],              #     
        ["Summer12DR53X", "AODSIM"],              #
        ["Summer11dr53X", "AODSIM"],              #
        ["Summer12_DR53X", "*"],                  #     
        ["Summer12", "AODSIM"],                   #
        ["Summer12", "GEN-SIM"],                  # 
        ["Summer13dr53X", "AODSIM"],              #
        ["Run2015C_25ns", "*"],                   #    
        ["Run2015", "AOD"],                       # DPOA
        ["Run2015", "RAW-RECO"],                  # Cosmics
        ["Phase2HLTTDRWinter20DIGI", "*"],        # Requested by TSG    
        ["Phase2HLTTDRSummer20ReRECOMiniAOD", "*"], #
        ["PhaseIITDRSpring17DR", "AOD"],          #
        ["PhaseIIFall16DR82", "AODSIM"],          #
        ["PhaseIISummer17wmLHEGENOnly", "LHE"],   # 
        ["GEM2019Upg14DR", "AODSIM"],             #
        ["PhaseISpring17DR", "AODSIM"],           # 
        ["PhaseIFall16DR", "AODSIM"],             #
        ["StreamExpressAlignment", "ALCARECO"],   #
        ["TkAlCosmicsInCollisions", "ALCARECO"],  #
        ["CMSSW_1_3_2", "RECO"],                  # What is this?
        ]

In [18]:
for ipair in campaign_tiers_to_keep :
    if ipair[1] not in global_exceptions[ipair[0]]:
        global_exceptions[ipair[0]].append(ipair[1])

In [19]:
datasets_strings = datasets.to_numpy(dtype="str")
tier_strings = tier.to_numpy(dtype="str")

### Go through all data tiers in the global exception list and keep those requested. 

In [20]:
for icampaign,itiers in global_exceptions.items():
    if "*" in itiers: 
        keep = np.where((np.core.defchararray.find(datasets_strings, icampaign) == -1),keep, True)
    else: 
        for itier in itiers: 
            found_campaign_in_dataset = (np.core.defchararray.find(datasets_strings, icampaign) != -1)
            tier_should_be_kept = np.where(tier == itier, True, False)
            keep = np.where(found_campaign_in_dataset & tier_should_be_kept,
                True, 
                keep)

# Current size breakdown

In [21]:
print('Total size               : %12.1f' % np.sum(size_on_tape))
print('Total size to keep       : %12.1f' % np.sum(size_on_tape[keep]))
print('Total size to delete     : %12.1f' % np.sum(size_on_tape[keep == False]))

Total size               :     144805.7
Total size to keep       :     100580.5
Total size to delete     :      44225.2


In [22]:
datasets_kept = datasets[keep]
datasets_categories_kept = np.full_like(datasets_kept, "")
size_on_tape_kept = size_on_tape[keep]

In [23]:
np.sum(size_on_tape_kept)

100580.48163

## Invalid exceptions

In [24]:
exception_valid = np.in1d(exception_list, datasets[keep == True])
exception_names = exception_list[exception_valid == False]

### Exceptions that don't seem to be on tape.

These are going to be safe and off of the "to delete" list, but just in case people are curious. 

In [25]:
for i in exception_names: 
    if "*" not in i : 
        print(i)

/SingleMuon/Run2016D-SiPixelCalSingleMuon-21Feb2020_UL2016_HIPM_WMass-v2/ALCARECO
/SingleMuon/Run2016H-SiPixelCalSingleMuon-21Feb2020_UL2016_WMass-v2/ALCARECO
/SingleMuon/Run2016H-SiPixelCalSingleMuon-21Feb2020_UL2016_WMass-v3/ALCARECO
/StreamExpress/Run2018A-TkAlMinBias-Express-v1/ALCARECO
/StreamExpress/Run2018B-TkAlMinBias-Express-v1/ALCARECO
/StreamExpress/Run2018C-TkAlMinBias-Express-v1/ALCARECO
/StreamExpress/Run2018D-TkAlMinBias-Express-v1/ALCARECO
/Cosmics/Commissioning2021-TkAlCosmics0T-CRUZETmkFit-v1/ALCARECO
/Cosmics/Commissioning2022-TkAlCosmics0T-PromptReco-v1/ALCARECO
/AlCaLumiPixels/Run2018B-AlCaPCCZeroBias-PromptReco-v3/ALCARECO
/AlCaLumiPixels/Run2018E-AlCaPCCZeroBias-PromptReco-v2/ALCARECO
/Cosmics/Commissioning2022-SiStripCalCosmics-PromptReco-v1/ALCARECO
/Cosmics/Commissioning2021-TkAlCosmics0T-CRUZETmkFit-v1/ALCARECO
/Cosmics/Commissioning2022-TkAlCosmics0T-PromptReco-v1/ALCARECO
/Cosmics/Commissioning2022-SiStripCalCosmics-PromptReco-v1/ALCARECO


# Make formatted lists to keep and drop

In [26]:
df_keep = df[keep == True]
df_drop = df[keep == False]

In [27]:
## Stupid delimeter. 
delim = ''.join( ['-' for i in range(75)])

# Keep list

In [28]:
print( "%30s : %12s | %12s | %12s" % ( "Category", "T0", "T1", "Total" ) )
print(delim)
for category in categories:
    isum_t0 = np.sum( df_keep[ (df_keep["category"] == category) & (df_keep["rse_id"] == "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
    isum_t1 = np.sum( df_keep[ (df_keep["category"] == category) & (df_keep["rse_id"] != "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
    print( "%30s : %12.1f | %12.1f | %12.1f" % ( category, isum_t0, isum_t1, isum_t0 + isum_t1 ) )
print(delim)
sum_t0 = np.sum( df_keep[(df_keep["rse_id"] == "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
sum_t1 = np.sum( df_keep[(df_keep["rse_id"] != "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
print( "%30s : %12.1f | %12.1f | %12.1f" % ( "Total", sum_t0, sum_t1, sum_t0 + sum_t1 ) )

                      Category :           T0 |           T1 |        Total
---------------------------------------------------------------------------
                         Skims :        371.3 |       3692.0 |       4063.3
       Commissioning + Cosmics :         17.8 |        616.5 |        634.3
                    Cosmics MC :          0.0 |          0.0 |          0.0
                         Run 3 :       3087.2 |       5475.7 |       8562.9
               Run 2 legacy MC :       8382.1 |      16698.5 |      25080.6
             Run 2 legacy data :       1362.2 |      14624.3 |      15986.4
         Run 2 pre-legacy data :        788.6 |       1330.6 |       2119.1
           Run 2 pre-legacy MC :       2442.9 |       8470.9 |      10913.8
                     B-Parking :       2871.9 |        893.3 |       3765.2
                           HIN :       1042.9 |       8865.3 |       9908.2
                          ALCA :         52.2 |         10.0 |         62.2
            

In [29]:
keeping_sorted = df_keep.sort_values(by="size_on_tape_[TB]", ascending=False)

In [30]:
keeping_sorted.get("dataset", "size_on_tape_[TB]")

0         /Neutrino_E-10_gun/RunIISummer17PrePremix-PUFu...
87401     /Neutrino_E-10_gun/RunIISummer17PrePremix-PUAu...
76074     /Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL...
25367     /Neutrino_E-10_gun/RunIISummer17PrePremix-MCv2...
76075     /Neutrino_E-10_gun/RunIISummer17PrePremix-MCv2...
                                ...                        
51093     /CIToMuMu_Des_Lambda-9_M-800_TuneZ2star_8TeV-p...
51092     /CIToEE_Des_Lambda-9_M-500_TuneZ2star_8TeV-pyt...
51091     /CIToMuMu_Des_Lambda-15_M-500_TuneZ2star_8TeV-...
51090     /DarkMatter_Monophoton_M-500_VectorUNI_TuneZ2s...
118211    /HTMHT/Run2015A-LogErrorMonitor-27Jan2016-v1/USER
Name: dataset, Length: 68122, dtype: object

In [31]:
df_keep.to_csv("keeping_sorted_fall2022.csv")

# Drop list

In [32]:
print( "%30s : %12s | %12s | %12s" % ( "Category", "T0", "T1", "Total" ) )
print(delim)
for category in categories:
    isum_t0 = np.sum( df_drop[ (df_drop["category"] == category) & (df_drop["rse_id"] == "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
    isum_t1 = np.sum( df_drop[ (df_drop["category"] == category) & (df_drop["rse_id"] != "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
    print( "%30s : %12.1f | %12.1f | %12.1f" % ( category, isum_t0, isum_t1, isum_t0 + isum_t1 ) )
print(delim)
sum_t0 = np.sum( df_drop[(df_drop["rse_id"] == "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
sum_t1 = np.sum( df_drop[(df_drop["rse_id"] != "T0_CH_CERN_Tape")]["size_on_tape_[TB]"] )
print( "%30s : %12.1f | %12.1f | %12.1f" % ( "Total", sum_t0, sum_t1, sum_t0 + sum_t1 ) )    

                      Category :           T0 |           T1 |        Total
---------------------------------------------------------------------------
                         Skims :          0.0 |          0.0 |          0.0
       Commissioning + Cosmics :          0.0 |          0.0 |          0.0
                    Cosmics MC :          0.0 |          0.0 |          0.0
                         Run 3 :          0.0 |          0.0 |          0.0
               Run 2 legacy MC :         47.4 |         75.9 |        123.3
             Run 2 legacy data :          0.0 |          0.0 |          0.0
         Run 2 pre-legacy data :        997.0 |       6325.3 |       7322.3
           Run 2 pre-legacy MC :      10189.1 |      25348.1 |      35537.2
                     B-Parking :          0.0 |          0.0 |          0.0
                           HIN :         16.2 |        394.3 |        410.5
                          ALCA :          0.0 |          0.0 |          0.0
            

In [33]:
dropping_sorted = df_drop.sort_values(by="size_on_tape_[TB]", ascending=False)
dropping_sorted_values = dropping_sorted["dataset"]

In [34]:
df_drop.to_csv("dropping_sorted_fall2022.csv")

In [35]:
dropping_sorted

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,dataset,rse_id,DTier,size_on_tape_[TB],category
70247,101845,101845,/PMSSM_set_2_LL_2_TuneCP2_13TeV-pythia8/RunIIF...,T1_RU_JINR_Tape,AODSIM,340.05288,Run 2 pre-legacy MC
7,8,8,/PMSSM_set_2_prompt_2_TuneCP2_13TeV-pythia8/Ru...,T0_CH_CERN_Tape,AODSIM,330.96930,Run 2 pre-legacy MC
25371,35918,35918,/PMSSM_set_2_LL_1_TuneCP2_13TeV-pythia8/RunIIF...,T1_DE_KIT_Tape,AODSIM,314.78499,Run 2 pre-legacy MC
87406,125154,125154,/PMSSM_set_2_prompt_1_TuneCP2_13TeV-pythia8/Ru...,T1_US_FNAL_Tape,AODSIM,308.53333,Run 2 pre-legacy MC
87410,125161,125161,/QCD_Pt-15to7000_TuneCP5_Flat2017_13TeV_pythia...,T1_US_FNAL_Tape,GEN-SIM-RECO,278.70214,Run 2 pre-legacy MC
...,...,...,...,...,...,...,...
87387,125045,125045,/HighMultiplicityEOF/Run2016B-07Aug17_ver1-v1/AOD,T1_UK_RAL_Tape,AOD,0.00000,Run 2 pre-legacy data
87358,124856,124856,/HLTPhysics4/Run2018A-06Jun2018-v1/AOD,T1_UK_RAL_Tape,AOD,0.00000,Run 2 pre-legacy data
87355,124837,124837,/HighMultiplicity85EOF/Run2016B-18Apr2017_ver1...,T1_UK_RAL_Tape,AOD,0.00000,Run 2 pre-legacy data
70169,101252,101252,/HLTPhysics3/Run2018A-06Jun2018-v1/AOD,T1_IT_CNAF_Tape,AOD,0.00000,Run 2 pre-legacy data


# Total to remove from T0

In [36]:
np.sum( dropping_sorted["size_on_tape_[TB]"][dropping_sorted["rse_id"] == "T0_CH_CERN_Tape"] )

11321.754850000001

# Total to remove from T1

In [37]:
np.sum( dropping_sorted["size_on_tape_[TB]"] ) - np.sum( dropping_sorted["size_on_tape_[TB]"][dropping_sorted["rse_id"] == "T0_CH_CERN_Tape"] )

32903.48513

## DPOA Data

In [38]:
vals_dpoa = keeping_sorted["size_on_tape_[TB]"][keeping_sorted["category"] == "DPOA"][keeping_sorted["rse_id"] == "T0_CH_CERN_Tape"]

In [39]:
np.sum( vals_dpoa )

708.48172

## Sanity check of exceptions

List of exceptions should not be in the list to drop. The intersection of these sets should be empty. 

In [40]:
set(exception_list) & set(dropping_sorted_values)

set()