# Use built-in pandas tools to filter the DataFrame catalogue
- rather than explicit Python loops

In [1]:
import baspy as bp
import pandas as pd

In [2]:
# Load the CMIP5 catalogue through baspy. No user filter values are passed,
# so (per the message printed below) the cached catalogue is filtered using
# its default values.
catlg = bp.catalogue(dataset='cmip5')

Updating cached catalogue...
>> Current cached values (can be extended by specifying additional values or by setting read_everything=True) <<
{'Experiment': ['rcp26', 'piControl', 'rcp85', 'historical', 'rcp45'], 'Frequency': ['mon']}

No user values defined, will therefore filter catalogue using default values


In [61]:
# Build one identifier per model-run-version by joining the 'Model', 'RunID'
# and 'Version' columns with '-' (e.g. 'bcc-csm1-1-m-r1i1p1-v20120910').
catlg['Model-RunID-Version'] = catlg['Model'].str.cat(
    [catlg['RunID'], catlg['Version']], sep='-'
)

In [62]:
# Filter specification applied to the original catalogue:
# column name -> list of accepted values for that column.
filt_dict = dict(
    Var=['bmelt', 'evap'],
    Experiment=['rcp26'],
)

In [63]:
# Number of columns being filtered on, and number of variables requested.
nVars = len(filt_dict['Var'])
nkeys = len(filt_dict)

In [64]:
# Preview the first three catalogue rows.
catlg.head(3)

Unnamed: 0,Centre,Model,Experiment,Frequency,SubModel,CMOR,RunID,Version,Var,StartDate,EndDate,Path,DataFiles,dataset,Model-RunID-Version
248,BCC,bcc-csm1-1-m,rcp26,mon,seaIce,OImon,r1i1p1,v20120910,bmelt,200601,210012,/BCC/bcc-csm1-1-m/rcp26/mon/seaIce/OImon/r1i1p...,bmelt_OImon_bcc-csm1-1-m_rcp26_r1i1p1_200601-2...,cmip5,bcc-csm1-1-m-r1i1p1-v20120910
249,BCC,bcc-csm1-1-m,rcp26,mon,seaIce,OImon,r1i1p1,v20120910,evap,200601,210012,/BCC/bcc-csm1-1-m/rcp26/mon/seaIce/OImon/r1i1p...,evap_OImon_bcc-csm1-1-m_rcp26_r1i1p1_200601-21...,cmip5,bcc-csm1-1-m-r1i1p1-v20120910
250,BCC,bcc-csm1-1-m,rcp26,mon,seaIce,OImon,r1i1p1,v20120910,ialb,200601,210012,/BCC/bcc-csm1-1-m/rcp26/mon/seaIce/OImon/r1i1p...,ialb_OImon_bcc-csm1-1-m_rcp26_r1i1p1_200601-21...,cmip5,bcc-csm1-1-m-r1i1p1-v20120910


In [65]:
# DataFrame.isin with a dict flags, per column named in the dict, whether each
# cell holds one of the accepted values (columns not in the dict are all False).
# The rows we want are those where EVERY filtered column is True.
catlg.isin(filt_dict).head(3)

Unnamed: 0,Centre,Model,Experiment,Frequency,SubModel,CMOR,RunID,Version,Var,StartDate,EndDate,Path,DataFiles,dataset,Model-RunID-Version
248,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
249,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
250,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


In [81]:
# Boolean mask of cells that satisfy the filter specification.
a = catlg.isin(filt_dict)
# Only the filtered columns can be True, so a row matches every filter key
# exactly when its number of True cells equals the number of keys.
n_matched = a.sum(axis=1)
my_catlg = catlg.loc[n_matched == nkeys]
my_catlg.head(3)

Unnamed: 0,Centre,Model,Experiment,Frequency,SubModel,CMOR,RunID,Version,Var,StartDate,EndDate,Path,DataFiles,dataset,Model-RunID-Version
248,BCC,bcc-csm1-1-m,rcp26,mon,seaIce,OImon,r1i1p1,v20120910,bmelt,200601,210012,/BCC/bcc-csm1-1-m/rcp26/mon/seaIce/OImon/r1i1p...,bmelt_OImon_bcc-csm1-1-m_rcp26_r1i1p1_200601-2...,cmip5,bcc-csm1-1-m-r1i1p1-v20120910
249,BCC,bcc-csm1-1-m,rcp26,mon,seaIce,OImon,r1i1p1,v20120910,evap,200601,210012,/BCC/bcc-csm1-1-m/rcp26/mon/seaIce/OImon/r1i1p...,evap_OImon_bcc-csm1-1-m_rcp26_r1i1p1_200601-21...,cmip5,bcc-csm1-1-m-r1i1p1-v20120910
28823,BCC,bcc-csm1-1,rcp26,mon,seaIce,OImon,r1i1p1,v20120705,bmelt,200601,230012,/BCC/bcc-csm1-1/rcp26/mon/seaIce/OImon/r1i1p1/...,bmelt_OImon_bcc-csm1-1_rcp26_r1i1p1_200601-209...,cmip5,bcc-csm1-1-r1i1p1-v20120705


# Ensure we have a complete set of variables for each Model-Run-Version
- only needed when more than one Var is requested

In [67]:
# Number of DISTINCT requested variables present in each model-run-version.
# Counting unique 'Var' values (rather than rows, as count().max(axis=1) did)
# means a variable listed more than once for the same run -- e.g. if it were
# catalogued under more than one SubModel/CMOR table -- is only counted once,
# so the comparison against nVars really tests "has every requested variable".
my_catlg_gp = my_catlg.groupby('Model-RunID-Version')['Var'].nunique()
my_catlg_gp

Model-RunID-Version
BNU-ESM-r1i1p1-v20120503           2
CCSM4-r1i1p1-v20121031             2
CCSM4-r2i1p1-v20120411             2
CCSM4-r3i1p1-v20120411             2
CCSM4-r4i1p1-v20120411             2
CCSM4-r5i1p1-v20120411             2
CCSM4-r6i1p1-v20120411             2
CESM1-CAM5-r1i1p1-v20130302        2
CESM1-CAM5-r2i1p1-v20130302        2
CESM1-CAM5-r3i1p1-v20130302        2
CESM1-WACCM-r2i1p1-v20130314       2
CESM1-WACCM-r3i1p1-v20130314       2
CESM1-WACCM-r4i1p1-v20130314       2
CNRM-CM5-r1i1p1-v20130101          2
FGOALS-g2-r1i1p1-v2                2
FIO-ESM-r1i1p1-v20120522           2
FIO-ESM-r2i1p1-v20120522           2
FIO-ESM-r3i1p1-v20120522           2
GISS-E2-H-r1i1p1-v20120810         1
GISS-E2-H-r1i1p2-v20120822         1
GISS-E2-H-r1i1p3-v20120906         1
GISS-E2-R-r1i1p1-v20130731         1
GISS-E2-R-r1i1p2-v20121016         1
GISS-E2-R-r1i1p3-v20121016         1
IPSL-CM5A-LR-r1i1p1-v20120114      2
IPSL-CM5A-LR-r2i1p1-v20120114      2
IPSL-CM5A-LR-r3i1p

In [70]:
# Keep only the model-run-versions whose variable count equals the number of
# variables requested, i.e. those with a complete set.
my_catlg_gp = my_catlg_gp.loc[my_catlg_gp.eq(nVars)]
my_catlg_gp

Model-RunID-Version
BNU-ESM-r1i1p1-v20120503         2
CCSM4-r1i1p1-v20121031           2
CCSM4-r2i1p1-v20120411           2
CCSM4-r3i1p1-v20120411           2
CCSM4-r4i1p1-v20120411           2
CCSM4-r5i1p1-v20120411           2
CCSM4-r6i1p1-v20120411           2
CESM1-CAM5-r1i1p1-v20130302      2
CESM1-CAM5-r2i1p1-v20130302      2
CESM1-CAM5-r3i1p1-v20130302      2
CESM1-WACCM-r2i1p1-v20130314     2
CESM1-WACCM-r3i1p1-v20130314     2
CESM1-WACCM-r4i1p1-v20130314     2
CNRM-CM5-r1i1p1-v20130101        2
FGOALS-g2-r1i1p1-v2              2
FIO-ESM-r1i1p1-v20120522         2
FIO-ESM-r2i1p1-v20120522         2
FIO-ESM-r3i1p1-v20120522         2
IPSL-CM5A-LR-r1i1p1-v20120114    2
IPSL-CM5A-LR-r2i1p1-v20120114    2
IPSL-CM5A-LR-r3i1p1-v20111119    2
IPSL-CM5A-LR-r4i1p1-v20120804    2
IPSL-CM5A-MR-r1i1p1-v20111119    2
MIROC5-r1i1p1-v20120710          2
MIROC5-r2i1p1-v20120710          2
MIROC5-r3i1p1-v20120710          2
MRI-CGCM3-r1i1p1-v20120701       2
NorESM1-M-r1i1p1-v20120227       2


In [79]:
# Select every catalogue row belonging to a model-run-version that has the
# complete set of requested variables.
# NOTE(review): this selects from the full `catlg`, not the filtered
# `my_catlg`, so rows for other Vars/Experiments of those runs are included --
# confirm that is intended.
complete_ids = my_catlg_gp.index
complete_catlg = catlg[catlg['Model-RunID-Version'].isin(complete_ids)]

In [80]:
# Row counts: full catalogue vs. rows from runs with a complete variable set.
catlg.shape[0], complete_catlg.shape[0]

(77784, 9051)