# Identify the latest version of each run

In [15]:
import numpy as np
import baspy as bp
import pandas as pd

In [16]:
### Load the 'cmip5_all_versions' catalogue, restricted to daily 'tas' and 'pr' entries
# NOTE(review): baspy prints "consider setting complete_var_set=True" when more than one Var
# is requested -- confirm whether this dataset already guarantees a complete variable set.
catalogue_filters = dict(dataset='cmip5_all_versions', Var=['tas', 'pr'], Frequency='day')
catlg = bp.catalogue(**catalogue_filters)

Updating cached catalogue...
catalogue memory usage (MB): 76.35629899999999
>> Current cached values (can be extended by specifying additional values or by setting read_everything=True) <<
{'Experiment': ['rcp26', 'rcp85', 'piControl', 'historical', 'rcp45'], 'Frequency': ['mon', 'day']}

More than one Var specified, consider setting complete_var_set=True


In [17]:
# Drop rows whose Version is the literal string 'latest': they cannot be turned into an
# integer and compared with the real version ids in the following cells.
# NOTE(review): it is unclear why these 'latest' Version ids exist in the catalogue -- confirm.
# .copy() gives an independent frame, so the column assignments in later cells do not
# raise SettingWithCopyWarning (or silently write to a temporary object).
catlg = catlg[catlg['Version'] != 'latest'].copy()

# Fail with a clear message instead of an IndexError from .iloc[0] when nothing is left
if catlg.empty:
    raise ValueError("catalogue is empty after removing rows with Version 'latest'")

# Only the first row's dataset name is inspected
if 'cmip5' not in catlg['dataset'].iloc[0]:
    raise ValueError('currently this only works for CMIP5 cataloging')

In [18]:
### convert Version strings into integers (e.g., 'v20110101' --> 20110101)
# astype(str) first makes the cell safe to re-run once Version is already integer
# (the previous lambda called .lstrip on ints and failed on a second run), and the
# vectorised .str accessor replaces the per-element Python lambda.
catlg['Version'] = catlg['Version'].astype(str).str.lstrip('v').astype(int)

In [19]:
### Duplicate 'Version' into 'VersionToSort', a working column that can be adjusted
### for sorting while 'Version' itself keeps the original ids
catlg = catlg.assign(VersionToSort=catlg['Version'])

In [20]:
### tweak version numbers ready to sort
# models for which numeric version numbers are later than date-like version numbers
numeric_ver_later_models = np.array(['ACCESS1-0', 'ACCESS1-3', 'CSIRO-Mk3-6-0'])

# Version ids below this value are treated as plain numeric ids rather than YYYYMMDD dates
first_datelike_version = 2000_00_00
# Added to the numeric ids of the models above so they sort after every date-like id
numeric_version_offset = 1_000_000_000

is_late_numeric = (
    catlg['Model'].isin(numeric_ver_later_models)
    & (catlg['Version'] < first_datelike_version)
)

# Rebuild 'VersionToSort' from the untouched 'Version' column instead of using `+=`:
# the condition is evaluated on 'Version', so an in-place `+=` added the offset again
# every time the cell was re-run.
catlg['VersionToSort'] = catlg['Version'].where(
    ~is_late_numeric, catlg['Version'] + numeric_version_offset
)

pd.unique(catlg['VersionToSort'])

array([  20120910,   20120709,   20130411,   20120719,   20120724,
         20120705,          1,   20110101,   20120503,   20120504,
         20120612,   20120207,          2,   20120410,   20110829,
         20120209,   20120407,   20110901,   20130417,   20120730,
         20130416,   20120907,   20121023,   20120528,   20120514,
         20120518,   20120330,   20120717,   20110701,   20120426,
         20121001,   20110905,   20120702,   20120703,   20111206,
         20120530,   20110629,   20110819, 1000000002, 1000000004,
       1000000001,   20120607,   20120713,   20120319,   20110913,
         20120318,   20130315,   20131219,   20121212,   20131231,
         20130325,   20131126,   20130404,   20130311,   20130212,
         20121114,   20121213,   20121216,   20130108,   20130314,
         20130215,   20130104,   20120609,   20130218,   20120924,
         20120629,   20120516,   20120213,   20110323,   20120526,
         20111119,   20110915,   20110406,   20120114,   20120

In [21]:
### Highest VersionToSort for every Model / Experiment / RunID combination
### (observed=True because, per the original note, there are 'category' columns)
# NOTE(review): nothing here checks that every requested Var is present for the chosen
# version -- confirm whether that is required.
# The column is selected BEFORE aggregating: calling .max() on the whole frame computes the
# maximum of every other column too, which is wasted work and can fail on columns that have
# no ordering.
latest_versions = (
    catlg.groupby(['Model', 'Experiment', 'RunID'], observed=True)['VersionToSort']
    .max()
    .reset_index()
)
latest_versions

Unnamed: 0,Model,Experiment,RunID,VersionToSort
0,bcc-csm1-1-m,rcp26,r1i1p1,20120910
1,bcc-csm1-1-m,historical,r1i1p1,20120709
2,bcc-csm1-1-m,historical,r3i1p1,20120709
3,bcc-csm1-1-m,historical,r2i1p1,20120709
4,bcc-csm1-1-m,rcp85,r1i1p1,20130411
5,bcc-csm1-1-m,rcp45,r1i1p1,20120724
6,bcc-csm1-1-m,piControl,r1i1p1,20120705
7,bcc-csm1-1,rcp26,r1i1p1,20120705
8,bcc-csm1-1,historical,r1i1p1,1
9,bcc-csm1-1,historical,r3i1p1,1


In [22]:
### Keep only the catalogue rows that belong to the latest version of their run
key_columns = ['Model', 'Experiment', 'RunID', 'VersionToSort']


def _unique_run_version_key(frame):
    """Return one 'Model_Experiment_RunID_VersionToSort' string per row of `frame`."""
    # Column-wise concatenation instead of a row-wise apply(): one vectorised operation
    # per column rather than one Python call per catalogue row.
    # astype(str) works on converted copies, so 'VersionToSort' itself stays numeric and
    # the groupby/max cell can be re-run without comparing version ids as text.
    columns_as_text = [frame[column].astype(str) for column in key_columns]
    key = columns_as_text[0]
    for text_column in columns_as_text[1:]:
        key = key + '_' + text_column
    return key


latest_versions['Unique_Model_Run_Version'] = _unique_run_version_key(latest_versions)
catlg['Unique_Model_Run_Version'] = _unique_run_version_key(catlg)

# A row is kept when its key matches one of the per-run latest-version keys
catlg = catlg[catlg['Unique_Model_Run_Version'].isin(latest_versions['Unique_Model_Run_Version'])]

In [23]:
### Clean-up: remove the two helper columns that were only needed to select the latest versions
helper_columns = ['Unique_Model_Run_Version', 'VersionToSort']
catlg = catlg.drop(columns=helper_columns)

# Work out which Versions have been completely superseded by a newer version

**TODO:** not yet implemented.