# CAMMAC https://cammac.readthedocs.io
S.Sénési for Météo-France - sept 2019 to march 2021

# Checking local data vs ESGF published data (using a data_versions dict and its ancillary dicts)

In [None]:
from IPython.core.display import display, HTML, Image
display(HTML("<style>.container { width:100% !important; }</style>"))
import requests  # use pip or conda to install it if needed
import json
from climaf import period

In [None]:
def jrequest(q,node="esgf-data.dkrz.de") :
    """Send the search query string `q` to an ESGF index node and return the decoded JSON reply.

    `q` is the already-assembled part of the URL that follows '?'; the Solr/JSON
    format selector is appended here.
    `node` is the host name of the index node to query; hosts that were also
    noted for this purpose: esgf-node.ipsl.upmc.fr, esgf-node.jpl.nasa.gov
    """
    json_format = "&format=application%2Fsolr%2Bjson"
    endpoint = "http://%s/esg-search/search" % node
    url = "%s?%s%s" % (endpoint, q, json_format)
    response = requests.get(url)
    return response.json()

## List all models which run an experiment

In [None]:
def esgf_models_for_experiment(variable,table,experiment="piControl"):
    """Return the sorted list of distinct model names (source_id) for which the
    index answers a search on the given experiment, variable and table.

    The search is distributed (distrib=true) and limited to 10000 records.
    """
    query = ('distrib=true&experiment_id=%s&variable=%s&table_id=%s'
             '&limit=10000&fields=source_id' % (experiment, variable, table))
    answer = jrequest(query)
    # Each record carries source_id as a list; its first element is the model name
    names = set(doc['source_id'][0] for doc in answer['response']['docs'])
    return sorted(names)

## Returns published_period(s), for a model, an experiment and a variable

In [None]:
def published_period(model,experiment,variable,table) :
    """Gather the periods covered by the files published for a model, an
    experiment, a variable and a table.

    Queries the index for the latest 'File' records (distributed search, at
    most 10000 records) and groups file periods by (realization, version),
    both taken from the dot-separated instance_id (fields 5 and 9).
    The period of a file is the last '_'-separated token of its title, with
    '.nc' removed.

    Returns an empty list when the search returns no record; otherwise a dict
    keyed by (realization, version) whose values are what
    period.merge_periods returns for the (sorted) file periods.
    """
    query = {'distrib'     : 'true',
             'limit'       : '10000',
             'type'        : 'File',
             'fields'      : 'title,instance_id',
             'experiment_id':experiment,
             'source_id'   : model,
             'variable'    : variable,
             'table_id'    : table,
             'latest'      : 'true',
             }
    # Assemble the key=value&key=value request string
    request = "&".join("%s=%s" % (key, query[key]) for key in query)
    docs = jrequest(request)['response']['docs']
    if not docs :
        return ([])
    #
    # Collect the set of file periods for each (realization, version), across all files and shards
    file_periods = dict()
    for doc in docs :
        id_fields = doc['instance_id'].encode('ascii').split(".")
        key = (id_fields[5], id_fields[9])
        fname = doc['title'].encode('ascii')
        span = fname.split("_")[-1].replace(".nc", "")
        file_periods.setdefault(key, set()).add(span)
    #
    # Replace each set by its merged periods
    for key in file_periods :
        # Issue with some models : only the first 900 (sorted) file periods are kept
        spans = sorted(file_periods[key])[0:900]
        file_periods[key] = period.merge_periods([period.init_period(s) for s in spans],
                                                 handle_360_days_year=True)
    return file_periods

## Check the length of published periods for a variable, an experiment and all models (for all versions/grids)

In [None]:
def experiment_length_check(variable,table,experiment="piControl") :
    """Classify the periods published on ESGF, for all models, as long enough or not.

    The minimum length (in years) depends on the experiment : 500 for
    "piControl", 165 for "historical" and 86 for any experiment whose name
    begins with "ssp".

    Args:
        variable, table: variable name and table name passed to the ESGF queries.
        experiment: experiment name.

    Returns:
        (models_ok, models_nok) : two dicts keyed by model name. Each value is
        a list of [pair, period_string] entries, where pair is the
        (realization, version) key returned by published_period and
        period_string is str() of one published period. A period goes to
        models_nok when end.year - start.year + 1 is below the minimum length.

    Raises:
        ValueError: if a period has to be checked for an experiment which has
            no known minimum length (formerly an UnboundLocalError).
    """
    if experiment == "piControl" :
        length = 500
    elif experiment == "historical" :
        length = 165
    elif experiment[0:3] == "ssp" :
        length = 86
    else :
        length = None
    #
    models_ok = dict()
    models_nok = dict()
    for model in esgf_models_for_experiment(variable, table, experiment) :
        periods = published_period(model, experiment, variable, table)
        for pair in periods :
            for aperiod in periods[pair] :
                # Raised only when there is actually something to check, so that
                # a call which finds no published data still returns empty dicts
                if length is None :
                    raise ValueError(
                        "No minimum length is defined for experiment %r" % (experiment,))
                nyears = aperiod.end.year - aperiod.start.year + 1
                target = models_nok if nyears < length else models_ok
                target.setdefault(model, []).append([pair, str(aperiod)])
    return models_ok,models_nok

## Check, for one experiment, whether some published data are either incomplete on ESGF, or missing in a data_versions directory (built from file system content on the local machine)

In [None]:
def check_versions_dict(experiment,versions_tag,var_tables=None): 
    # Print a report comparing, for one experiment, the data published on ESGF
    # (as classified by experiment_length_check) with the content of the
    # data_versions dict tagged `versions_tag`.
    #
    # Three JSON files are read, using relative paths :
    #   Data_versions_selection_<versions_tag>.json          -> versions_dic
    #   Data_versions_selection_<versions_tag>_holes.json    -> holes
    #   Data_versions_selection_<versions_tag>_resolve.json  -> coverage_issues
    #
    # var_tables : when not None, only the (variable, table) tuples it contains
    #   are checked; otherwise all variables/tables of versions_dic[experiment]
    #
    # Returns (oks, noks) : two dicts keyed by the string "variable,table", whose
    #   values are the two dicts returned by experiment_length_check
    #
    # NOTE : this is Python 2 code (print statements, indexable dict.keys())
    print 90*"_"
    print
    print "Data published for %s vs data on local machine according to data_versions dict %s"%(experiment,versions_tag)
    print 90*"_"
    oks=dict()
    noks=dict()
    # model -> list of "variable,table" keys which are OK on ESGF but for which
    # the model is absent from versions_dic
    models_not_recorded=dict()
    # NOTE(review): models_with_holes is never used further down in this function
    models_with_holes=dict()
    with open("Data_versions_selection_%s.json"%versions_tag,"r") as f :
        versions_dic=json.load(f)
    with open("Data_versions_selection_%s_holes.json"%versions_tag,"r") as f :
        holes=json.load(f)
    with open("Data_versions_selection_%s_resolve.json"%versions_tag,"r") as f :
        coverage_issues=json.load(f)
    for variable in versions_dic[experiment]:
        for table in versions_dic[experiment][variable] :
            # String key, used for oks/noks and for the final summary
            pair=variable+","+table
            if var_tables is None or (variable,table) in var_tables :
                oks[pair],noks[pair]=experiment_length_check(variable,table,experiment)
                print "\n\n",experiment, variable, table," : ", len(oks[pair])," models have published data OK, ", len(noks[pair])," models have NOK published data",
                # Number of models which are OK on ESGF but have no entry in versions_dic
                count=len([m for m in oks[pair] if m not in versions_dic[experiment][variable][table]])
                print " and %d models have issue on local machine\n"%count
                # First list : models with too short published periods
                for model in noks[pair] :
                    count=0
                    print "\t%-10s %-5s %-20s has NOK published data"%(variable, table, model), 
                    for i in noks[pair][model] :
                        count+=1
                        # i is a [ (realization, version), period string ] entry
                        (r,v),p=i
                        # Only the first four entries are detailed; each further one shows as a dot
                        if count < 5 :
                            print r,p,", ",
                        else:
                            print ".",
                    print
                print
                # Second list : models OK on ESGF but not recorded in versions_dic
                for model in oks[pair] :
                    if model not in versions_dic[experiment][variable][table]:
                        # Only the first OK entry of the model is used for the diagnosis
                        (r,v),p=oks[pair][model][0]
                        # Check if it is identified as having holes in versions dic companion
                        # Any exception in a lookup (e.g. a missing key) means "not
                        # in that companion dict", and the next diagnosis is tried
                        try :
                            h=holes[experiment][variable][table][model]
                            # First key of h (Python 2 only) - presumably a grid label, TODO confirm
                            reals=h[h.keys()[0]]
                            version=reals[r]
                            print "\t%-10s %-5s %-20s has holes on local machine"%(variable, table, model), oks[pair][model][0]
                        except :
                            try : 
                                h=coverage_issues[experiment][variable][table][model]
                                reals=h[h.keys()[0]]
                                # NOTE(review): v is used twice as a key here - confirm against
                                # the structure of the _resolve.json file
                                cov_issue=reals[r][v][v]
                                print "\t%-10s %-5s %-20s  has incomplete_coverage on local machine"%(variable, table, model), oks[pair][model][0]," vs. ",cov_issue
                            except:
                                print "\t%-10s %-5s %-20s is missing on local machine"%(variable, table, model), oks[pair][model]
                        # Recorded whatever the diagnosis printed above
                        if model not in models_not_recorded :
                            models_not_recorded[model]=[]
                        models_not_recorded[model].append(pair)
    #
    print "\n\nSummary of variable(s) with issues for ",experiment
    for model in models_not_recorded :
        print "\t%-20s"%model, 
        for variable_table in models_not_recorded[model]:
            # Keys containing "day" are printed in full ("variable,table"), others as the variable name only
            if "day" in variable_table : print variable_table,
            else: print variable_table.split(",")[0],
        print
    return oks,noks

In [None]:
# Example of a check restricted to one (variable, table) pair and one
# experiment; disabled on purpose (switch the condition to True to run it)
if False :
    pairs = [("pr", "Amon")]
    experiment, versions_tag = "piControl", "20200720"
    a = check_versions_dict(experiment, versions_tag, pairs)

## List all kinds of consistency issues between ESGF and local data according to a data_versions dict

In [None]:
def check_hydro_variables_all_experiments(versions_tag="2020918",date_label="20200918"):
    """Run check_versions_dict for a fixed list of variables and experiments.

    For each experiment, the (oks, noks) result is also dumped to the JSON file
    esgf_vs_local_data_<experiment>_<date_label>_<versions_tag>.json

    Args:
        versions_tag: tag of the data_versions dict, passed to check_versions_dict.
            NOTE(review): the default "2020918" looks like a typo for
            "20200918" - to be confirmed before changing it.
        date_label: label used only in the name of the output files.

    Returns:
        a dict : experiment name -> (oks, noks) as returned by check_versions_dict
    """
    hydro_pairs = [
        ("tas", "Amon"), ("pr", "Amon"), ("evspsbl", "Amon"), ("prw", "Amon"),
        ("mrro", "Lmon"), ("mrso", "Lmon"), ("pr", "day"),
    ]
    results = dict()
    for experiment in ["piControl", "historical", "ssp119", "ssp126", "ssp245", "ssp585"] :
        outcome = check_versions_dict(experiment, versions_tag, hydro_pairs)
        results[experiment] = outcome
        out_name = "esgf_vs_local_data_%s_%s_%s.json" % (experiment, date_label, versions_tag)
        with open(out_name, "w") as out :
            json.dump(outcome, out, separators=(',', ': '), indent=3, ensure_ascii=True)
    return results

In [None]:
# Check all experiments against the data_versions dict tagged "20200719"; output JSON files are labelled "test"
on=check_hydro_variables_all_experiments("20200719","test")

In [None]:
# Check all experiments against the data_versions dict tagged "20200913"; output JSON files carry the same label
on=check_hydro_variables_all_experiments("20200913","20200913")

In [None]:
# Check all experiments against the data_versions dict tagged "20200918"; output JSON files carry the same label
on=check_hydro_variables_all_experiments("20200918","20200918")

## Summarizing missing/incomplete published variables per model for each experiment (according to ESGF)

In [None]:
def esgf_missing_variables(oksnoks):
    """Print, for each experiment, the models having NOK published variables.

    Args:
        oksnoks: either a dict : experiment name -> (oks, noks), such as
            returned by check_hydro_variables_all_experiments, or a single
            (oks, noks) pair (a tuple, or a list as obtained when reloading
            the JSON dump), which is then reported as experiment "unknown".
            Only noks is used : it maps a variable/table key to an iterable
            of model names. Keys can be "variable,table" strings (as built by
            check_versions_dict) or (variable, table) tuples.

    For each experiment, one line per model (sorted by name) lists its NOK
    variables; the table is appended ("<variable>_day") only for table "day".

    Prints use a single argument, which is valid for both Python 2 and 3.
    """
    print("Missing or uncomplete variables on the ESGF (among those scrutinized locally)")
    if isinstance(oksnoks, (tuple, list)) :
        oksnoks = {"unknown" : oksnoks}

    for experiment in oksnoks :
        print(experiment)
        _oks, noks = oksnoks[experiment]
        #
        # Invert noks : model -> list of variable/table keys
        mnoks = dict()
        for pair in noks :
            for model in noks[pair] :
                mnoks.setdefault(model, []).append(pair)
        #
        for model in sorted(mnoks) :
            labels = []
            for pair in mnoks[model] :
                if isinstance(pair, (tuple, list)) :
                    var, tab = pair
                else :
                    # "variable,table" string key
                    var, _, tab = pair.partition(",")
                if tab != "day" :
                    labels.append(var)
                else :
                    labels.append("%s_%s" % (var, tab))
            # Same layout as successive Python 2 'print x,' statements
            print("%-20s  %s" % (model, " ".join(labels)))
        

In [None]:
#esgf_missing_variables(on)

In [None]:
# Summary, for one experiment, of the models which have missing (NOK) published
# variables, based on dicts `oks` and `noks`.
# NOTE(review): `oks` and `noks` are not defined in this cell; they are
# presumably unpacked beforehand from one entry of `on`, and must be keyed by
# (variable, table) tuples for the lookups below to work - to be confirmed.
# Prints use a single argument, which is valid for both Python 2 and 3.
variables=[("tas","Amon"),("pr","Amon"),("evspsbl","Amon"),("prw","Amon"),("mrro","Lmon"),("mrso","Lmon"),("pr","day")] 


def _print_models_missing_vars(title, pairs, ok_by_pair, nok_by_pair, skipped_models=()):
    """Print, for each (variable, table) of `pairs`, the models which have NOK
    entries and no OK entry, with the realization, version and period of each
    NOK entry. Models listed in `skipped_models` are left out."""
    print("\n%s" % title)
    for pair in pairs:
        print("\n%-10s %4s" % pair)
        for model in nok_by_pair[pair]:
            if model in ok_by_pair[pair] or model in skipped_models:
                continue
            # The loop variable is deliberately not called 'period', which is
            # the name of the module imported from climaf
            details = "".join(" %s %s %s |" % (real, version, aperiod)
                              for (real, version), aperiod in nok_by_pair[pair][model])
            print("\t%-25s%s" % (model, details))


# Models having at least one of the variables missing
models_having_one_missing_var=set()
for pair in variables:
    models_having_one_missing_var.update(noks[pair])

# Listing models which have all vars missing, i.e. which are OK for no variable at all
models_having_all_vars_missing=set(
    m for m in models_having_one_missing_var
    if not any(m in oks[p] for p in oks))

print("Models having all vars missing")
for m in models_having_all_vars_missing:
    print("\t%s" % m)

_print_models_missing_vars("Delta list of models having some vars missing",
                           variables, oks, noks, models_having_all_vars_missing)

_print_models_missing_vars("Full list of models having some vars missing",
                           variables, oks, noks)

# All models known for the experiment, either OK or NOK
all_models=set()
for p in oks : 
    all_models.update(oks[p])
for p in noks : 
    all_models.update(noks[p])

    


In [None]:
def check_if_version_is_last_published(model,experiment,variable,table,grid,real,version,distrib="true") :
    """Compare a dataset version with the version(s) flagged as latest in the index.

    Searches 'Dataset' records for the given model, experiment, variable, table
    and variant label `real`. `version` may begin with 'v', which is dropped
    before the comparison. `grid` is not used by the search. `distrib` is
    passed as is to the search ("true" for a distributed search).

    Returns:
        None if the search returns no record;
        True if no record shows a version number higher than `version`;
        otherwise the published version string of the last record found with
        a higher version number.
    """
    criteria = {'distrib'     : distrib,
                'limit'       : '10000',
                'type'        : 'Dataset',
                'fields'      : 'version',
                'experiment_id':experiment,
                'source_id'   : model,
                'variable'    : variable,
                'table_id'    : table,
                'variant_label': real,
                'latest'     : 'true'
                }
    # Assemble the key=value&key=value request string
    query = "&".join("%s=%s" % (name, criteria[name]) for name in criteria)
    docs = jrequest(query)['response']['docs']
    if not docs :
        return None
    #
    verdict = True
    for doc in docs :
        published = doc['version'].encode('ascii')
        if version[0] == 'v' :
            version = version[1:]
        if version == published :
            continue
        # A record showing an older version is just out-dated index info : ignore it
        if int(version) < int(published) :
            verdict = published
    return verdict
