# List ESGF latest version for a series of variables and check them on /bdd/CMIP6

##  Define lists of interesting variables and experiments

In [None]:
# CMIP6 experiments to scan
experiments=[
    "piControl",
    "historical",
    "ssp126",
    "ssp245",
    "ssp585",
    "ssp119",
    ]
# Shorter alternative : experiments=["ssp126"]

# Variables to scan, listed per CMIP6 table
variables={
    "Amon": ["pr","tas","prw","evspsbl"],
    "Lmon": ["mrro","mrso","mrsos"],
    "Omon": ["sos"],
    "day" : ["pr"],
    }

# Directory appended to sys.path for importing CAMMAClib
CAMMAC="/home/ssenesi/CAMMAC"

# Default ESGF node used by jrequest()
node="esgf-data.dkrz.de"

# When True, the next cell restricts experiments and variables to a small test case
do_test=True

# Filled by list_datasets() : institution_id per model, activity_id per experiment
institutions={}
activities={}

In [None]:
if do_test :
    # Reduced test case : a single experiment and a single variable.
    # Other experiments that can be used here : ["historical"], ["piControl"]
    variables={"Amon": ["pr"]}
    experiments=["ssp245"]

In [None]:
from IPython.core.display import display, HTML, Image
# Inject a CSS rule so that notebook cells use the full width of the page
full_width_css=HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [None]:
import requests  # use pip or conda to install it if needed
import json
import glob
import re
import os
import sys  # needed below for sys.path.append (was used without being imported)

# Directory where list_datasets() records the date of negative checks
if not os.path.exists("timestamps"):
    os.mkdir("timestamps")
from climaf import period
# Make CAMMAClib importable
sys.path.append(CAMMAC    ) 
from CAMMAClib.ancillary   import feed_dic, amail

## A basic function for querying the ESGF

In [None]:
def jrequest(q,node=node) :
    """Send query string `q` to the ESGF search service of `node` and return the decoded JSON answer.

    `q` is the already assembled 'key=value&key=value...' part of the URL; the
    parameter requesting a JSON answer is appended here.
    `node` defaults to the module-level value of `node` at definition time.
    """
    # Other nodes which can be used :
    #   "esgf-node.ipsl.upmc.fr"   "esgf-data.dkrz.de"   "esgf-node.jpl.nasa.gov"
    json_format="&format=application%2Fsolr%2Bjson"
    url="http://%s/esg-search/search?%s%s"%(node,q,json_format)
    answer=requests.get(url)
    return answer.json()

In [None]:
def published_period(model,experiment,variable,table,variant=None,version=None,no_published_files=[]) :
    """Query the ESGF for the files of a model/experiment/variable/table and return the periods they cover.

    The query (sent through jrequest) asks for latest, non-replica File records.
    The period of each file is the last '_'-separated field of its title,
    without '.nc'. Periods are gathered per pair (variant_label, version) and
    processed by period.merge_periods (presumably merging contiguous periods).

    Returns :
      - an empty list if the ESGF returns no file record
      - if `variant` is None : a dict of merged periods, keyed by (variant_label, version)
      - otherwise : the merged periods for the requested variant, or an empty
        list (after printing a message) if there is none

    `no_published_files` is not used.

    NOTE(review): the original code re-binds `version` inside the loop over file
    records, so the final lookup uses the version of the LAST file record and
    not the `version` argument. This behaviour is kept as is here (see
    `doc_version`); please confirm whether the argument should be used instead.
    """
    query={'distrib'     : 'true',
           'limit'       : '10000',
           'type'        : 'File',
           'fields'      : 'title,instance_id,variant_label,version',
           'experiment_id':experiment,
           'source_id'   : model,
           'variable'    : variable,
           'table_id'    : table,
           'latest'      : 'true',
           'replica'     : 'false',
           #'id'     : "*"+version+"*",
           }
    #
    # Form request string
    reqs="&".join([ "%s=%s"%(key,query[key]) for key in query ])
    #
    docs=jrequest(reqs)['response']['docs']
    if not docs :
        return ([])
    #
    # Analyze response to gather periods among all files and shards
    periods=dict()
    for doc in docs :
        instance_id=doc['instance_id'].encode('ascii')  # read, but not used below
        doc_variant=doc['variant_label'][0].encode('ascii')
        doc_version=doc['version'][0].encode('ascii')
        filename=doc['title'].encode('ascii')
        file_period=filename.split("_")[-1].replace(".nc","")
        periods.setdefault((doc_variant,doc_version),set()).add(file_period)
    #
    # Replace each set of period strings by the merged periods
    for pair in periods :
        sorted_periods=sorted(periods[pair])[0:900] # Issue with some models ...
        periods[pair]=period.merge_periods([ period.init_period(p) for p in sorted_periods ],handle_360_days_year=True)
    #
    if variant is None :
        return periods
    # `doc_version` is the version of the last file record (see NOTE in docstring)
    if (variant,doc_version) in periods :
        return periods[(variant,doc_version)]
    print("no file for dataset %s %s %s %s %s %s "%(model,experiment,table,variable,variant,doc_version))
    print(periods.keys())
    return []
        

In [None]:
def check_period_length(experiment,filenames=(),periods=None):
    """Tell whether a dataset covers enough years for `experiment`.

    Arguments :
      experiment : one of the keys of `lengths` below (KeyError otherwise)
      filenames  : data file names, used only when `periods` is None; the period
                   of a file is the last '_'-separated field of its name,
                   without '.nc'
      periods    : sequence of period objects with attributes start.year and
                   end.year; when None, it is computed from `filenames` using
                   period.init_period and period.merge_periods

    Returns True if the FIRST period of `periods` spans at least the minimum
    number of years required for the experiment. Returns False otherwise, and
    also when there is neither file nor period.
    """
    # Minimum number of years required, per experiment
    lengths={ "piControl" :200, "historical":165, "ssp119":86, "ssp126":86, "ssp245":86, "ssp585":86 }
    if periods is None :
        if len(filenames)==0 :
            return False
        file_periods=sorted(set(filename.split("_")[-1].replace(".nc","") for filename in filenames))
        file_periods=file_periods[0:900] # Issue with some models ...
        periods=period.merge_periods([ period.init_period(p) for p in file_periods ],handle_360_days_year=True)
    #
    try :
        aperiod=periods[0]
    except (IndexError,KeyError,TypeError) :
        # No period at all (e.g. empty list)
        return False
    length = aperiod.end.year - aperiod.start.year +1
    return length >= lengths[experiment]

## Function for comparing ESGF content and /bdd content for a variable + experiment
### It queries the ESGF using replica=false and distrib=true
### It returns two lists : one for ESGF datasets which are present on /bdd, and one for those which are missing 
### The first list is also organized as a multi-level dict with variant and version as last key levels

In [None]:
def list_datasets(experiment,variable,table,distrib="true",limit=10000, already_checked_OK=[],published_NOK=[],published_OK=[]) :
    #
    ok=[]
    nok=[]
    dok=dict()
    #
    # Form ESGF API request string
    dic={'distrib'     : distrib,
         'limit'       : str(limit),
         'type'        : 'Dataset',
         'fields'      : 'activity_id,institution_id,source_id,variant_label,grid_label,version,instance_id',
         'experiment_id':experiment,
         'variable'    : variable,
         'table_id'    : table,
         'latest'     : 'true',
         'replica'     : 'false'
         }
    reqs=""
    for k in dic : reqs+="%s=%s&"%(k,dic[k])
    reqs=reqs[0:-1]
    #
    #print reqs
    req=jrequest(reqs)
    docs=req['response']['docs']
    if len(docs)==0 : 
        return ok,dok,nok
    #
    #
    count_missing=0
    count_ok=0
    no_published_files=[]
    for e in docs :
        model=e['source_id'][0].encode('ascii')
        variant=e['variant_label'][0].encode('ascii')
        grid=e['grid_label'][0].encode('ascii')
        version=e['version'].encode('ascii')
        instance_id=e['instance_id']#.encode('ascii')
        institution_id=e['institution_id'][0].encode('ascii')
        activity_id=e['activity_id'][0].encode('ascii')
        nuple=(activity_id,institution_id,model,experiment,variant,table,variable,grid,version)
        activities[experiment]=activity_id
        institutions[model]=institution_id
        #
        if list(nuple) not in published_OK :
            if list(nuple) not in published_NOK:
                pub_period=published_period(model,experiment,variable,table,variant,version,no_published_files)
                pub_ok=check_period_length(experiment,periods=pub_period)
                if pub_ok :
                    published_OK.append(list(nuple))
                else :
                    published_NOK.append(list(nuple))
            else :
                pub_ok = False
        else: 
            pub_ok = True
        #
        if list(nuple) in already_checked_OK :
            #print "already checked:", nuple
            check = "already_checked" 
        else :
            if list(nuple) not in published_NOK :
                d="/bdd/CMIP6/%s/%s/%s/%s/%s/%s/%s/%s/v%s"%nuple
                # Check the 
                files=glob.glob(d+"/*nc")
                
                # Check if the data directory has already been negatively checked, and 
                # this later than its last change
                timestamp_filename="timestamps/%s.%s.%s.%s.%s.%s.%s.%s.v%s"%nuple
                if os.path.exists(timestamp_filename) :
                    if not os.path.exists(d) :
                        check = "nok"
                    elif os.path.getmtime(timestamp_filename) < os.path.getmtime(d) :
                        # Dir has changed since last check
                        os.remove(timestamp_filename)
                        check = "todo_again"
                    else : 
                        check = "nok"
                else : 
                    #if len(files) == 0 : 
                    #    check = "nok"
                    #else:
                    # Want to create a timestamp file even if data dir does not exist
                    check = "todo"
                #
                if "todo" in check :
                    # Check that the data files do cover the experiment time extent
                    #print "checking files for",nuple
                    print ".",
                    if check_period_length(experiment, filenames=files) :
                        check = "ok"
                    elif pub_ok :
                      # Published period si OK while files period is NOK
                      check = "nok"
                      # Create a file which modification time tells the date of this negative check
                      with open(timestamp_filename,"w") as f : f.write("")
                    else :
                        check = "no_use"
            else :
                check = "no_use"
            
        if check== "nok" :
            #check=glob.glob(os.path.dirname(d))
            #print "%10s"%"missing ",d, check, instance_id
            count_missing+=1
            nok.append(nuple)
            #if model=="IPSL-CM6A-LR" :
            #    print d
        elif check in [ "ok", "already_checked" ]:
            count_ok+=1
            ok.append(nuple)
            feed_dic(dok,None,activity_id,institution_id,model,experiment,table,variable,grid,variant,version)
            if check != "already_checked" :
                already_checked_OK.append(list(nuple))
            #print "%10s"%"ok ",d
        elif check not in  "no_use" :
            raise ValueError("Logic error %s",check)
                
    #nopub=dict()
    #for e in no_published_files :
    #    model,experiment,variable,table,variant,version = e
    #    feed_dic(nopub,1,model,use_count=True)
    #print "Datasets without published files"
    #for model in nopub :
    #    print model,nopub[model]
            
    return ok,dok,nok

## Analyze outputs of function above, including counting those datasets which are missing on /bdd and also do not show a sibling variant on /bdd
### Print a summary of : 
- datasets on ESGF which are present on bdd, 
- datasets on ESGF which are missing on bdd, 
- missing datasets which have a sibling variant on bdd, 
- difference of the two latter counts,
- number of datasets without a sibling variant which have an older version on bdd

In [None]:
def analyze_missings(ok,dok,nok) :
    """Analyze the outputs of list_datasets() and print a one-line summary.

    Arguments :
      ok  : list of 9-item tuples (activity_id, institution_id, model,
            experiment, variant, table, variable, grid, version) for datasets
            which are OK on /bdd
      dok : nested dict of the same datasets, with successive keys activity_id,
            institution_id, model, experiment, table, variable, grid, variant
      nok : list of tuples (same format as `ok`) for missing datasets

    For each missing dataset, look for another variant of the same dataset in
    `dok`. If there is none, register the missing (variant, version) in `dnok`
    (using feed_dic) and sort the dataset according to the existence of a
    version directory on /bdd.

    Returns (count_ok, count_missing, count_other_variant,
             miss_with_other_version, miss_without_other_version, dnok), where
    both `miss_...` lists only include datasets without another variant in `dok`.
    """
    # A dict to list missing variants for each dataset
    dnok={}
    #
    count_other_version = 0
    count_other_variant = 0
    miss_with_other_version    = []
    miss_without_other_version = []
    for nuple in nok :
        activity_id,institution_id,model,experiment,variant,table,variable,grid,version = nuple
        #
        # Check for an older version : list the content of the parent (grid) directory.
        # NOTE(review): this pattern also matches the directory of the missing
        # version itself when it exists - confirm that this is intended
        d="/bdd/CMIP6/%s/%s/%s/%s/%s/%s/%s/%s/v%s/"%nuple
        d=os.path.dirname(os.path.dirname(d))+"/*"
        other_version = len(glob.glob(d)) > 0
        if other_version :
            count_other_version += 1
        #
        # Check for a variant OK : a missing key at any level means there is none
        try :
            has_other_variant = len(dok[activity_id][institution_id][model][experiment][table][variable][grid]) > 0
        except (KeyError,TypeError) :
            has_other_variant = False
        #
        if has_other_variant :
            count_other_variant += 1
        else :
            if other_version :
                miss_with_other_version.append(nuple)
            else :
                miss_without_other_version.append(nuple)
            # Register this (variant,version) in the list of missing variants for this dataset
            feed_dic(dnok,(variant,version),model,experiment,table,variable,grid,use_list=True)
    #
    count_missing=len(nok)
    count_ok=len(ok)
    count_with_other_version=len(miss_with_other_version)
    count_without_other_version=len(miss_without_other_version)
    #
    # Labels for the summary : those of the last missing dataset, if any
    if len(nok)==0 :
        if len(ok) != 0 :
            _,_,_,experiment,_,table,variable,_,_ = ok[0]
        else :
            experiment,table,variable="?","?","?"
    #
    # All fields are printed on a single line, separated by a blank
    fields=[ "%20s"%experiment, "%4s"%table, "%10s"%variable, " OK","%4d"%count_ok, "missing","%3d"%count_missing,
             "alt_variant","%3d"%count_other_variant, "no_variant","%3d"%(count_missing - count_other_variant),
             "old_version","%3d"%count_with_other_version, "no_version","%3d"%count_without_other_version, "%3d"%count_other_version ]
    print(" ".join(fields))
    
    return count_ok,count_missing,count_other_variant,miss_with_other_version,miss_without_other_version,dnok

In [None]:
if False :
    # Disabled manual test of list_datasets() and analyze_missings()
    a=[]
    o,d,n=list_datasets("historical","pr","day",limit=200,already_checked_OK=a)
    # analyze_missings() returns six values
    count_ok,count_missing,count_other_variant,mwi,mwo,dnok=analyze_missings(o,d,n)
    #print mwi
    #print mwo

In [None]:
def choose_variant(variants) :
    """Provided with a list of pairs (variant,version), return the pair which realization number is the lowest.

    The realization number is the integer between 'r' and 'i' in the variant
    label (e.g. 3 for 'r3i1p1f1'). When some pairs share the lowest number, the
    first one in the list is returned.

    Raises IndexError if `variants` is empty, and ValueError if a variant label
    does not reduce to an integer when parsed.
    """
    def realization_number(label) :
        return int(re.sub(r"r([0-9]*)i(.*)",r"\1",label))
    #
    best_var,best_ver=variants[0]
    rmin=realization_number(best_var)
    #
    for var,ver in variants[1:]:
        r=realization_number(var)
        if r < rmin :
            # Must also update the current minimum, for actually finding the lowest one
            rmin=r
            best_var,best_ver=var,ver
    return best_var,best_ver

##  Iterate the analysis of missing datasets over all experiments and variables
### And also compute stats of which variant labels are missing

In [None]:
# Missing datasets per experiment/table/variable, with and without a version directory on /bdd
dic_miss_with    = {}
dic_miss_without = {}
# Statistics on missing datasets, filled using feed_dic
missing_variants = {}
missing_models   = {}
missing_modvar   = {}
missing_modvariables = {}
missing_variables = {}
missing_varmodels = {}
# Counts accumulated over all experiments and variables
Count_ok = Count_missing = Count_other_variant = Count_missing_single = 0

def load_json_cache(filename) :
    """Return the content of JSON file `filename`, or an empty list if the file cannot be read or parsed."""
    try :
        with open(filename,"r") as f :
            return json.load(f)
    except (IOError,OSError,ValueError) :
        # No cache file yet, or unusable content : start from scratch
        return []

# Results of previous runs, which allow to skip checks already done
already_checked=load_json_cache("already_checked.json")

published_nok=load_json_cache("published_nok.json")

published_ok=load_json_cache("published_ok.json")


# Scan all experiments and variables; write one line per missing dataset (when
# merging variants) in file missing_datasets.txt, and accumulate statistics
with open("missing_datasets.txt","w") as output_file :
    for experiment in experiments :
        print
        for table in variables :
            print
            for variable in variables[table] :
                ok,dok,nok            = list_datasets(experiment,variable,table,
                                            already_checked_OK=already_checked,published_NOK=published_nok,published_OK=published_ok)
                count_ok,count_missing,count_other_variant,miss_with,miss_without,dnok = analyze_missings(ok,dok,nok)
                Count_ok             += count_ok
                Count_missing        += count_missing
                Count_other_variant  += count_other_variant
                feed_dic(dic_miss_with   ,miss_with   ,experiment, table, variable)
                feed_dic(dic_miss_without,miss_without,experiment, table, variable)
                # Count missings without counting twice if two variants are missing.
                # Loop variables use their own names, in order not to re-bind
                # those of the enclosing loops (experiment, table, variable)
                count_missing_single = 0
                for model in dnok :
                    for miss_experiment in dnok[model]:
                        for miss_table in dnok[model][miss_experiment]:
                            for miss_variable in dnok[model][miss_experiment][miss_table]:
                                for grid in dnok[model][miss_experiment][miss_table][miss_variable]:
                                    count_missing_single +=1
                                    variants=dnok[model][miss_experiment][miss_table][miss_variable][grid]
                                    Variant,Version=choose_variant(variants)
                                    # CMIP6.CMIP.CNRM-CERFACS.CNRM-CM6-1.1pctCO2.r1i1p1f2.Amon.clw.gr.v20180626
                                    output_file.write("CMIP6.%s.%s.%s.%s.%s.%s.%s.%s.v%s %s\n"%\
                                                      (activities[miss_experiment],institutions[model],model,miss_experiment,\
                                                       Variant,miss_table,miss_variable,grid,Version,variants))
                                    feed_dic(missing_models  ,1, model  ,         use_count=True)
                                    # Daily variables get a distinctive name in the statistics
                                    var2=miss_variable
                                    if miss_table=="day" : var2=var2+"_day"
                                    feed_dic(missing_modvariables  ,1, model , var2  ,use_count=True)
                                    feed_dic(missing_varmodels     ,1, var2  , model ,use_count=True)
                                    feed_dic(missing_variables     ,1, var2  ,use_count=True)
                Count_missing_single += count_missing_single

# Save the results of the checks, for speeding up next runs
for cache_filename,cache_content in [ ("already_checked.json",already_checked),
                                      ("published_nok.json",published_nok),
                                      ("published_ok.json",published_ok) ] :
    with open(cache_filename,"w") as f :
        json.dump(cache_content,f,separators=(',', ': '),indent=1,ensure_ascii=True)

# Build the report, print it and send it by mail
#
# NOTE(review): published_nok holds the whole content of the cache file, which
# may include datasets for experiments or variables not scanned in this run -
# confirm that this is the intended count
Count_published_nok=len(published_nok)

grand_total    = Count_ok + Count_missing + Count_published_nok
total          = Count_ok + Count_missing
really_missing = Count_missing - Count_other_variant

def _percent(part,whole) :
    """Return the percentage of `part` in `whole`, or 0. if `whole` is zero (i.e. when no dataset was found)."""
    if whole == 0 :
        return 0.
    return 100.*part/whole

subject = "Missings : per variant %3.1f %%, merging variants :%3.1f %% (%d)"%(_percent(Count_missing,total),_percent(Count_missing_single,total),Count_missing_single)

msg  = ""
msg +="\nExperiments :%s"%experiments
msg +="\nVariables :%s"%variables
msg += "\n"
msg += "\n# datasets published : %5d (latest versions)"%grand_total
msg += "\n# published OK : %5d (%4.1f %%) (i.e. a long enough period without hole)"%(total,_percent(total,grand_total))
msg += "\n  # OK on bdd : %5d"%Count_ok
msg += "\n  # missings (variant per variant) :  %3d (%3.1f %%)"%(Count_missing,_percent(Count_missing,total))
msg += "\n  # missings using alt variant : %3d (%3.1f %%)"%(really_missing,_percent(really_missing,total))
msg += "\n  # same with single counting for multiple missing variants : %3d (%3.1f %%)"%(Count_missing_single,_percent(Count_missing_single,total))
msg += "\n"

msg += "\nMissing datasets per variable (when merging variants): \n"
for var2 in sorted(missing_variables.keys()):
    msg += "\t%-20s : %d %s\n"%(var2, missing_variables[var2], missing_varmodels[var2])
    
msg += "\nMissing datasets per model (when merging variants): \n"
for model in sorted(missing_modvariables.keys()):
    msg += "\t%-20s : %d %s\n"%(model, missing_models[model], missing_modvariables[model])
    
print(msg)
amail(msg,subject=subject,sender="Missing_hydro_datasets_on_bdd")