# Preprocess the soil moisture from the International Soil Moisture Network (ISMN)

## Soil moisture from ISMN

In [1]:
import os
import re
import pandas as pd
import numpy as np
import utils
import glob

# Root folder of the project data. Set the ISMN_HOME_DIR environment variable to
# run the notebook on another machine; otherwise the original local path is used.
HOME_DIR = os.environ.get(
    "ISMN_HOME_DIR",
    r"E:\Zoho WorkDrive (YICODE)\My Folders\TimeSeriesRetrieval\Extension",
)
# path to the raw data downloaded from the ISMN
network_dir = os.path.join(HOME_DIR, 'ISMN_raw')
# path to the site-specific daily averaged soil moisture (<= 5 cm)
out_dir = os.path.join(HOME_DIR, 'daily_ave')
# path to a table with the details of the sites
site_info_file = os.path.join(HOME_DIR, 'site_info.csv')
# start and end date of the period passed to utils.readstm_all
s_time = "2016-01-01"
e_time = "2019-12-31"

Get all the raw soil moisture files measured in the top layer (depth <= 5 cm)

In [2]:
# Create the output folder if needed. exist_ok avoids the check-then-create
# race, and makedirs also creates any missing parent folder.
os.makedirs(out_dir, exist_ok=True)
# Raw soil moisture files found under network_dir by the project helper.
sm_file_list = utils.listdir_sm(network_dir)
# Report the folder name through os.path so it works with any path separator
# (and with a trailing separator), not only with Windows backslashes.
print(os.path.basename(os.path.normpath(network_dir)) + ' files: ' + str(len(sm_file_list)))

ISMN_raw files: 1615


Calculate daily average soil moisture for each site and save the multiple measurements of the same site to one file

In [5]:
# Build one CSV per site in out_dir from the raw files in sm_file_list, and
# record the header and soil texture of every new site in site_info_file.
# NOTE: rows are appended to site files that already exist in out_dir, so
# re-running this cell against a populated out_dir duplicates those rows.
for file_sm in sm_file_list:
    # Derive the surface-temperature file from the soil-moisture one. Only the
    # file name is rewritten, so an 'sm' inside a folder name cannot corrupt
    # the directory part of the path.
    sm_file_dir, sm_file_name = os.path.split(file_sm)
    file_ts = os.path.join(sm_file_dir, sm_file_name.replace('sm', 'ts'))

    h, sm = utils.readstm_all(file_sm, 'sm', s_time, e_time)  # read sm
    if not isinstance(sm, pd.DataFrame):
        # presumably readstm_all returns a non-DataFrame when the file has no
        # usable data in the requested period -- TODO confirm against utils
        continue

    ts = None
    if os.path.exists(file_ts):
        _, ts = utils.readstm_all(file_ts, 'ts', s_time, e_time)  # read surface temperature
    if not isinstance(ts, pd.DataFrame):
        # sites without (usable) ts measurements get a NaN column on the sm index
        ts = pd.DataFrame(np.nan, index=sm.index, columns=['ts'])
    site_out = pd.concat([sm, ts['ts']], axis=1)

    # The first two header fields name the site file, so every measurement of
    # the same site within the target layer (<= 5 cm) ends up in one file.
    # .iloc is used because positional access through Series[...] is deprecated.
    site_row = h.loc[0]
    site_file = os.path.join(out_dir, site_row.iloc[0] + '_' + site_row.iloc[1] + '.csv')

    is_new_site = not os.path.exists(site_file)
    if not is_new_site:
        # Stack the new measurements under those already saved for this site;
        # duplicated timestamps are averaged in a later step.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        site_pre = pd.read_csv(site_file, index_col="time", parse_dates=True)
        site_out = pd.concat([site_pre, site_out])
    site_out.to_csv(site_file)

    if is_new_site:
        # extract soil texture from the first CSV found next to the sm file
        static_files = glob.glob(os.path.join(sm_file_dir, '*.csv'))
        if static_files:
            clay, sand = utils.parse_site_soil_texture(static_files[0])
        else:
            # no static-variable file for this site: keep the site, texture unknown
            clay, sand = np.nan, np.nan
        h['clay'] = clay
        h['sand'] = sand
        # NOTE(review): 'slit' looks like a typo for 'silt'; it is kept because
        # it is the column name written to site_info_file.
        # assumes clay and sand are fractions in [0, 1] -- TODO confirm
        h['slit'] = 1 - clay - sand
        if os.path.exists(site_info_file):
            site_info_pre = pd.read_csv(site_info_file)
            site_info_out = pd.concat([site_info_pre, h]).drop_duplicates()
            site_info_out.to_csv(site_info_file, index=False)
        else:
            h.to_csv(site_info_file, index=False)

Average multiple measurements collected from the same layer or depth

In [6]:
# Collapse every site file to one row per timestamp: the mean of all the
# measurements sharing that timestamp, plus the number of sm values averaged.
site_files = os.listdir(out_dir)
print('Number of sites including a few Yanco sites : %s'%len(site_files))
for site_file in site_files:
    site_path = os.path.join(out_dir, site_file)
    site_raw = pd.read_csv(site_path, index_col="time", parse_dates=True)
    if 'sm_count' in site_raw.columns:
        # Already averaged by an earlier run of this cell. Averaging again would
        # add a second 'sm_count' column that no longer reflects the real counts.
        continue
    by_time = site_raw.groupby(level=0)
    # count() ignores NaN, so this is the number of valid sm values per timestamp
    sm_count = by_time['sm'].count().rename('sm_count')
    site_mean = by_time.mean()
    # each row holds the timestamp, sm, ts and the number of sm measurements
    site_daily = pd.concat([site_mean, sm_count], axis=1)
    site_daily.to_csv(site_path)

Number of sites including a few Yanco sites : 1323
