# This script reads metadata from a .csv file, cleans and formats the data, and exports the result as a .csv file for use in climate research.

In [131]:
import pandas as pd

In [132]:
# Raw metadata generated by the csvgen tool
RAW_METADATA_CSV = 'climatedata_updated.csv'
df = pd.read_csv(RAW_METADATA_CSV)

In [133]:
# Lookup table of variable names, compiled from CORDEX and the IPCC standard output
VARIABLE_LOOKUP_CSV = 'deduped_lookup.csv'
df_lookup = pd.read_csv(VARIABLE_LOOKUP_CSV)

In [134]:
# Table holding the dimension labels for each variable
DIMENSIONS_CSV = 'deduped_dimensions.csv'
df_dimensions = pd.read_csv(DIMENSIONS_CSV)

In [135]:
# Join the raw metadata with the variable name table, using the shared
# 'variable' column as the join key. This is an inner join (the pandas
# default), so rows whose variable has no match in the lookup table are dropped.
complete_variables = df.merge(df_lookup, how='inner', on='variable')

Unnamed: 0,variable,institute,model,domain,product,project,realm,ensemble,experiment,time,...,cmor_table,data_node,size,version,url,latest,checksum,checksum_type,variable_standard_name,variable_long_name
0,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19490101-19491231,...,,,382756372,v20120622,,,09afd442bd308eb4417d3298ed7b4cca1befeec5c20864...,sha256,geopotential_height,Geopotential Height
1,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18920101-18921231,...,,,382756372,v20120622,,,03b5a3a37f5721014a154b8369c246d33cd8fa8b216442...,sha256,geopotential_height,Geopotential Height
2,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19200101-19201231,...,,,382756372,v20120622,,,0464996fdfd0f2c421cabdd659ed92d878c2f8022797e4...,sha256,geopotential_height,Geopotential Height
3,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19180101-19181231,...,,,382756372,v20120622,,,01fa89bb3dd9fc88b3d211f8bcf52d55463599a102b1f3...,sha256,geopotential_height,Geopotential Height
4,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18590101-18591231,...,,,382756372,v20120622,,,08274327510b144eda2135ad9c15c661cff66126f1b51f...,sha256,geopotential_height,Geopotential Height
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219375,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r2i1p1,rcp26,200601-210012,...,Amon,esgf-data1.ceda.ac.uk,37397292,v20130331,http://esgf-data1.ceda.ac.uk/thredds/fileServe...,1.0,77e6a8b729379bd0441ce1c81e198de53e26da32d7e3fd...,sha256,atmosphere_water_vapor_content,Water Vapor Path
219376,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r4i1p1,rcp26,200601-210012,...,,esgfcog.cccma.ec.gc.ca,37397292,v20130331,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,7ba45fe481b9f8ba33f3526f921eafc8fd096cedd84477...,sha256,atmosphere_water_vapor_content,Water Vapor Path
219377,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r4i1p1,rcp26,200601-210012,...,Amon,esgf-data1.ceda.ac.uk,37397292,v20130331,http://esgf-data1.ceda.ac.uk/thredds/fileServe...,1.0,7ba45fe481b9f8ba33f3526f921eafc8fd096cedd84477...,sha256,atmosphere_water_vapor_content,Water Vapor Path
219378,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r3i1p1,rcp26,200601-210012,...,,esgfcog.cccma.ec.gc.ca,37397292,v20130331,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,9a18865af3104f7a01f001fc1e27cdfa5cc48aa87e6fa9...,sha256,atmosphere_water_vapor_content,Water Vapor Path


In [136]:
# Join in the dimensions table on the 'variable' column (inner join: rows
# whose variable is missing from the dimensions table are dropped)
df = complete_variables.merge(df_dimensions, how='inner', on='variable')

Unnamed: 0,variable,institute,model,domain,product,project,realm,ensemble,experiment,time,...,data_node,size,version,url,latest,checksum,checksum_type,variable_standard_name,variable_long_name,dimensions
0,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19490101-19491231,...,,382756372,v20120622,,,09afd442bd308eb4417d3298ed7b4cca1befeec5c20864...,sha256,geopotential_height,Geopotential Height,3D
1,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18920101-18921231,...,,382756372,v20120622,,,03b5a3a37f5721014a154b8369c246d33cd8fa8b216442...,sha256,geopotential_height,Geopotential Height,3D
2,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19200101-19201231,...,,382756372,v20120622,,,0464996fdfd0f2c421cabdd659ed92d878c2f8022797e4...,sha256,geopotential_height,Geopotential Height,3D
3,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19180101-19181231,...,,382756372,v20120622,,,01fa89bb3dd9fc88b3d211f8bcf52d55463599a102b1f3...,sha256,geopotential_height,Geopotential Height,3D
4,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18590101-18591231,...,,382756372,v20120622,,,08274327510b144eda2135ad9c15c661cff66126f1b51f...,sha256,geopotential_height,Geopotential Height,3D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219375,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r2i1p1,rcp26,200601-210012,...,esgf-data1.ceda.ac.uk,37397292,v20130331,http://esgf-data1.ceda.ac.uk/thredds/fileServe...,1.0,77e6a8b729379bd0441ce1c81e198de53e26da32d7e3fd...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D
219376,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r4i1p1,rcp26,200601-210012,...,esgfcog.cccma.ec.gc.ca,37397292,v20130331,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,7ba45fe481b9f8ba33f3526f921eafc8fd096cedd84477...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D
219377,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r4i1p1,rcp26,200601-210012,...,esgf-data1.ceda.ac.uk,37397292,v20130331,http://esgf-data1.ceda.ac.uk/thredds/fileServe...,1.0,7ba45fe481b9f8ba33f3526f921eafc8fd096cedd84477...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D
219378,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r3i1p1,rcp26,200601-210012,...,esgfcog.cccma.ec.gc.ca,37397292,v20130331,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,9a18865af3104f7a01f001fc1e27cdfa5cc48aa87e6fa9...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D


## Build the local and OSCER paths


In [137]:
# Builds the full path to each data file on OSCER from its 'local_file' path

def newpath(item):
    """Return the OSCER path that corresponds to a local file path.

    The text between the first and second "/" of ``item`` is taken as the
    volume name and re-rooted under "/condo/climate". A volume named exactly
    "data" is mapped to "data3".

    Parameters
    ----------
    item : str
        Path containing at least one "/". Presumably an absolute path such
        as "/data1/..." -- TODO confirm against the 'local_file' column.

    Returns
    -------
    str
        "/condo/climate<volume>/<rest of the path>".

    Raises
    ------
    IndexError
        If ``item`` contains no "/".
    """
    segments = item.split("/")
    volume = segments[1]
    tail = "/".join(segments[2:])
    if volume == "data":
        volume = "data3"
    return f"/condo/climate{volume}/{tail}"


# Derive the OSCER path of every row from its 'local_file' value
df['OSCER_schooner_path'] = df['local_file'].map(newpath)

In [138]:
# Builds the (local) SCCASC path from 'local_file': the same layout as the OSCER path, but without the "/condo/climate" prefix

def localpath(item):
    """Return the local (SCCASC) path that corresponds to a file path.

    The text between the first and second "/" of ``item`` is taken as the
    volume name. A volume named exactly "data" is mapped to "data3"; any
    other volume name is kept as it is.

    Parameters
    ----------
    item : str
        Path containing at least one "/". Presumably an absolute path such
        as "/data1/..." -- TODO confirm against the 'local_file' column.

    Returns
    -------
    str
        "/<volume>/<rest of the path>".

    Raises
    ------
    IndexError
        If ``item`` contains no "/".
    """
    segments = item.split("/")
    volume = segments[1]
    tail = "/".join(segments[2:])
    if volume == "data":
        volume = "data3"
    return f"/{volume}/{tail}"

# Derive the local SCCASC path of every row from its 'local_file' value
df['SCCASC_climatedata_path'] = df['local_file'].map(localpath)


## Find duplicate entries

In [139]:
# Rows whose (size, checksum) pair occurs more than once. keep=False flags
# every member of such a group, including its first occurrence.
duplicated = df.loc[df.duplicated(subset=["size", "checksum"], keep=False)]

In [140]:
# Rows whose (size, checksum) pair occurs exactly once
unique = df.loc[~df.duplicated(subset=["size", "checksum"], keep=False)]

## Deduplicate entries and recombine

In [141]:
# From each group of rows sharing a (size, checksum) pair, keep only the first row
deduplicated = duplicated.drop_duplicates(
    subset=["size", "checksum"],
    keep="first",
)

In [142]:
# Stack the already-unique rows on top of the first-of-each-duplicate rows
complete_deduplicated = pd.concat(objs=[unique, deduplicated], axis=0)

In [143]:
# Extract the beginning year: the first four characters of the part of 'time'
# that precedes the first "-". Uses the vectorized .str accessors instead of
# a per-row Python lambda.
complete_deduplicated['beg_year'] = (
    complete_deduplicated['time']
    .str.split("-")
    .str[0]
    .str[:4]
    .astype("int64")
)

In [144]:
# Keep only entries whose beginning year is before 2101, i.e. exclude
# anything that starts in 2101 or later
cleaned = complete_deduplicated.loc[complete_deduplicated['beg_year'] < 2101]

Unnamed: 0,variable,institute,model,domain,product,project,realm,ensemble,experiment,time,...,url,latest,checksum,checksum_type,variable_standard_name,variable_long_name,dimensions,OSCER_schooner_path,SCCASC_climatedata_path,beg_year
0,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19490101-19491231,...,,,09afd442bd308eb4417d3298ed7b4cca1befeec5c20864...,sha256,geopotential_height,Geopotential Height,3D,/condo/climatedata1/synda/data/cmip5/output2/M...,/data1/synda/data/cmip5/output2/MIROC/MIROC5/h...,1949
1,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18920101-18921231,...,,,03b5a3a37f5721014a154b8369c246d33cd8fa8b216442...,sha256,geopotential_height,Geopotential Height,3D,/condo/climatedata1/synda/data/cmip5/output2/M...,/data1/synda/data/cmip5/output2/MIROC/MIROC5/h...,1892
2,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19200101-19201231,...,,,0464996fdfd0f2c421cabdd659ed92d878c2f8022797e4...,sha256,geopotential_height,Geopotential Height,3D,/condo/climatedata1/synda/data/cmip5/output2/M...,/data1/synda/data/cmip5/output2/MIROC/MIROC5/h...,1920
3,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,19180101-19181231,...,,,01fa89bb3dd9fc88b3d211f8bcf52d55463599a102b1f3...,sha256,geopotential_height,Geopotential Height,3D,/condo/climatedata1/synda/data/cmip5/output2/M...,/data1/synda/data/cmip5/output2/MIROC/MIROC5/h...,1918
4,zg,MIROC,MIROC5,day,output,CMIP5,atmos,r5i1p1,historical,18590101-18591231,...,,,08274327510b144eda2135ad9c15c661cff66126f1b51f...,sha256,geopotential_height,Geopotential Height,3D,/condo/climatedata1/synda/data/cmip5/output2/M...,/data1/synda/data/cmip5/output2/MIROC/MIROC5/h...,1859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219370,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r1i1p1,rcp26,200601-210012,...,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,4107865bfd2278670d46a600799c9ea448d65840bf28f4...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D,/condo/climatedata4/gsmwork/data/cmip5/output/...,/data4/gsmwork/data/cmip5/output/CCCma/CanESM2...,2006
219372,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r5i1p1,rcp26,200601-210012,...,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,00114d1dec7c67b7ce5d34a5ac56d96e8cf829c3172609...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D,/condo/climatedata4/gsmwork/data/cmip5/output/...,/data4/gsmwork/data/cmip5/output/CCCma/CanESM2...,2006
219374,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r2i1p1,rcp26,200601-210012,...,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,77e6a8b729379bd0441ce1c81e198de53e26da32d7e3fd...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D,/condo/climatedata4/gsmwork/data/cmip5/output/...,/data4/gsmwork/data/cmip5/output/CCCma/CanESM2...,2006
219376,prw,CCCma,CanESM2,Amon,output,CMIP5,atmos,r4i1p1,rcp26,200601-210012,...,http://esgfcog.cccma.ec.gc.ca/thredds/fileServ...,1.0,7ba45fe481b9f8ba33f3526f921eafc8fd096cedd84477...,sha256,atmosphere_water_vapor_content,Water Vapor Path,2D,/condo/climatedata4/gsmwork/data/cmip5/output/...,/data4/gsmwork/data/cmip5/output/CCCma/CanESM2...,2006


In [145]:
# Column headers, in the order used for the exported file
headers = [
    'variable',
    'variable_standard_name',
    'variable_long_name',
    'institute',
    'model',
    'domain',
    'dimensions',
    'project',
    'realm',
    'ensemble',
    'experiment',
    'time_frequency',
    'time',
    'version',
    'filename',
    'size',
    'OSCER_schooner_path',
    'SCCASC_climatedata_path',
    'checksum_type',
    'checksum',
]

# Sort rows by variable, then model, experiment and time frequency
cleaned = cleaned.sort_values(by=['variable', 'model', 'experiment', 'time_frequency'])

In [146]:
# Reorder columns. The selection reuses the `headers` list defined in the
# sorting cell rather than repeating the same 20 names, so this view always
# has the same columns, in the same order, as the exported file.
df = cleaned[headers]

Unnamed: 0,variable,variable_standard_name,variable_long_name,institute,model,domain,dimensions,project,realm,ensemble,experiment,time_frequency,time,version,filename,size,OSCER_schooner_path,SCCASC_climatedata_path,checksum_type,checksum
176179,clt,cloud_area_fraction,Total Cloud Fraction,CSIRO-BOM,ACCESS1-0,day,2D,CMIP5,atmos,r1i1p1,historical,day,20000101-20051231,v20131108,clt_day_ACCESS1-0_historical_r1i1p1_20000101-2...,244170468,/condo/climatedata4/synda/data/cmip5/output1/C...,/data4/synda/data/cmip5/output1/CSIRO-BOM/ACCE...,sha256,8e98462e34c0d6bb52a1c66e746fb508be3a5764819d21...
176180,clt,cloud_area_fraction,Total Cloud Fraction,CSIRO-BOM,ACCESS1-0,day,2D,CMIP5,atmos,r1i1p1,historical,day,19500101-19741231,v20131108,clt_day_ACCESS1-0_historical_r1i1p1_19500101-1...,1017064044,/condo/climatedata4/synda/data/cmip5/output1/C...,/data4/synda/data/cmip5/output1/CSIRO-BOM/ACCE...,sha256,653827333b43585cf765d88452bf7bc50b20d9892e506c...
176181,clt,cloud_area_fraction,Total Cloud Fraction,CSIRO-BOM,ACCESS1-0,day,2D,CMIP5,atmos,r1i1p1,historical,day,19750101-19991231,v20131108,clt_day_ACCESS1-0_historical_r1i1p1_19750101-1...,1017064044,/condo/climatedata4/synda/data/cmip5/output1/C...,/data4/synda/data/cmip5/output1/CSIRO-BOM/ACCE...,sha256,bd0010424edbdbc6a6db509db3eefeb55c92ba15afab2e...
176182,clt,cloud_area_fraction,Total Cloud Fraction,CSIRO-BOM,ACCESS1-0,day,2D,CMIP5,atmos,r1i1p1,historical,day,20000101-20051231,v4,clt_day_ACCESS1-0_historical_r1i1p1_20000101-2...,244170468,/condo/climatedata4/synda/data/cmip5/output1/C...,/data4/synda/data/cmip5/output1/CSIRO-BOM/ACCE...,sha256,72c18e9934cd861a60d0d2d5fec5b4d2b28e8bd3840936...
176183,clt,cloud_area_fraction,Total Cloud Fraction,CSIRO-BOM,ACCESS1-0,day,2D,CMIP5,atmos,r1i1p1,historical,day,19500101-19741231,v4,clt_day_ACCESS1-0_historical_r1i1p1_19500101-1...,1017064044,/condo/climatedata4/synda/data/cmip5/output1/C...,/data4/synda/data/cmip5/output1/CSIRO-BOM/ACCE...,sha256,ca069dd0089d86d345ec01ac99d810b1d5800710959b72...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7777,zg,geopotential_height,Geopotential Height,INM,inmcm4,Amon,3D,CMIP5,atmos,r1i1p1,rcp85,mon,206601-207512,v20130207,zg_Amon_inmcm4_rcp85_r1i1p1_206601-207512.nc,176274224,/condo/climatedata1/synda/data/cmip5/output1/I...,/data1/synda/data/cmip5/output1/INM/inmcm4/rcp...,sha256,7df9197add1678054b8464b02b97417c8d83e6ba5c3be0...
7778,zg,geopotential_height,Geopotential Height,INM,inmcm4,Amon,3D,CMIP5,atmos,r1i1p1,rcp85,mon,209601-210012,v20130207,zg_Amon_inmcm4_rcp85_r1i1p1_209601-210012.nc,88144784,/condo/climatedata1/synda/data/cmip5/output1/I...,/data1/synda/data/cmip5/output1/INM/inmcm4/rcp...,sha256,bce7c82c6a5b2d1c2dfba188bb9baedc2d0e4ce4a37777...
7779,zg,geopotential_height,Geopotential Height,INM,inmcm4,Amon,3D,CMIP5,atmos,r1i1p1,rcp85,mon,208601-209512,v20130207,zg_Amon_inmcm4_rcp85_r1i1p1_208601-209512.nc,176274224,/condo/climatedata1/synda/data/cmip5/output1/I...,/data1/synda/data/cmip5/output1/INM/inmcm4/rcp...,sha256,8db544611a8ca47c1e035b29df8d69208868a78e1a2a05...
7780,zg,geopotential_height,Geopotential Height,INM,inmcm4,Amon,3D,CMIP5,atmos,r1i1p1,rcp85,mon,200601-201512,v20130207,zg_Amon_inmcm4_rcp85_r1i1p1_200601-201512.nc,176274224,/condo/climatedata1/synda/data/cmip5/output1/I...,/data1/synda/data/cmip5/output1/INM/inmcm4/rcp...,sha256,fc6b8067dbde07df845929d1fa18371df00561db4143b8...


In [148]:
# Export the cleaned, deduplicated table as .csv: only the `headers` columns,
# in that order, and without the DataFrame index
cleaned.to_csv(
    path_or_buf="CMIP5_climatedata_full.csv",
    columns=headers,
    index=False,
)