In [1]:
import tarfile 
import datetime as dt
import cdflib
import pandas as pd
from requests import get # to make GET request
import numpy as np
import os
import glob

from C1_Cluster_CDF_conv import C1_cdf_conv
from C2_Cluster_CDF_conv import C2_cdf_conv
from C3_Cluster_CDF_conv import C3_cdf_conv
from C4_Cluster_CDF_conv import C4_cdf_conv

In [2]:
#define download function for calling data

def download(url, params, file_name):
    """Fetch ``url`` with query ``params`` and save the response body to ``file_name``.

    The request is issued, and its HTTP status checked, *before* the output
    file is opened, so a failed request never leaves behind an empty file or
    an error page stored under the name of the expected archive.

    Parameters
    ----------
    url : str
        Address handed to ``requests.get``.
    params : dict
        Query-string parameters handed to ``requests.get``.
    file_name : str
        Path the response body is written to (binary mode; an existing file
        is overwritten).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # get request
    response = get(url, params=params)
    # Fail here, with the HTTP status in the message, instead of writing the
    # error body to disk and failing later when the file is opened as a tar.
    response.raise_for_status()
    # open in binary mode and write to file
    with open(file_name, "wb") as file:
        file.write(response.content)


In [None]:
#input data list and download tarfiles. return list of filenames

def dl_Cluster_data(cl_1, cl_2, cl_3, cl_4):
    """Download one FGM tarfile per interval for all four spacecraft.

    Each argument is the interval table of one spacecraft (C1..C4, in that
    order). Rows labelled 0-4 are dropped from every table, then the
    ``'# SC: 1'`` column is read as the interval start and the ``' 2'``
    column as the interval end. For every (start, end) pair a request is sent
    through ``download`` and the result is saved in the current working
    directory as ``C<n>tap<start>.tgz``.

    Parameters
    ----------
    cl_1, cl_2, cl_3, cl_4 : pandas.DataFrame
        Interval tables; the inputs themselves are not modified.

    Returns
    -------
    list of str
        Names of all tarfiles requested, C1 intervals first, then C2, C3, C4.
    """
    #drop unnecessary rows (without data in!)
    un_rows = [0,1,2,3,4]

    myurl = 'https://csa.esac.esa.int/csa-sl-tap/data'

    # Resolve the start/end lists of every spacecraft before the first request,
    # so a malformed table is reported before anything is downloaded.
    intervals = []
    for sc_no, table in enumerate((cl_1, cl_2, cl_3, cl_4), start=1):
        table = table.drop(un_rows)
        intervals.append((str(sc_no),
                          table['# SC: 1'].tolist(),
                          table[' 2'].tolist()))

    #iterate over every one of the intervals, downloading the tarfiles
    #note filenames in list for later use
    tarfilelist = []
    for sc_id, starts, ends in intervals:
        for start, end in zip(starts, ends):
            filename = 'C' + sc_id + 'tap' + start + '.tgz'
            tarfilelist.append(filename)
            query_specs = {'RETRIEVAL_TYPE': 'product',
                           'DATASET_ID': 'C' + sc_id + '_CP_FGM_FULL',
                           'START_DATE': start,
                           'END_DATE': end,
                           'DELIVERY_FORMAT': 'CDF',
                           'DELIVERY_INTERVAL': 'daily'}
            download(myurl, query_specs, filename)

    return tarfilelist
        

In [20]:
def tarf_extract(tarfilelist, path='/Users/apx059/Documents/23_Years_CDFs'):
    """Unpack every archive named in ``tarfilelist`` into ``path``.

    Parameters
    ----------
    tarfilelist : iterable of str
        Paths of the tar archives to unpack; an empty iterable is a no-op.
    path : str, optional
        Destination directory. Defaults to the folder this notebook has
        always extracted into.

    Raises
    ------
    tarfile.ReadError
        If one of the files cannot be opened as a tar archive.
    """
    for tar_path in tarfilelist:
        with tarfile.open(tar_path) as tar:
            # The archives come from the network: where the interpreter
            # supports extraction filters, use the 'data' filter so members
            # cannot be written outside ``path``.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=path, filter='data')
            else:
                tar.extractall(path=path)

In [17]:
#input data list and download tarfiles. return list of filenames

def dl_Cluster_data_single(int_df, sc_no, batch, batch_size=100,
                           out_dir='/Users/apx059/Documents/23_Years_CDFs/Tarfiles/'):
    """Download one batch of FGM tarfiles for a single spacecraft.

    The ``'# SC: 1'`` column of ``int_df`` is read as the interval start and
    the ``' 2'`` column as the interval end. Only the intervals at positions
    ``batch*batch_size`` up to (not including) ``(batch+1)*batch_size`` are
    requested; each one is fetched through ``download`` and saved in
    ``out_dir`` as ``C<n>tap<start>.tgz``.

    Parameters
    ----------
    int_df : pandas.DataFrame
        Interval table of the spacecraft.
    sc_no : str or int
        Spacecraft number, one of 1-4 (``'1'`` and ``1`` are both accepted).
    batch : int
        Zero-based batch index.
    batch_size : int, optional
        Number of intervals per batch (default 100).
    out_dir : str, optional
        Directory the tarfiles are written to.

    Returns
    -------
    list of str
        Paths of the tarfiles requested; empty when the batch lies beyond
        the end of the table.

    Raises
    ------
    ValueError
        If ``sc_no`` is not one of 1-4. Previously such a value silently
        downloaded nothing and returned an empty list.
    """
    sc_id = str(sc_no).strip()
    if sc_id not in ('1', '2', '3', '4'):
        raise ValueError("sc_no must be one of '1', '2', '3', '4'; got %r" % (sc_no,))

    #list of interval start and end points
    batch_start = batch * batch_size
    batch_end = (batch + 1) * batch_size
    ints_start = int_df['# SC: 1'].tolist()[batch_start:batch_end]
    ints_end = int_df[' 2'].tolist()[batch_start:batch_end]

    myurl = 'https://csa.esac.esa.int/csa-sl-tap/data'

    #iterate over every one of the intervals, downloading the tarfiles into a folder
    #note filenames in list for later use
    tarfilelist = []
    for start, end in zip(ints_start, ints_end):
        filename = os.path.join(out_dir, 'C' + sc_id + 'tap' + start + '.tgz')
        tarfilelist.append(filename)
        query_specs = {'RETRIEVAL_TYPE': 'product',
                       'DATASET_ID': 'C' + sc_id + '_CP_FGM_FULL',
                       'START_DATE': start,
                       'END_DATE': end,
                       'DELIVERY_FORMAT': 'CDF',
                       'DELIVERY_INTERVAL': 'daily'}
        download(myurl, query_specs, filename)

    return tarfilelist


In [None]:
%%time
#run for 1 year, 01-02-2001 - 01-02-2002

# Interval tables of the four spacecraft, read in C1..C4 order.
cluster_1_46, cluster_2_46, cluster_3_46, cluster_4_46 = map(pd.read_csv, (
    "/Users/apx059/Documents/Cluster Intervals 16032001-01022002/dm-intervals-c1-240731-131130.csv",
    "/Users/apx059/Documents/Cluster Intervals 16032001-01022002/dm-intervals-c2-240731-131130.csv",
    "/Users/apx059/Documents/Cluster Intervals 16032001-01022002/dm-intervals-c3-240731-131130.csv",
    "/Users/apx059/Documents/Cluster Intervals 16032001-01022002/dm-intervals-c4-240731-131130.csv",
))

# Download one tarfile per interval, keeping the list of file names ...
tf_list = dl_Cluster_data(cluster_1_46, cluster_2_46, cluster_3_46, cluster_4_46)

# ... then unpack every tarfile that was downloaded.
tarf_extract(tf_list)

In [None]:
def csvconv(path, out_dir='/Users/apx059/Documents/1 Yr Data/48 Weeks CSVs/'):
    """Convert every ``.cdf`` file matched by ``path`` to a CSV file.

    ``path`` is expanded with ``glob.glob(path, recursive=True)``. Every
    match whose name ends in ``.cdf`` is handed to the converter of the
    spacecraft tag (``C1``..``C4``) found in its *file name*; files carrying
    none of the tags are skipped. A non-empty result is written to
    ``out_dir`` as ``<first index value><tag>.csv``; empty results are
    skipped.

    Files are converted and written one at a time, so only a single
    converted table is held in memory at any moment.

    Parameters
    ----------
    path : str
        Glob pattern, e.g. ``'/some/folder/**'``.
    out_dir : str, optional
        Directory the CSV files are written to.

    Returns
    -------
    None
    """
    # Only real .cdf files: a plain substring test would also accept names
    # such as 'x.cdf.txt'. Sorting makes the processing order reproducible.
    cdf_paths = sorted(p for p in glob.glob(path, recursive=True)
                       if p.endswith('.cdf'))

    for cdf_path in cdf_paths:
        # Look for the spacecraft tag in the file name only; a tag that
        # merely appears in a parent directory name must not pick the
        # converter (or cause the file to be converted more than once).
        base_name = os.path.basename(cdf_path)
        if 'C1' in base_name:
            tag, df = 'C1', C1_cdf_conv(cdf_path)
        elif 'C2' in base_name:
            tag, df = 'C2', C2_cdf_conv(cdf_path)
        elif 'C3' in base_name:
            tag, df = 'C3', C3_cdf_conv(cdf_path)
        elif 'C4' in base_name:
            tag, df = 'C4', C4_cdf_conv(cdf_path)
        else:
            continue

        #skip dataframes that are empty (to avoid errors)
        if df.empty:
            continue

        #find first date-time value in index, stringify it and make filename
        fname_full = os.path.join(out_dir, str(df.index[0]) + tag + '.csv')
        df.to_csv(fname_full, encoding='utf-8')

        

In [None]:
%%time
#convert one CDF file with C1_cdf_conv as a test; the returned object is kept in df_c1
df_c1 = C1_cdf_conv('/Users/apx059/Documents/1 Yr Data/48 Weeks CDFs/CSA_Download_20240804_1636/C1_CP_FGM_FULL/C1_CP_FGM_FULL__20010316_000000_20010316_191300_V140306.cdf')

In [None]:
# Convert the .cdf files under the 48-weeks CDF folder to CSV; csvconv expands
# the trailing '**' with a recursive glob, so sub-folders are searched as well.
csvconv('/Users/apx059/Documents/1 Yr Data/48 Weeks CDFs/**')

In [5]:
# Row count of the C1 interval table exactly as read from disk, i.e. still
# including the rows labelled 0-4 that a later cell drops from the same file.
c123 = pd.read_csv('/Users/apx059/Documents/Cluster Intervals-01022002-01022024/dm-intervals-c1-240902-144411.csv')
c123.shape[0]

2906

In [14]:
# Interval tables of the four spacecraft for the 2002-2024 period.
cluster_1_23yr = pd.read_csv("/Users/apx059/Documents/Cluster Intervals-01022002-01022024/dm-intervals-c1-240902-144411.csv")
cluster_2_23yr = pd.read_csv("/Users/apx059/Documents/Cluster Intervals-01022002-01022024/dm-intervals-c2-240902-144411.csv")
cluster_3_23yr = pd.read_csv("/Users/apx059/Documents/Cluster Intervals-01022002-01022024/dm-intervals-c3-240902-144411.csv")
cluster_4_23yr = pd.read_csv("/Users/apx059/Documents/Cluster Intervals-01022002-01022024/dm-intervals-c4-240902-144411.csv")

#drop unnecessary rows (without data in!)
un_rows = [0,1,2,3,4]

# Renumber the remaining rows from 0 after the drop. ``reindex()`` called
# without arguments leaves the index untouched (it would still start at 5),
# so ``reset_index(drop=True)`` is used instead.
cluster_1_23yr = cluster_1_23yr.drop(un_rows).reset_index(drop=True)
cluster_2_23yr = cluster_2_23yr.drop(un_rows).reset_index(drop=True)
cluster_3_23yr = cluster_3_23yr.drop(un_rows).reset_index(drop=True)
cluster_4_23yr = cluster_4_23yr.drop(un_rows).reset_index(drop=True)


In [19]:
#go in steps of 100
# batch index 0 selects the first 100 intervals of the C1 table; batch0 holds
# the list of tarfile paths returned by the download function.

batch0 = dl_Cluster_data_single(cluster_1_23yr, '1', 0)

In [21]:
%%time

# Unpack every tarfile listed in batch0.
tarf_extract(batch0)

CPU times: user 29.3 s, sys: 2.51 s, total: 31.8 s
Wall time: 32.8 s


In [22]:
%%time
#convert one cdf with C1_cdf_conv (timed); the returned object is kept in df_c1

df_c1 = C1_cdf_conv('/Users/apx059/Documents/23_Years_CDFs/CSA_Download_20240916_1227/C1_CP_FGM_FULL/C1_CP_FGM_FULL__20020201_000000_20020201_004500_V150212.cdf')

CPU times: user 1.43 s, sys: 88.2 ms, total: 1.52 s
Wall time: 1.49 s
