In [3]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import glob
import struct
import random
import itertools
import matplotlib.gridspec as gridspec


In [4]:
def tostr(l):
    """Decode every bytes item in ``l`` and join the pieces into one ``str``."""
    decoded_parts = [chunk.decode() for chunk in l]
    return "".join(decoded_parts)

def read_header(f):
    """Read the fixed-size header from the binary stream ``f``.

    Consumes, in order: 5 single-byte characters, 5 more single-byte
    characters, 13 floats (52 bytes) and another 13 floats (52 bytes).

    Returns
    -------
    tuple
        ``(satcode, sensor, freq_sat, vangles)`` where the first two entries
        are strings and the last two are numpy arrays of length 13
        (presumably channel frequencies and viewing angles -- TODO confirm
        against the file format specification).
    """
    def next_fields(fmt, nbytes):
        # Pull exactly ``nbytes`` from the stream and decode them with ``fmt``.
        return struct.unpack(fmt, f.read(nbytes))

    satcode = tostr(next_fields("5c", 5))
    sensor = tostr(next_fields("5c", 5))
    freq_sat = np.array(next_fields("13f", 52))
    vangles = np.array(next_fields("13f", 52))
    return (satcode, sensor, freq_sat, vangles)

def read_pixel(f):
    """Read one 112-byte pixel record from the binary stream ``f``.

    Fields are consumed in this order: two ints (nx, ny), six ints (scan
    time), two floats (lat, lon), one int (surface code), one float (tcwv),
    one float (T2m), thirteen floats (Tbs), one float (sfcprcp) and one
    float (cnvprcp).

    Returns
    -------
    tuple
        ``(nx, ny, scntime, lat, lon, sfccode, tcwv, T2m, Tbs, sfcprcp,
        cnvprcp)``. ``scntime`` and ``Tbs`` are numpy arrays; ``sfccode``,
        ``tcwv``, ``T2m``, ``sfcprcp`` and ``cnvprcp`` are 1-tuples exactly
        as returned by ``struct.unpack`` (callers index them with ``[0]``).

    Raises
    ------
    struct.error
        If the stream runs out of bytes part-way through the record.
    """
    def unpack_next(fmt, nbytes):
        # Consume exactly ``nbytes`` from the stream and decode them with ``fmt``.
        return struct.unpack(fmt, f.read(nbytes))

    nx, ny = unpack_next("2i", 8)
    scntime = np.array(unpack_next("6i", 24))
    lat, lon = unpack_next("2f", 8)
    sfccode = unpack_next("i", 4)
    tcwv = unpack_next("f", 4)
    T2m = unpack_next("f", 4)
    Tbs = np.array(unpack_next("13f", 52))
    sfcprcp = unpack_next("f", 4)
    cnvprcp = unpack_next("f", 4)
    return (nx, ny, scntime, lat, lon, sfccode, tcwv, T2m, Tbs, sfcprcp, cnvprcp)

def read_multiple_files(path_list, n_rows, n_files, prob_thresh):
    """Collect a random subsample of ocean pixels from several .dat files.

    The first ``n_files`` entries of ``path_list`` are opened in order. For
    each file the header is read and discarded, then up to ``n_rows`` pixel
    records are read. A pixel is kept only when its surface code equals 1
    (treated as ocean) and a fresh ``random.random()`` draw is below
    ``prob_thresh``.

    Parameters
    ----------
    path_list : sequence of str
        Paths of the custom .dat files (from one given month).
    n_rows : int
        Maximum number of pixel records to read per file; reading stops
        earlier when the file ends.
    n_files : int
        Number of entries of ``path_list`` to process.
    prob_thresh : float
        Probability with which an ocean pixel is kept.

    Returns
    -------
    tuple of lists
        ``(lats, lons, sfccode, tcwv, T2m, Tbs, surf_precip)``, one entry
        per kept pixel.

    Raises
    ------
    IndexError
        If ``n_files`` exceeds ``len(path_list)``.
    struct.error
        If a file is too short to contain a complete header.
    """
    lats = []
    lons = []
    sfccode = []
    tcwv = []
    T2m = []
    Tbs = []
    surf_precip = []

    for file_idx in range(n_files):
        # Progress report on every third file.
        if file_idx % 3 == 0:
            print(str((file_idx/n_files)*100) + ' % complete')
        with open(path_list[file_idx], 'rb') as fn:
            read_header(fn)  # Advance past the header; its content is not needed.
            for _ in range(n_rows):
                try:
                    pixel = read_pixel(fn)
                except struct.error:
                    # A short read means the end of the file was reached.
                    # Only this error is caught so that real problems (and
                    # KeyboardInterrupt) are no longer silently swallowed.
                    break

                surface = pixel[5][0]
                if surface != 1:
                    # Not an ocean pixel.
                    continue
                # The random draw happens only for ocean pixels.
                if random.random() < prob_thresh:
                    lats.append(pixel[3])
                    lons.append(pixel[4])
                    sfccode.append(surface)
                    tcwv.append(pixel[6][0])
                    T2m.append(pixel[7][0])
                    Tbs.append(pixel[8][:])
                    surf_precip.append(pixel[9][0])
    return (lats, lons, sfccode, tcwv, T2m, Tbs, surf_precip)

def create_training_data(n_rows_per_file, n_files_per_month, prob_to_include, day_str,
                         data_dir='/home/teodor/Dendrite/UserAreas/Teo/GPROFfiles/',
                         months=('1409', '1410', '1411', '1412', '1501', '1502',
                                 '1503', '1504', '1505', '1506', '1507', '1508')):
    """Read a random selection of .dat files from several month folders.

    For every month, all ``<data_dir>/<month>/*.dat`` files are listed,
    files whose path contains ``month + day_str`` are excluded, and
    ``n_files_per_month`` of the remaining files are drawn at random and
    passed to ``read_multiple_files``.

    Files are drawn without replacement whenever the month has at least
    ``n_files_per_month`` usable files, so the same file is not read twice.
    When there are fewer files than requested, sampling falls back to
    drawing with replacement so the requested count is still met.

    Parameters
    ----------
    n_rows_per_file : int
        Maximum number of pixel records read from each file.
    n_files_per_month : int
        Number of files drawn for each month.
    prob_to_include : float
        Probability with which an ocean pixel is kept.
    day_str : str
        Files whose path contains ``month + day_str`` are skipped.
    data_dir : str, optional
        Directory holding one sub-folder per month. Defaults to the
        original hard-coded location.
    months : sequence of str, optional
        Names of the month sub-folders to process, in order.

    Returns
    -------
    tuple of lists
        ``(lats, lons, sfccode, tcwv, T2m, Tbs, surf_precip)`` accumulated
        over all months.

    Raises
    ------
    ValueError
        If files are requested for a month that has no usable .dat file.
    """
    # Make sure the directory and the month folder are separated by a slash.
    if data_dir and not data_dir.endswith('/'):
        data_dir = data_dir + '/'

    lats = []
    lons = []
    sfccode = []
    tcwv = []
    T2m = []
    Tbs = []
    surf_precip = []

    for month in months:
        print('Starting month: ' + str(month))
        pattern = data_dir + month + '/*.dat'
        excluded_tag = str(month) + day_str
        candidates = [p for p in sorted(glob.glob(pattern)) if excluded_tag not in p]

        if not candidates and n_files_per_month > 0:
            raise ValueError('No usable .dat files found for month ' + str(month)
                             + ' (pattern: ' + pattern + ')')

        # Only fall back to sampling with replacement when there are too few files.
        with_replacement = len(candidates) < n_files_per_month
        file_inds = np.random.choice(len(candidates), n_files_per_month,
                                     replace=with_replacement)
        selected_files = np.array(candidates)[file_inds]

        month_data = read_multiple_files(selected_files,
                                         n_rows_per_file,
                                         n_files_per_month,
                                         prob_to_include)
        (lats_tmp, lons_tmp, sfccode_tmp, tcwv_tmp,
         T2m_tmp, Tbs_tmp, surf_precip_tmp) = month_data

        lats.extend(lats_tmp)
        lons.extend(lons_tmp)
        sfccode.extend(sfccode_tmp)
        tcwv.extend(tcwv_tmp)
        T2m.extend(T2m_tmp)
        Tbs.extend(Tbs_tmp)
        surf_precip.extend(surf_precip_tmp)
    return (lats, lons, sfccode, tcwv, T2m, Tbs, surf_precip)

def create_input_and_output_arrays_ocean(Tbs, tcwv, T2m, surf_precip):
    """Assemble the model input matrix and the target vector.

    The input matrix has one row per pixel: the ``Tbs`` values first,
    followed by one ``tcwv`` column and one ``T2m`` column.

    Returns
    -------
    (input_array, output_array)
        ``input_array`` is the column-wise concatenation described above;
        ``output_array`` is ``surf_precip`` converted to a numpy array.
    """
    input_array = np.array(Tbs)
    # Append each scalar quantity as one extra column, tcwv first, then T2m.
    for scalar_values in (tcwv, T2m):
        column = np.array(scalar_values).reshape(len(scalar_values), 1)
        input_array = np.concatenate((input_array, column), axis=1)

    return input_array, np.array(surf_precip)

In [5]:
# Build training sets number 11 to 20. Each pass draws a new random selection
# of files and pixels and writes the inputs and targets to two text files in
# the current working directory.
for i in range(10, 20):
    print('--- STARTING FILE NBR {} ---'.format(i+1))
    # The row limit is presumably far above any file length, so every file is
    # read to its end; files containing <month>01 in their path are skipped.
    lats, lons, sfccode, tcwv, T2m, Tbs, surf_precip = create_training_data(
        n_rows_per_file=1000000000,
        n_files_per_month=10,
        prob_to_include=0.1,
        day_str='01',
    )
    x, y = create_input_and_output_arrays_ocean(Tbs, tcwv, T2m, surf_precip)
    print(x.shape)
    input_string = 'training_input_{}.txt'.format(i+1)
    output_string = 'training_output_{}.txt'.format(i+1)
    np.savetxt(input_string, x)
    np.savetxt(output_string, y)

--- STARTING FILE NBR 11 ---
Starting month: 1409
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1410
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1411
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1412
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1501
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1502
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1503
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1504
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1505
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1506
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1507
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1508
0.0 % complete
30.0 % comp

90.0 % complete
Starting month: 1507
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1508
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
(420563, 15)
--- STARTING FILE NBR 19 ---
Starting month: 1409
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1410
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1411
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1412
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1501
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1502
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1503
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1504
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 1505
0.0 % complete
30.0 % complete
60.0 % complete
90.0 % complete
Starting month: 15