# Importing modules

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import gdal
import ogr
import osr
import gdalnumeric
import gdalconst
from osgeo.gdalconst import GA_ReadOnly
import cartopy.crs as ccrs
import pandas as pd
import sys

# Defining base functions

In [None]:
#all Functions

def find_band_number(dataset, variable):
    '''
    Finds the band numbers and levels inside the GRIB file, given the variable.

    Every raster band of ``dataset`` is inspected; a band is kept when its
    ``GRIB_ELEMENT`` metadata equals ``variable`` and its ``GRIB_SHORT_NAME``
    metadata ends with ``ISBL``.

    Parameters
    ----------
    dataset : object exposing ``RasterCount`` and ``GetRasterBand(i)``
        (1-based index); each band must expose ``GetMetadata()`` returning
        a dict.
    variable : str
        Value to match against the ``GRIB_ELEMENT`` metadata entry
        (for instance ``'UGRD'``).

    Returns
    -------
    (i_list, level_list) : tuple of two lists of equal length
        ``i_list`` holds the band numbers of the matching bands and
        ``level_list`` the corresponding level strings, i.e. the
        ``GRIB_SHORT_NAME`` without its last five characters
        (``'20000-ISBL'`` -> ``'20000'``).
    '''
    i_list = []
    level_list = []
    for i in range(1, dataset.RasterCount + 1):
        metadata = dataset.GetRasterBand(i).GetMetadata()
        # .get() so that a band without GRIB metadata is skipped instead of
        # aborting the whole scan with a KeyError.
        band_level = metadata.get('GRIB_SHORT_NAME', '')
        band_variable = metadata.get('GRIB_ELEMENT')
        if band_variable == variable and band_level.endswith('ISBL'):
            i_list.append(i)
            # Drop the trailing separator + 'ISBL' (five characters).
            level_list.append(band_level[:-5])
    # Return the band numbers of the variable and their ISBL levels.
    return i_list, level_list

def to_DataFrame(xv,yv,lats,longs,level,lat_min,lat_max,long_min,long_max,time):
    '''
    Builds a pandas DataFrame with one row per grid point.

    ``xv``, ``yv``, ``lats`` and ``longs`` are flattened to the number of
    elements of the 2-D array ``xv``; ``level`` (converted with ``int``) and
    ``time`` are repeated on every row. Only the rows whose ``lat`` lies in
    [lat_min, lat_max] and whose ``long`` lies in [long_min, long_max] are
    returned; they keep their original (flattened) index.

    Columns: 'Wx', 'Wy', 'lat', 'long', 'level', 'Timestamp'.
    '''
    n_points = xv.shape[0] * xv.shape[1]

    frame = pd.DataFrame(data={
        'Wx': np.resize(xv, [n_points]),
        'Wy': np.resize(yv, [n_points]),
        'lat': np.resize(lats, [n_points]),
        'long': np.resize(longs, [n_points]),
        'level': int(level) * np.ones([n_points]),
        'Timestamp': [time] * n_points,
    })

    # Rows outside the box are blanked out completely, then removed.
    inside_box = (
        (frame.lat >= lat_min)
        & (frame.lat <= lat_max)
        & (frame.long >= long_min)
        & (frame.long <= long_max)
    )
    return frame.where(inside_box).dropna(axis=0, how='all')
    
def get_Timestamp(Y, M, D, hour, forc):
    '''
    Returns the pandas Timestamp at which a forecast is valid.

    Parameters
    ----------
    Y, M, D : str
        Year ('YYYY'), month ('MM') and day ('DD') of the model run.
    hour : str
        Hour of the model run; only the first two characters are used, so
        both 'HH' and 'HHMM' are accepted.
    forc : str or int
        Number of hours between the model run and the forecast ('HHH').

    Returns
    -------
    pandas.Timestamp
        Run date and hour shifted forward by ``forc`` hours.

    Raises
    ------
    ValueError
        If the arguments are not numeric or do not form a valid date.
    '''
    run_time = pd.Timestamp(year=int(Y), month=int(M), day=int(D),
                            hour=int(hour[0:2]))
    # Calendar arithmetic is delegated to pandas so that forecasts crossing
    # a month or year boundary by any number of days land on the right date.
    return run_time + pd.Timedelta(hours=int(forc))

# Defining variables to the conversion

In [None]:
# Date, run-hour and forecast-step components used to build the names of the
# downloaded files; the same values are used to time-stamp the DataFrame rows.
year = '2018'
monthh = ['%02d' % month_number for month_number in range(1, 13)]
dayy = ['%02d' % day_number for day_number in range(1, 32)]
hour = ['0000', '0600', '1200', '1800']
# Forecast steps from 0 to 120 hours every 3 hours, zero-padded to 3 digits.
forc = [str(step).zfill(3) for step in range(0, 121, 3)]
data_type = 'gfs_3'

In [None]:
# defines the latitude and longitude to be considered

# Optionally, restrict the output to a latitude/longitude box by uncommenting and editing the bounds below
#Amsterdam
#lat_max = 55 #52.3740300
#long_max = 7  #4.8896900
##London
#lat_min = 48 #51.5085300
##long_min = -3 #-0.1257400

# Otherwise keep these very wide bounds so that the whole globe is retained
lat_min, lat_max = -1000, 1000
long_min, long_max = -1000, 1000

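# If only one level is needed, uncomment the line below (the value must match one of the level strings returned by find_band_number)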
#only_level = '20000'

# Converting from grib to DataFrame

In [None]:
# List collecting one DataFrame per (file, level); concatenated afterwards.
DT = []

# Try every (month, day, run hour, forecast step) combination and convert the
# files that actually exist on disk.
for month in monthh:
    YM = year + month
    print(YM)
    for day in dayy:
        for hour1 in hour:
            for forc1 in forc:
                file_name = data_type + '_' + YM + day + '_' + hour1 + '_' + forc1 + '.grb2'
                # if necessary change the directory name 'data_Grib' to your own directory name.
                data_file = 'data_Grib/' + file_name
                # Skip the combinations for which no file was downloaded.
                if not os.path.isfile(data_file):
                    continue

                try:
                    # Convert the date information into a Timestamp.
                    time = get_Timestamp(year, month, day, hour1, forc1)

                    # The parsing of the Grib is based on:
                    #   http://geoexamples.blogspot.com/2013/05/drawing-wind-barbs-gdal-python.html
                    dataset = gdal.Open(data_file, GA_ReadOnly)
                    if dataset is None:
                        raise IOError('GDAL could not open the file')

                    # Band numbers and levels of both wind components.
                    u_band_id, u_band_level = find_band_number(dataset, 'UGRD')
                    v_band_id, v_band_level = find_band_number(dataset, 'VGRD')
                    # Pair U and V bands by level rather than by list position.
                    v_by_level = dict(zip(v_band_level, v_band_id))

                    # Keep a single level if the variable is defined; the
                    # level list is narrowed together with the band list so
                    # that the label and the V band stay consistent.
                    if 'only_level' in globals():
                        ind_lvl = u_band_level.index(only_level)
                        u_band_id = [u_band_id[ind_lvl]]
                        u_band_level = [u_band_level[ind_lvl]]

                    # The coordinate grid is the same for every band of the
                    # file, so it is built once. Multiplying an integer range
                    # guarantees exactly xsize / ysize coordinates.
                    # NOTE(review): GDAL geotransforms usually describe the
                    # corner of the first pixel - confirm whether a half-cell
                    # shift is needed to obtain cell centres.
                    geo = dataset.GetGeoTransform()
                    xsize = dataset.RasterXSize
                    ysize = dataset.RasterYSize
                    longs = geo[0] + geo[1] * np.arange(xsize)
                    lats = geo[3] + geo[5] * np.arange(ysize)
                    # Bring longitudes above 180 back below it (x - 360).
                    longs = np.where(longs > 180, longs - 360, longs)
                    longs, lats = np.meshgrid(longs, lats)

                    for u_id, level in zip(u_band_id, u_band_level):
                        band_u = dataset.GetRasterBand(u_id)
                        band_v = dataset.GetRasterBand(v_by_level[level])

                        values_u = band_u.ReadAsArray(0, 0, xsize, ysize)
                        values_v = band_v.ReadAsArray(0, 0, xsize, ysize)

                        # Convert the level to a DataFrame and store it.
                        dt = to_DataFrame(values_u, values_v, lats, longs, level,
                                          lat_min, lat_max, long_min, long_max, time)
                        DT.append(dt)
                # Report the problem and carry on with the next file;
                # KeyboardInterrupt is deliberately not caught.
                except Exception as err:
                    print('Error in:', file_name, '-', err)
print('It is over!!!')

# Save the DataFrames

In [None]:
# Fail with an explicit message when no file could be converted.
if not DT:
    raise ValueError('No GRIB file was converted: there is nothing to save.')
# Concatenate all DataFrames into a single one.
my_data = pd.concat(DT)
# Show the first lines of the DataFrame (a bare expression in the middle of a
# cell displays nothing, hence the print).
print(my_data.head())
# Sort the values with respect to the Timestamp.
my_data = my_data.sort_values('Timestamp')
# Read the name of the output file from the keyboard.
name = input("Choose a filename: ")
# Save the file with the given name and the extension .hdf; `key` is passed
# by keyword because pandas has deprecated passing it positionally.
file_name = name + '.hdf'
my_data.to_hdf(file_name, key='Wind_vector', mode='w')