# Importing modules

In [1]:
import pandas as pd
import numpy as np
import time
import warnings
import itertools
from scipy import stats as st
import statsmodels as sm
import statsmodels.api as sma
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import h5py
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats.kde import gaussian_kde
from scipy.stats import norm
import statisticalTools as sT
from sklearn.neighbors.kde import KernelDensity
warnings.filterwarnings("ignore")

%matplotlib inline 
#%matplotlib 
plt.figure(figsize=(15, 6))

  from pandas.core import datetools


<matplotlib.figure.Figure at 0x1a49f64fef0>

<matplotlib.figure.Figure at 0x1a49f64fef0>

# Defining functions

In [2]:
%matplotlib inline 
#%matplotlib 

plt.figure(figsize=(15, 6))


def find_nearest(array, value):
    '''Return the element of ``array`` that lies closest to ``value``.

    ``array`` is converted with ``np.asarray``; when several elements are
    equally close, the first one (lowest index) is returned.
    '''
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    closest = distances.argmin()
    return candidates[closest]

def mid(x):
    '''Return the midpoints of consecutive values of ``x``.

    Parameters
    ----------
    x : sequence of numbers
        Values to average pairwise (for example histogram bin edges).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x) - 1`` whose i-th entry is
        ``(x[i] + x[i+1]) / 2``. An empty array is returned when ``x`` holds
        fewer than two values.
    '''
    values = np.asarray(x, dtype=float)
    # Fewer than two values means there is no pair to average.
    if values.size < 2:
        return np.zeros(0)
    # Vectorised pairwise mean: shifted views replace the explicit index loop.
    return (values[1:] + values[:-1]) / 2

def give_me(x0,x1,prob, tol = 0.03,est = 0):
    '''Grow an interval of histogram bins around the most probable bin.

    Starting from the bin with the largest probability, the neighbouring bin
    (left or right) with the larger probability is added at each step until
    the accumulated probability plus ``tol`` reaches ``prob``.

    Parameters
    ----------
    x0 : array-like
        Height (density) of each bin.
    x1 : array-like
        With ``est == 0``: the bin edges (one more entry than ``x0``); the bin
        probabilities are ``np.diff(x1) * x0``. Otherwise the bin
        probabilities are taken as ``x1 * x0``.
        NOTE(review): in that second mode ``x1`` is still indexed like an
        edge array for the returned bounds -- confirm the intended input.
    prob : float
        Target probability to enclose; must be <= 1.
    tol : float
        The search stops as soon as ``accumulated + tol >= prob``.
    est : int
        Flag selecting how the bin probabilities are computed (see ``x1``).

    Returns
    -------
    tuple or None
        ``(probb, va, vb, abs(va - vb), vstar)`` where ``probb`` is the
        accumulated probability, ``va``/``vb`` are the ``x1`` entries at the
        lower/upper end of the selected bins and ``vstar`` is the centre of
        the most probable bin. ``None`` is returned (after printing a
        message) when ``prob > 1`` or when the iteration guard trips.
    '''

    #The sum of a distribution is <= 1
    if prob > 1:
        print('Put a value <= than 1')
        return None
    if est == 0:
        est = np.diff(x1)*x0 #probability of each bin (width * height)
    else:
        est = x1*x0
    n_bins = len(x0)
    x_max = np.argmax(est) #index of the most probable bin
    probb = est[x_max]
    x_act_min = x_max
    x_act_max = x_max
    count = 0
    vstar = (x1[x_max]+x1[x_max+1])/2

    while probb + tol < prob:
        count = count + 1
        can_go_left = x_act_min > 0
        can_go_right = x_act_max + 1 < n_bins
        # -1 marks a side that has no bin left to add.
        neig0 = est[x_act_min - 1] if can_go_left else -1
        neig1 = est[x_act_max + 1] if can_go_right else -1

        if neig0 > neig1:
            go_left = True
        elif neig0 < neig1:
            go_left = False
        elif can_go_left and can_go_right:
            # Tie between both neighbours: look ahead and move towards the
            # side whose remaining bins hold the larger peak. Only bins
            # strictly outside the current interval are considered, so the
            # right-hand slice starts at x_act_max + 1.
            go_left = np.max(est[:x_act_min]) > np.max(est[x_act_max + 1:])
        elif can_go_left:
            go_left = True
        elif can_go_right:
            go_left = False
        else:
            # Every bin is already inside the interval and the target is
            # still not reached: return what has been accumulated.
            warnings.warn("Warning: Verify the function give_me or try reducing the probability to 0.99")
            va = x1[x_act_min]
            vb = x1[1 +x_act_max]
            return probb, va, vb, abs(va-vb), vstar

        if go_left:
            probb = probb + neig0
            x_act_min = x_act_min - 1
        else:
            probb = probb + neig1
            x_act_max = x_act_max + 1

        # Safety guard against a loop that never terminates.
        if count > n_bins:
            print('Impossible to finish')
            return None

    va = x1[x_act_min]
    vb = x1[1 +x_act_max]
    return probb, va, vb, abs(va-vb), vstar

def proba(x0,x1):
    '''Return the probability of each bin: bin width (from the edges ``x1``)
    multiplied by the bin height ``x0``.'''
    widths = np.diff(x1)
    return widths * x0

def verify_res(nforc,method,probability = 1, tole = 'None'):
    '''Compute the interval of a distribution fitted to ensemble forecast data.

    Parameters
    ----------
    nforc : array-like
        Ensemble forecast values.
    method : str
        One of ``'hist10'`` (10-bin histogram), ``'kde'`` (Gaussian KDE
        evaluated on a 100-point grid), ``'kde_modif'`` (10-bin histogram of
        100 values resampled from a Gaussian KDE), ``'best_dist'`` (best
        fitting scipy distribution chosen by ``sT.best_fit_distribution``)
        or ``'norm'`` (fitted normal distribution).
    probability : float
        Probability the interval should contain.
    tole : float, None or the string 'None'
        Tolerance forwarded to ``give_me`` by the histogram/KDE methods.
        ``None`` / ``'None'`` selects the method default (0.05 for the
        histogram methods, 0.0001 for ``'kde'``).

    Returns
    -------
    tuple or None
        ``(prob, va, vb)``: for the histogram/KDE methods ``prob`` is the
        probability accumulated by ``give_me``; for ``'best_dist'`` and
        ``'norm'`` it is ``probability`` itself. ``va`` and ``vb`` are the
        interval bounds. ``None`` is returned (after printing a message)
        for an unknown ``method``.
    '''
    # Accept a real None in addition to the historical 'None' string sentinel.
    use_default_tol = tole is None or (isinstance(tole, str) and tole == 'None')

    if method == 'hist10':
        ai0, ai1 = np.histogram(nforc, bins = 10, density = True)
        if use_default_tol:
            tole = 0.05
        prob, va, vb, _, _ = give_me(ai0,ai1,probability,tol = tole)
    elif method == 'kde':
        KDEpdf = gaussian_kde(nforc)
        # Pad the evaluation grid by a quarter of |mean| on both sides.
        mean = np.abs(np.mean(nforc))
        xmin = np.min(nforc)- .25*mean
        xmax = np.max(nforc)+ .25*mean
        x = np.linspace(xmin,xmax,100)
        xprob = KDEpdf(x)
        # Average the density at both ends of each grid cell: one height per cell.
        xprob = (xprob[:-1]+xprob[1:])/2
        if use_default_tol:
            tole = 0.0001
        prob, va, vb, _, _ = give_me(xprob,x,probability,tol = tole)
    elif method == 'kde_modif':
        KDEpdf = gaussian_kde(nforc)
        samp = KDEpdf.resample(100)
        ai0, ai1 = np.histogram(samp, bins = 10, density = True)
        if use_default_tol:
            tole = 0.05
        prob, va, vb, _, _ = give_me(ai0,ai1,probability,tol = tole)
    elif method  == 'best_dist':
        name, param = sT.best_fit_distribution(nforc)
        result = getattr(st, name)
        prob = probability
        va,vb = result.interval(prob,*param)
    elif method == 'norm':
        param = norm.fit(nforc)
        prob = probability
        va,vb = norm.interval(prob,*param)
    else:
        print('Method not known')
        return None
    return prob, va, vb

<matplotlib.figure.Figure at 0x1a4a37bdac8>

# Reading data and extracting information

In [3]:
#Load the 'Wind_vector' table of the first dataset, indexed and ordered by Timestamp
file_name = 'data/GENS_test_AllLevels3'+'.hdf'
dt = pd.read_hdf(file_name, 'Wind_vector').set_index('Timestamp').sort_index()

In [4]:
#Load the 'Wind_vector' table of the second dataset, indexed and ordered by Timestamp
file_name = 'data/gens-a_3'+'.hdf'
DT = pd.read_hdf(file_name, 'Wind_vector').set_index('Timestamp').sort_index()

In [None]:
#Selects the time steps present in both datasets.
ind0 = dt.index.unique()
ind1 = DT.index.unique()
ind = set(ind0) & set(ind1)

#.loc rejects a set as indexer in recent pandas versions, so index with a
#sorted list of the shared timestamps (ind itself stays a set).
ind_sorted = sorted(ind)
dtt = dt.loc[ind_sorted]
DTT = DT.loc[ind_sorted]

#Rows with NForecast == 999 are treated as the most probable forecast.
dtc = dtt.where(dtt.Forecast == '000').where(dtt.NForecast == 999).dropna(how='all') #Observation - Most probable forecast
dtd = dtt.where(dtt.Forecast == '006').where(dtt.NForecast == 999).dropna(how='all') #Forecast - Most probable forecast

In [None]:
#Selects the latitudes, longitudes and levels present in both datasets.
#The NForecast == 999 rows of the first dataset are filtered only once.
main_rows = dtt.where(dtt.NForecast == 999).dropna(how = 'all')
lats0 = main_rows.lat.unique()
longs0 = main_rows.long.unique()
levels0 = main_rows.level.unique()

lats1, longs1, levels1 = DTT.lat.unique(), DTT.long.unique(), DTT.level.unique()

lats = list(set(lats0) & set(lats1))
longs = list(set(longs0) & set(longs1))
levels = list(set(levels0) & set(levels1))

#shows the latitudes, longitudes and levels present in both datasets
for shared_values in (lats, longs, levels):
    print(shared_values)

In [None]:
#Selecting data to compare

#dx keeps a reference to the full first dataset; dt is left untouched.
dx = dt

#Rows with Forecast '000' and NForecast 999, keeping only complete rows.
dy = (dtt.sort_index()
         .where(lambda frame: frame.Forecast == '000')
         .where(lambda frame: frame.NForecast == 999)
         .dropna())

#Same selection for Forecast '006'.
dyf = (dtt.sort_index()
          .where(lambda frame: frame.Forecast == '006')
          .where(lambda frame: frame.NForecast == 999)
          .dropna())

dt = dx

In [None]:

#Index of the latitude/longitude/level triple to extract
oi = 0


def _at_grid_point(frame, lat, lon, level):
    '''Keep the rows of ``frame`` matching the given lat, long and level.'''
    picked = frame.where(frame.lat == lat)
    picked = picked.where(picked.long == lon)
    picked = picked.where(picked.level == level)
    return picked.dropna(how='all')


dfp = _at_grid_point(DTT, lats[oi], longs[oi], levels[oi])
dyp = _at_grid_point(dy, lats[oi], longs[oi], levels[oi])
dyfp = _at_grid_point(dyf, lats[oi], longs[oi], levels[oi])

# Verify results using a pointwise metric

In [None]:
#defining variables and parameters
count_loop = 0   #number of levels processed so far

param = 'Wy'     #column compared between observation and forecast

count = 0        #number of observations falling inside the interval
count0 = 0
sizet = 0        #number of comparisons performed
evalu = 0        #running sum of the interval probabilities

#Keep only the time steps present in both the observations and the ensemble.
ind0 = dtc.index.unique()
ind1 = DTT.index.unique()
ind = set(ind0) & set(ind1)

#.loc rejects a set as indexer in recent pandas versions, so index with a
#sorted list of the shared timestamps (ind itself stays a set).
ind_sorted = sorted(ind)
dtc = dtc.loc[ind_sorted]
DTT = DTT.loc[ind_sorted]
dtd = dtd.loc[ind_sorted]

lats = DTT.lat.unique()
longs = DTT.long.unique()
levels = DTT.level.unique()

#if one wants to test fast if it works
#levels = levels[6]
#lats = lats[0:1]
#longs = longs[0:1]

In [None]:
#For every level / latitude / longitude / time step: fit an interval to the
#ensemble values and count how often the observation falls inside it.
t_t = time.time()
probability= .7  #probability requested for the interval
for i in levels:
    count_loop += 1
    print(count_loop,'of',len(levels), end = ' ||| ')

    #The level filter does not depend on the grid point: apply it once per level.
    obs_level = dtc.where(dtc.level == i)
    ens_level = DTT.where(DTT.level == i)
    forc_level = dtd.where(dtd.level == i)

    for j in range(len(lats)):
        for k in range(len(longs)):
            temp = obs_level.where(obs_level.lat == lats[j]).where(obs_level.long == longs[k]).dropna(how='all')

            #Ensemble members at this grid point; verify_res needs them as nforc.
            temp0 = ens_level.where(ens_level.lat == lats[j]).where(ens_level.long == longs[k]).dropna(how='all')

            temp1 = forc_level.where(forc_level.lat == lats[j]).where(forc_level.long == longs[k]).dropna(how='all')

            for timer in ind:

                Obs = temp.loc[timer]
                Nforc = temp0.loc[timer]
                ONEforc = temp1.loc[timer]

                nforc = Nforc[param]
                oneforc = ONEforc[param]
                Obs = Obs[param]

                prob,va,vb = verify_res(nforc,'norm',probability,tole = 'None')

                #The observation counts as a hit when it lies strictly inside (va, vb).
                Result_max = (Obs < vb)
                Result_min = (Obs > va)
                Result = (Result_max & Result_min)

                count += Result
                sizet += 1
                evalu += prob


    print(str(time.time()-t_t)+ " have passed since the start of the calcul.")  
    #Remaining time estimated from the average time per processed level.
    print(str( (time.time()-t_t)*(len(levels)-count_loop)/count_loop)+ "s to finish")  
print(str(time.time()-t_t)+ " seconds to calculate")            
#Mean interval probability over all comparisons
evalu = evalu/sizet

print('#','prob == ', probability, 'val =', count/sizet*100,'%','evalu=',evalu*100,'%')

In [None]:
#z-test of the observed hit proportion against the mean interval probability
alpha = 1
prop = evalu
n_trials = sizet*alpha
print('Considered proportion:',prop)
zval,pval = sm.stats.proportion.proportions_ztest(count,n_trials,prop)
print(count,'occurrences amid a total of',n_trials)
above_threshold = pval>0.05
print('P-value of the test:',pval,'meaning the test is:',above_threshold)