# Importing modules

In [1]:
import pandas as pd
import numpy as np
import time
import warnings
import itertools
from scipy import stats as st
import statsmodels as sm
import statsmodels.api as sma
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import h5py
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats.kde import gaussian_kde
from scipy.stats import norm
import statisticalTools as sT
from sklearn.neighbors.kde import KernelDensity
warnings.filterwarnings("ignore")

%matplotlib inline 
#%matplotlib 
plt.figure(figsize=(15, 6))

  from pandas.core import datetools


<matplotlib.figure.Figure at 0x1b27264cef0>

<matplotlib.figure.Figure at 0x1b27264cef0>

# Defining functions

In [2]:
def mid(x):
    '''Returns a vector with the mean of two consecutive values.'''
    middle = np.zeros(len(x)-1)
    for i in range(len(x)-1):
        middle[i] = (x[i+1]+x[i])/2
    return middle

# Reading data and extracting information

In [3]:
#Read data
file_name = 'data/GENS_test_AllLevels3'+'.hdf'
dt = pd.read_hdf(file_name, 'Wind_vector')
#Sort the dataset in function of date
dt = dt.set_index('Timestamp')
dt = dt.sort_index()

In [4]:
#Read data
file_name = 'data/gens-a_3'+'.hdf'
DT = pd.read_hdf(file_name, 'Wind_vector')
DT = DT.set_index('Timestamp')
DT = DT.sort_index()

In [7]:
#Selects the time steps presents on both datasets.
ind0 = dt.index.unique()
ind1 = DT.index.unique()
ind = set(ind0) & set(ind1)

dtt = dt.loc[ind]
DTT = DT.loc[ind]

dtc = dtt.where(dtt.Forecast == '000').where(dtt.NForecast == 999).dropna(how='all') #Observation - Most probable forecast
dtd = dtt.where(dtt.Forecast == '006').where(dtt.NForecast == 999).dropna(how='all') #Forecast - Most probable forecast

In [9]:
ind

{Timestamp('2018-05-01 06:00:00'),
 Timestamp('2018-05-01 12:00:00'),
 Timestamp('2018-05-01 18:00:00'),
 Timestamp('2018-05-02 00:00:00'),
 Timestamp('2018-05-02 06:00:00'),
 Timestamp('2018-05-02 12:00:00'),
 Timestamp('2018-05-02 18:00:00'),
 Timestamp('2018-05-03 00:00:00'),
 Timestamp('2018-05-03 06:00:00'),
 Timestamp('2018-05-03 12:00:00'),
 Timestamp('2018-05-03 18:00:00'),
 Timestamp('2018-05-04 00:00:00'),
 Timestamp('2018-05-04 06:00:00'),
 Timestamp('2018-05-04 12:00:00'),
 Timestamp('2018-05-04 18:00:00'),
 Timestamp('2018-05-05 00:00:00'),
 Timestamp('2018-05-05 06:00:00'),
 Timestamp('2018-05-05 12:00:00'),
 Timestamp('2018-05-05 18:00:00'),
 Timestamp('2018-05-06 00:00:00'),
 Timestamp('2018-05-06 06:00:00'),
 Timestamp('2018-05-06 12:00:00'),
 Timestamp('2018-05-06 18:00:00'),
 Timestamp('2018-05-07 00:00:00'),
 Timestamp('2018-05-07 06:00:00'),
 Timestamp('2018-05-07 12:00:00'),
 Timestamp('2018-05-07 18:00:00'),
 Timestamp('2018-05-08 00:00:00'),
 Timestamp('2018-05-

In [8]:
#Selects the latitude, longitudes and levels steps presents on both datasets.
lats0 = dtt.where(dtt.NForecast == 999).dropna(how = 'all').lat.unique()
longs0 = dtt.where(dtt.NForecast == 999).dropna(how = 'all').long.unique()
levels0 = dtt.where(dtt.NForecast == 999).dropna(how = 'all').level.unique()

lats1 = DTT.lat.unique()
longs1 = DTT.long.unique()
levels1 = DTT.level.unique()

lats = list(set(lats0) & set(lats1))
longs = list(set(longs0) & set(longs1))
levels = list(set(levels0) & set(levels1))

#shows the latitudes, longitudes and levels presents in both datasets
print(lats)
print(longs)
print(levels)

[50.5, 51.5, 52.5, 53.5, 54.5, 55.5, 56.5, 57.5, 58.5, 59.5]
[0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5]
[100000.0, 20000.0, 40000.0, 1000.0, 85000.0, 5000.0, 25000.0, 30000.0, 10000.0, 70000.0, 50000.0, 92500.0]


## Prints one example

In [None]:
#Takes one sample of the data
df = DTT

for i in range(1):
    
    df = df.where(df.lat == lats[0] )
    df = df.where(df.long == longs[0] )
    df = df.where(df.level == levels[0] )
    df = df.dropna(how = 'all')
    
    oi = np.random.randint(0,len(df))
    df = df.loc[df.index[oi]]
    print(oi)
    
    
df['Wx'].plot(kind="hist",bins=10)
plt.show()
df['Wx'].plot(kind="kde")
plt.show()

# Vizualising data

In [None]:
#Selecting data to compare

dx = dt
dt = dtt
dt = dt.sort_index()
dt = dt.where(dt.Forecast == '000')
dt = dt.where(dt.NForecast == 999)
dy = dt.dropna()
#dt = dt.ix[dt.index[1]]

dt = dtt
dt = dt.sort_index()
dt = dt.where(dt.Forecast == '006')
dt = dt.where(dt.NForecast == 999)
dyf = dt.dropna()

dt = dx

In [None]:
#Calculating plots

oi = 0

dfp = DTT
dyp = dy
dyfp = dyf

dfp = dfp.where(dfp.lat == lats[oi] )
dfp = dfp.where(dfp.long == longs[oi] )
dfp = dfp.where(dfp.level == levels[oi] )
dfp = dfp.dropna(how = 'all')

dyp = dyp.where(dyp.lat == lats[oi] )
dyp = dyp.where(dyp.long == longs[oi]) 
dyp = dyp.where(dyp.level == levels[oi]) 
dyp = dyp.dropna(how='all')

dyfp = dyfp.where(dyfp.lat == lats[oi]) 
dyfp = dyfp.where(dyfp.long == longs[oi]) 
dyfp = dyfp.where(dyfp.level == levels[oi]) 
dyfp = dyfp.dropna(how='all')

## wrt time

In [None]:
# Winds -> Wx

az = dyp.Wx.plot(label='Observed',marker='o',linestyle='',figsize=(15, 6), color = 'yellow',markersize = 8)
ax = dfp.Wx.plot(label='forecast',marker='1',linestyle='',figsize=(15, 6), color = 'red')
azz = dyfp.Wx.plot(label='1Forecast',marker='*',linestyle='',figsize=(15, 6), color = 'k')

ax.set_xlabel('Date')
ax.set_ylabel('Winds - Wx')

ini = 3;end = ini +20  * 5
axes = plt.gca()
#axes.set_xlim([dfp.index[ini-2],dfp.index[end+2]])
#axes.set_ylim([-15, 0])
plt.legend()
plt.show()

In [None]:
# Winds -> Wy

az = dyp.Wy.plot(label='Observed',marker='o',linestyle='',figsize=(15, 6), color = 'yellow',markersize = 8)
ax = dfp.Wy.plot(label='forecast',marker='1',linestyle='',figsize=(15, 6), color = 'red')
azz = dyfp.Wy.plot(label='1Forecast',marker='*',linestyle='',figsize=(15, 6), color = 'k')

ax.set_xlabel('Date')
ax.set_ylabel('Winds - Wy')
axes = plt.gca()
plt.legend()
plt.show()

In [None]:
# Winds -> Absolute value - W_abs

az = np.sqrt(dyp.Wx*dyp.Wx+dyp.Wy*dyp.Wy).plot(label='Observed',marker='o',linestyle='',figsize=(15, 6), color = 'yellow',markersize = 8)
ax = np.sqrt(dfp.Wx*dfp.Wx+dfp.Wy*dfp.Wy).plot(label='forecast',marker='1',linestyle='',figsize=(15, 6), color = 'red')
azz = np.sqrt(dyfp.Wx*dyfp.Wx+dyfp.Wy*dyfp.Wy).plot(label='1Forecast',marker='*',linestyle='',figsize=(15, 6), color = 'k')

ax.set_xlabel('Date')
ax.set_ylabel('Vents - W_abs')
axes = plt.gca()
plt.legend()
plt.show()

In [None]:
# Winds -> direction - Angle

az = np.degrees(np.arctan2(dyp.Wy,dyp.Wx)).plot(label='Observed',marker='o',linestyle='',figsize=(15, 6), color = 'yellow',markersize = 8)
ax = np.degrees(np.arctan2(dfp.Wy,dfp.Wx)).plot(label='forecast',marker='1',linestyle='',figsize=(15, 6), color = 'red')
azz = np.degrees(np.arctan2(dyfp.Wy,dyfp.Wx)).plot(label='1Forecast',marker='*',linestyle='',figsize=(15, 6), color = 'k')

ax.set_xlabel('Date')
ax.set_ylabel('Winds - Angle (degrees)')
axes = plt.gca()
axes.set_ylim([-200, 200])
plt.legend()
plt.show()

## wrt the probabilities distribution

### Calculating values

In [None]:
#PDF and CDF

#CDF
val = np.array (df.Wx.values)
dist = ECDF(val)
ser = df.Wx
sq = ser.value_counts()
sp = sq.sort_index().cumsum()*1./len(df.Wx)

##
d1_np = np.array(df.Wx)
mu,stdv = norm.fit(d1_np)

name, param = sT.best_fit_distribution(d1_np,10)
result = getattr(st, name)

# Estimating the pdf
KDEpdf = gaussian_kde(d1_np)
xmean = np.abs(np.mean(d1_np))
xmin = np.min(d1_np)-.2*xmean
xmax = np.max(d1_np)+.2*xmean
x = np.linspace(xmin,xmax,1000)

# Estimating cdf
KDEcdf = np.zeros([len(x),1])
for i in range(len(x)):
    KDEcdf[i] = KDEpdf.integrate_box_1d(-np.inf,x[i])

### Plots

In [None]:
#CDF
plt.plot(dist.x,dist.y, label = 'Empirical CDF', color = 'k',alpha = .8)
ser.hist(cumulative=True, density=1, bins=10, label = 'Hist 10 bins', color = 'c')
plt.plot(sp, marker = 'X', linestyle='',label = 'Empirical points', color = 'w')
plt.plot(x,result.cdf(x,*param), label = name)
plt.plot(x,KDEcdf, label = 'kde')
plt.legend()
plt.show()

In [None]:
##PDFs
plt.plot(x,KDEpdf(x),'r',label="KDE estimation",color="blue")
plt.hist(d1_np,normed=1,color="cyan",alpha=.8,label = 'Data hist')
plt.plot(x,norm.pdf(x,mu,stdv),label="PDF - Normal",color="red", linestyle = '--')
plt.plot(x,result.pdf(x,*param), color = 'gold', label= name)
plt.legend()
plt.title("Returns: PDF")
plt.show()

##CDFs
plt.plot(x,norm.cdf(x,mu,stdv),label="CDF - Normal",color="r")
plt.plot(sp, marker = 'X', linestyle='',color = 'cyan',label='Empirical CDF')
plt.plot(x,result.cdf(x,*param), label = name, linestyle = '--',color ='gold')
plt.plot(x,KDEcdf, label = 'kde', color = 'blue')
plt.legend()
plt.title("Returns: CDF")
plt.show()

##Random sample
plt.plot(d1_np[:],label="Actual", marker = 'o', linestyle = '-',markerfacecolor = 'white')
plt.plot(KDEpdf.resample(size=20)[0],label="KDE",marker = 'o', linestyle = ':',markerfacecolor = 'k')
plt.plot(result.rvs(*param,size=20),label=name, marker = 'o', linestyle = '--',markerfacecolor = 'g')
plt.title("Simulated returns from KDE estimation compared to actual return")
plt.legend()
plt.show()

#Verifying sampled histogram
plt.hist(d1_np,label="Actual", normed = 1, color = 'k')
n = 1000
plt.hist(KDEpdf.resample(size=20*n)[0],label="KDE",alpha = .7,normed=True, color = 'green')
plt.hist(result.rvs(*param,size=20*n),label=name,alpha = .7,normed = True, color = 'red')
plt.legend()
plt.show()

In [None]:
#Possibles Histograms

ri0, ri1 = np.histogram(d1_np, bins = 20, density = 'True')
ai0, ai1 = np.histogram(d1_np, bins = 10, density = 'True')
bi0, bi1 = np.histogram(d1_np, bins = 'doane', density = 'True')
ci0, ci1 = np.histogram(d1_np, bins = 'fd', density = 'True')
di0, di1 = np.histogram(result.rvs(*param,size = 100), bins ='doane',density = 'True')

plt.plot(mid(ri1),ri0,linestyle = '', marker = 'X', label = 'ref 20')
plt.plot(mid(ai1),ai0,linestyle = '', marker = 'X', label = '10 bins')
plt.plot(mid(bi1),bi0,linestyle = '', marker = 'X', label = 'doane')
plt.plot(mid(ci1),ci0,linestyle = '', marker = 'X', label = 'fd')
plt.plot(mid(di1),di0,linestyle = '', marker = 'X', label = name)

plt.legend()
plt.show()