In [None]:
# Fetch the pyESN repository; the `from pyESN import ESN` import further down needs its pyESN.py.
! git clone https://github.com/sunone5/pyESN/

In [None]:
# Copy files from the cloned repository into the current directory so that
# `pyESN` is importable as a top-level module.
# NOTE(review): only pyESN.py appears to be used by this notebook; the other
# copies look unnecessary — confirm before removing.
! cp /kaggle/working/pyESN/testing.py .
! cp /kaggle/working/pyESN/freqgen.ipynb .
! cp /kaggle/working/pyESN/mackey_glass_t17.npy .
! cp /kaggle/working/pyESN/mackey.ipynb .
! cp /kaggle/working/pyESN/pyESN.py .

In [None]:
import os

# Print the full path of every file found under the Kaggle working directory.
for root, _subdirs, names in os.walk('/kaggle/working'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import pandas_profiling

from pyESN import ESN

from warnings import filterwarnings
filterwarnings('ignore')

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

%matplotlib inline
# Show every column when displaying wide frames. The fully qualified key is
# required: pandas treats the key as a regex pattern, and the abbreviated
# 'max.columns' is ambiguous on versions that also define
# 'styler.render.max_columns', which raises OptionError.
pd.set_option('display.max_columns', None)

In [None]:
# All competition CSVs live under one dataset root, split into three sub-folders.
_jpx_root = "../input/jpx-tokyo-stock-exchange-prediction"
_train_dir = _jpx_root + "/train_files"
_supplemental_dir = _jpx_root + "/supplemental_files"

# Training files
df_stock_prices = pd.read_csv(_train_dir + "/stock_prices.csv")
df_financials = pd.read_csv(_train_dir + "/financials.csv")
df_options = pd.read_csv(_train_dir + "/options.csv")
df_stock_list = pd.read_csv(_jpx_root + "/stock_list.csv")
df_trades = pd.read_csv(_train_dir + "/trades.csv")
df_secondary_sp = pd.read_csv(_train_dir + "/secondary_stock_prices.csv")

# Supplemental files
df_supplemental_sp = pd.read_csv(_supplemental_dir + "/stock_prices.csv")
df_supplemental_ssp = pd.read_csv(_supplemental_dir + "/secondary_stock_prices.csv")

# NOTE(review): despite its name, this frame is read from sample_submission.csv.
df_test_prices = pd.read_csv(_jpx_root + "/example_test_files/sample_submission.csv")

In [None]:
# Compare the row counts of the loaded tables in a bar chart.
datasets = [
    df_stock_prices,
    df_financials,
    df_options,
    df_stock_list,
    df_trades,
    df_secondary_sp,
    df_supplemental_sp,
    df_supplemental_ssp,
]
labels = ['stock_prices', 'financials', 'options', 'stock_list',
          'trades', 'secsp', 'supsp', 'supsecsp']

# One row count per table, in the same order as `labels`.
sizes = list(map(len, datasets))

plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=sizes)

In [None]:
# Display the (label, row count) pairs behind the bar chart as exact numbers.
tuple(zip(labels,sizes))

In [None]:
# Profiling report for the primary stock price table. Unlike the reports below,
# this one is not built with minimal=True.
# NOTE(review): profile_report is presumably attached to DataFrame by the
# pandas_profiling import — confirm.
stock_prices_profile = df_stock_prices.profile_report(title='SP Profiling Report')
stock_prices_profile

In [None]:
# Preview the first five rows of the primary stock price table.
df_stock_prices.head(5)

In [None]:
# Summary statistics of the primary stock price table.
df_stock_prices.describe()

In [None]:
# Minimal-mode profiling report for the secondary stock price table.
secondary_sp_profile = df_secondary_sp.profile_report(title='SSP Profiling Report', minimal=True)
secondary_sp_profile

In [None]:
# Minimal-mode profiling report for the supplemental stock price table.
supplemental_sp_profile = df_supplemental_sp.profile_report(title='SuplimentalSP Profiling Report', minimal=True)
supplemental_sp_profile

In [None]:
# Minimal-mode profiling report for the supplemental secondary stock price table.
supplemental_ssp_profile = df_supplemental_ssp.profile_report(title='SuplimentalSecSP Profiling Report', minimal=True)
supplemental_ssp_profile

In [None]:
# Minimal-mode profiling report for the financials table.
financials_profile = df_financials.profile_report(title='Financials Profiling Report', minimal=True)
financials_profile

In [None]:
# Minimal-mode profiling report for the options table.
options_profile = df_options.profile_report(title='Options Profiling Report', minimal=True)
options_profile

In [None]:
# Minimal-mode profiling report for the stock list table.
stock_list_profile = df_stock_list.profile_report(title='Stock_list Profiling Report', minimal=True)
stock_list_profile

In [None]:
# Minimal-mode profiling report for the trades table.
trades_profile = df_trades.profile_report(title='Trades Profiling Report', minimal=True)
trades_profile

> stock_prices - ExpectedDividend has 2313666 (99.2%) missing values
> 
> secondary_sp - ExpectedDividend has 2366117 (99.2%) missing values
> 
> supplemental_sp - ExpectedDividend has 194037 (99.0%) missing values
> 
> supplemental_ssp - ExpectedDividend has 205111 (99.1%) missing values

#### Since the ExpectedDividend column in each dataset consists almost entirely of missing values, we can drop it along with the RowId column.

In [None]:
# Stack the primary, secondary and both supplemental price tables into one frame
# with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement and builds the result in one pass instead of
# re-copying the growing frame on every append.
prices = pd.concat(
    [df_stock_prices, df_secondary_sp, df_supplemental_sp, df_supplemental_ssp],
    ignore_index=True,
)

In [None]:
# Remove the row identifier and the almost-empty dividend column.
# errors='ignore' keeps the cell working if either column is already absent.
_unused_columns = ['RowId', 'ExpectedDividend']
df_prices = prices.drop(columns=_unused_columns, errors='ignore')
df_prices.head()

In [None]:
# How many rows have no closing price? (rows, columns) of that subset.
_close_missing = df_prices['Close'].isna()
df_prices.loc[_close_missing].shape

In [None]:
# Peek at the first rows that have no closing price.
df_prices.loc[df_prices['Close'].isna()].head()

In [None]:
# Parse the Date column, then derive calendar parts from it as new columns.
df_prices['Date'] = pd.to_datetime(df_prices['Date'])

_date_parts = df_prices['Date'].dt
df_prices['Year'] = _date_parts.year
df_prices['Month'] = _date_parts.month
df_prices['Day'] = _date_parts.day

In [None]:
# Histogram of the dates on which a closing price is missing.
_missing_close_dates = df_prices.loc[df_prices['Close'].isna(), 'Date'].sort_values()
sns.histplot(_missing_close_dates)

plt.xlabel('Date')
plt.ylabel('Number of Stocks with Missing Close Price')
plt.show()

We can see a spike in missing closing price information towards the end of 2020.

In [None]:
# Distribution of the number of rows (trading days) available per security.
_rows_per_stock = df_prices.groupby('SecuritiesCode').size()
sns.histplot(_rows_per_stock)

plt.xlabel('Count of Days')
plt.ylabel('Number of Stocks')
plt.show()

Most stocks have between 1100 and 1200 days of data.

In [None]:
# Number of distinct securities seen in each calendar year.
stocks_unique = df_prices[['Year', 'SecuritiesCode']].drop_duplicates()

_stocks_per_year = stocks_unique.groupby(['Year']).size()
_stocks_per_year.plot(kind='bar')

plt.ylabel('Number of stocks')

# Investigate the Stock List

In [None]:
# Number of listed stocks per 33-sector industry, largest first.
plt.figure(figsize=(20, 8))

# Count once and reuse the result for both the bar heights and the bar labels.
y = df_stock_list['33SectorName'].value_counts().sort_values(ascending=False)
x = y.index
sns.barplot(x=x, y=y)

plt.xticks(rotation=90)
plt.show()

# Investigate the Target and Target Means

In [None]:
# Distribution of each stock's mean Target, one panel per year from 2017 to 2021.
# Rows are SecuritiesCode, columns are Year.
df_annual_mean_target = df_prices.groupby(['Year','SecuritiesCode'])['Target'].mean().unstack(level = 0)

years = range(2017, 2022)
fig, axes = plt.subplots(1, len(years), figsize=(20, 4))

for ax, year in zip(axes, years):
    sns.histplot(df_annual_mean_target[year], ax=ax)
    ax.set_xlim(-0.02, 0.02)
    ax.set_ylabel('')
    # Put the year in the title so the otherwise identical panels can be told apart.
    ax.set_title(f'Target Means ({year})')

In [None]:
# Distribution of each stock's Target standard deviation, one panel per year
# from 2017 to 2021. Rows are SecuritiesCode, columns are Year.
df_annual_std_target = df_prices.groupby(['Year','SecuritiesCode'])['Target'].std().unstack(level = 0)

years = range(2017, 2022)
fig, axes = plt.subplots(1, len(years), figsize=(20, 4))

for ax, year in zip(axes, years):
    sns.histplot(df_annual_std_target[year], ax=ax)
    # A standard deviation is never negative, so start the axis at zero.
    ax.set_xlim(0, 0.1)
    ax.set_ylabel('')
    # These panels show standard deviations, not means; label them accordingly.
    ax.set_title(f'Target Std ({year})')

In [None]:
# Attach the stock list metadata (including sector names) to every price row.
df_stock_prices_with_info = df_prices.merge(df_stock_list, on='SecuritiesCode')

# Number of distinct sectors present after the join.
df_stock_prices_with_info['33SectorName'].nunique()

In [None]:
# Target distribution for three sectors, drawn side by side.
# Each entry is (value in the 33SectorName column, panel title).
_sector_panels = [
    ('Information & Communication', 'Information and Communication'),
    ('Services', 'Services'),
    ('Retail Trade', 'Retail Trade'),
]

fig = plt.figure(figsize=(16, 4))

for _position, (_sector, _title) in enumerate(_sector_panels, start=1):
    plt.subplot(1, 3, _position)
    _in_sector = df_stock_prices_with_info['33SectorName'] == _sector
    sns.histplot(df_stock_prices_with_info.loc[_in_sector, 'Target'])
    plt.title(_title)

plt.show()

In [None]:
# Does the amount of history a stock has relate to its mean Target?
n_stocks_per_date = df_prices.groupby(['Date'])['SecuritiesCode'].count()

_per_stock = df_prices.groupby(['SecuritiesCode'])
n_dates_per_stock = _per_stock['Date'].count()
target_mean_per_stock = _per_stock['Target'].mean()

# Both series are indexed by SecuritiesCode, so each point is one stock.
plt.figure(figsize=(10, 10))
sns.regplot(x=n_dates_per_stock, y=target_mean_per_stock)
plt.xlabel('Number of dates per Stock')
plt.ylabel('Target Mean')
plt.show()

# Analyze one selected stock

In [None]:
# Select a single stock by its SecuritiesCode.
# Take an explicit copy: later cells add 'diff' and 'color' columns to this
# frame, and writing into a filtered selection of df_prices triggers pandas'
# SettingWithCopyWarning (hidden here only because warnings are silenced).
df_any_one_stock = df_prices[df_prices['SecuritiesCode']==1377].copy()
df_any_one_stock.head()

In [None]:
# Closing price (left axis) with traded volume as bars on a hidden secondary axis.
from plotly.subplots import make_subplots

_trade_dates = df_any_one_stock['Date']
_price_line = go.Scatter(x=_trade_dates, y=df_any_one_stock['Close'], name='Price')
_volume_bars = go.Bar(x=_trade_dates, y=df_any_one_stock['Volume'], name='Volume')

fig2 = make_subplots(specs=[[{"secondary_y": True}]])
fig2.add_trace(_price_line, secondary_y=False)
fig2.add_trace(_volume_bars, secondary_y=True)

# Stretch the volume axis to four times the peak so the bars stay in the
# bottom quarter of the figure, then hide that axis.
max_vol = df_any_one_stock['Volume'].max()
fig2.update_yaxes(range=[0, max_vol*4], secondary_y=True)
fig2.update_yaxes(visible=False, secondary_y=True)

fig2.show()

In [None]:
# Candlestick chart with a 20-day moving average and colour-coded volume bars.

# The intraday move (close minus open) decides the colour of each volume bar.
df_any_one_stock['diff'] = df_any_one_stock['Close'] - df_any_one_stock['Open']

_closed_up = df_any_one_stock['diff'] >= 0
_closed_down = df_any_one_stock['diff'] < 0
df_any_one_stock.loc[_closed_up, 'color'] = 'green'
df_any_one_stock.loc[_closed_down, 'color'] = 'red'

_trade_dates = df_any_one_stock['Date']
_candles = go.Candlestick(
    x=_trade_dates,
    open=df_any_one_stock['Open'],
    high=df_any_one_stock['High'],
    low=df_any_one_stock['Low'],
    close=df_any_one_stock['Close'],
    name='Price',
)
_moving_average = go.Scatter(
    x=_trade_dates,
    y=df_any_one_stock['Close'].rolling(window=20).mean(),
    marker_color='blue',
    name='20 Day MA',
)
_volume_bars = go.Bar(
    x=_trade_dates,
    y=df_any_one_stock['Volume'],
    name='Volume',
    marker={'color': df_any_one_stock['color']},
)

fig3 = make_subplots(specs=[[{"secondary_y": True}]])
fig3.add_trace(_candles)
fig3.add_trace(_moving_average)
fig3.add_trace(_volume_bars, secondary_y=True)

# Keep the volume bars in the bottom quarter of the figure and hide their axis.
max_vol = df_any_one_stock['Volume'].max()
fig3.update_yaxes(range=[0, max_vol*4], secondary_y=True)
fig3.update_yaxes(visible=False, secondary_y=True)
fig3.update_layout(xaxis_rangeslider_visible=False)  # hide range slider
fig3.show()

In [None]:
# Box plots of the closing price: by year (trend) on the left, by month
# (seasonality) on the right.
fig, axes = plt.subplots(1, 2, figsize=(20,7), dpi= 80)
sns.boxplot(x='Year', y='Close', data=df_any_one_stock, ax=axes[0])
# Target the right-hand panel explicitly instead of relying on whichever axes
# happens to be current.
# NOTE(review): the 1991/2008 exclusion looks like a carry-over from another
# dataset — confirm it is still wanted.
sns.boxplot(x='Month', y='Close', data=df_any_one_stock.loc[~df_any_one_stock.Year.isin([1991, 2008]), :], ax=axes[1])

# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=18)
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()

# Find out whether the time series is stationary or non-stationary

In [None]:
# Stationarity is checked with the Augmented Dickey-Fuller (ADF) test.

def adfuller_test(data):
    """Run the Augmented Dickey-Fuller test on `data` and print a verdict.

    H0 (null hypothesis): the series is non-stationary.
    H1 (alternative):     the series is stationary.

    Prints the test statistic, the p-value, the critical values and a
    conclusion based on a 0.05 significance level. Returns nothing.
    """
    outcome = adfuller(data)
    statistic, p_value, critical_values = outcome[0], outcome[1], outcome[4]

    print(f'ADF Statistic: {statistic:f}')
    print(f'p-value: {p_value:f}')
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold:.3f}')

    # H0 is rejected only when the p-value does not exceed 0.05.
    if p_value <= 0.05:
        print('Conclusion: Pvalue < 0.05 , so we reject H0. We conclude the time series is stationary')
    else:
        print('Conclusion: Pvalue > 0.05 , so we fail to reject H0. We conclude the time series is non-stationary')

In [None]:
# Run the ADF test on the selected stock's closing prices, ignoring missing values.
_close_observed = df_any_one_stock['Close'].dropna()
adfuller_test(_close_observed)

According to the result above, the time series is non-stationary, so we need to make it stationary.

In [None]:
# First-order differencing: each close minus the previous row's close.
# This overwrites the 'diff' column that earlier held close minus open.
df_any_one_stock['diff'] = df_any_one_stock['Close'].diff()

# The first row has no predecessor, so drop its NaN before testing.
adfuller_test(df_any_one_stock['diff'].dropna())

# Check for Autocorrelation (ACF) & Partial Autocorrelation (PACF)

In [None]:
# Autocorrelation and partial autocorrelation of the differenced series, 25 lags each.
_differenced = df_any_one_stock['diff'].dropna()

sm.graphics.tsa.plot_acf(_differenced, lags=25)
sm.graphics.tsa.plot_pacf(_differenced, lags=25)

plt.show()

In [None]:
# Box plots of the Target: by year (trend) on the left, by month (seasonality)
# on the right.
fig, axes = plt.subplots(1, 2, figsize=(20,7), dpi= 80)
sns.boxplot(x='Year', y='Target', data=df_any_one_stock, ax=axes[0])
# Target the right-hand panel explicitly instead of relying on whichever axes
# happens to be current.
# NOTE(review): the 1991/2008 exclusion looks like a carry-over from another
# dataset — confirm it is still wanted.
sns.boxplot(x='Month', y='Target', data=df_any_one_stock.loc[~df_any_one_stock.Year.isin([1991, 2008]), :], ax=axes[1])

# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=18)
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()

# ECHO State Network Preparation

In [None]:
# Closing prices with missing values removed; this is the series fed to the ESN.
# NOTE(review): df_prices is the concatenation of every price table and holds
# all securities, so this column is not one stock's time series — consecutive
# values presumably belong to different stocks. Confirm whether a single-stock
# series (e.g. df_any_one_stock['Close']) was intended; switching would also
# require a training length that fits that stock's history.
esn_data = df_prices['Close'].dropna()
esn_data.head(10)

# Data for the ESN must be one-dimensional

In [None]:
# Plain one-dimensional float64 array holding the same values as esn_data.
esndata = np.array(esn_data, dtype='float64')
esndata

First, we create our echo state network using some reasonable values and specify the training and validation lengths. We then create functions to calculate the mean squared error and to run an echo state network for specific values of the spectral radius, the noise, and the window length.

# Part 1: Predictions with the RC - Reservoir Computing (RC) RNN

In [None]:
# Reservoir configuration
n_reservoir, sparsity, rand_seed = 500, 0.2, 23
spectral_radius, noise = 1.2, .0005

# Build the echo state network: one input channel, one output channel.
esn = ESN(
    n_inputs=1,
    n_outputs=1,
    n_reservoir=n_reservoir,
    sparsity=sparsity,
    random_state=rand_seed,
    spectral_radius=spectral_radius,
    noise=noise,
)

# Train on a sliding window of `trainlen` points, predict the next `future`
# points, and repeat until `futureTotal` predictions have been collected.
trainlen, future, futureTotal = 1500, 10, 100
pred_tot = np.zeros(futureTotal)

for window_start in range(0, futureTotal, future):
    train_window = esndata[window_start:trainlen + window_start]  # DATA FEEDING HERE
    # The network input is a constant signal of ones; only the teacher changes.
    pred_training = esn.fit(np.ones(trainlen), train_window)
    prediction = esn.predict(np.ones(future))
    pred_tot[window_start:window_start + future] = prediction[:, 0]

Now we can simply run one function and obtain our prediction, and then we can plot this to see how well we did.

In [None]:
# Plot the tail of the training data and the validation window together with
# the ESN's free-running predictions.
from matplotlib import rc
rc('text', usetex=False)

history_start = 1000  # first sample shown; earlier training data is left out for readability
plot_end = trainlen + futureTotal

plt.figure(figsize=(16,8))
plt.plot(range(history_start, plot_end), esndata[history_start:plot_end], 'b', label="Data", alpha=0.3)  # data feeding
plt.plot(range(trainlen, plot_end), pred_tot, 'k', alpha=0.8, label='Free Running ESN')

# Dotted vertical line marking where training data ends and prediction begins.
lo,hi = plt.ylim()
plt.plot([trainlen,trainlen],[lo+np.spacing(1),hi-np.spacing(1)],'k:', linewidth=4)

plt.title(r'Ground Truth and Echo State Network Output', fontsize=25)
plt.xlabel(r'Time (Days)', fontsize=20,labelpad=10)
# The prices come from the Tokyo Stock Exchange dataset, so label them in yen, not dollars.
plt.ylabel(r'Price (JPY)', fontsize=20,labelpad=10)
plt.legend(fontsize='xx-large', loc='best')
sns.despine()

# Part 2: Hyper-parameters optimization

# Note
We need to find the optimal values of spectral_radius and noise, i.e. those that give the best prediction of this time series. In other words, we are looking for the hyper-parameter set that returns the minimum mean squared error (MSE). For each pair (spectral_radius, noise) we predict two time points ahead using the previous 1500 points, and repeat this until 100 future points are covered, following the same procedure as in part 1. These 100 points are essentially our validation set. We then calculate the MSE loss on the validation set and visualize which pair gives the lowest MSE.
Hints:

* Build a function for the MSE
* Perform a grid search for the hyper parameters spectral_radius and noise
* Use the ranges: spectral_radius = [0.5, 1.5] and noise = [0.0001, 0.01]
* For each set of (spectral_radius, noise) train the RC and make predictions as in part 1; these predictions are the validation sets
* For each validation set calculate the MSE and store it
* Make a 2D color plot to show the MSE for the different values of spectral_radius and noise

In order to obtain a result this good, we had to do a significant amount of hyperparameter optimization, here is the procedure that was done to obtain the hyperparameters used in the above results.

In [None]:
# Build a function for the MSE
def MSE(yhat, y):
    """Return the mean squared error between predictions and targets.

    Parameters
    ----------
    yhat : array-like
        Predicted values; any shape, flattened before comparison.
    y : array-like
        Observed values; any shape, flattened before comparison.

    Returns
    -------
    float
        Mean of the squared element-wise differences.

    Notes
    -----
    The previous version took the square root of this quantity and therefore
    returned the RMSE while being named and plotted as MSE. The ranking of
    hyper-parameters is unchanged because the square root is monotonic.
    """
    predicted = np.asarray(yhat, dtype=float).ravel()
    # Flatten the targets too, so a column vector cannot broadcast against the
    # flattened predictions into an (n, n) matrix.
    observed = np.asarray(y, dtype=float).ravel()
    return np.mean((predicted - observed) ** 2)

In [None]:
# Grid search over spectral radius and noise. Every pair is scored on the same
# validation window of `futureTotal` points that follows the first `trainlen` points.
n_reservoir = 500
sparsity = 0.2
rand_seed = 23

# Search grid. (A smaller grid that was immediately overwritten has been removed.)
radius_set = [0.5, 0.7, 0.9, 1, 1.1, 1.3, 1.5]
noise_set = [0.0001, 0.0003, 0.0007, 0.001, 0.003, 0.005, 0.007, 0.01]

radius_set_size = len(radius_set)
noise_set_size = len(noise_set)

trainlen = 1500
future = 2
futureTotal = 100

# loss[l, j] holds the score for radius_set[l] combined with noise_set[j].
loss = np.zeros([radius_set_size, noise_set_size])

# The validation targets are identical for every grid point, so slice them once.
validation_target = esndata[trainlen:trainlen+futureTotal]

for l, rho in enumerate(radius_set):
    for j, noise in enumerate(noise_set):

        pred_tot = np.zeros(futureTotal)

        # A fresh network per grid point, always built from the same seed.
        esn = ESN(n_inputs = 1,
          n_outputs = 1,
          n_reservoir = n_reservoir,
          sparsity=sparsity,
          random_state=rand_seed,
          spectral_radius = rho,
          noise=noise)

        # Slide the training window forward `future` points at a time and
        # predict the next `future` points after each refit.
        for i in range(0,futureTotal,future):
            pred_training = esn.fit(np.ones(trainlen),esndata[i:trainlen+i])  # data feeding here
            prediction = esn.predict(np.ones(future))
            pred_tot[i:i+future] = prediction[:,0]

        # Score this (rho, noise) pair on the validation window.
        loss[l, j] = MSE(pred_tot, validation_target)     # data feeding here
        print('rho = ', rho, ', noise = ', noise, ', MSE = ', loss[l][j] )

Make a 2D plot to show the MSE for the different values of spectral_radius and noise

In [None]:
# Heat map of the validation loss: spectral radius on x, noise on y.
plt.figure(figsize=(16, 8))

# Colour scale spans the smallest to the largest absolute loss.
_loss_magnitude = abs(loss)
im = plt.imshow(
    loss.T,
    vmin=_loss_magnitude.min(),
    vmax=_loss_magnitude.max(),
    origin='lower',
    cmap='PuRd',
)

# One tick per grid value, labelled with the hyper-parameter itself.
plt.xticks(np.arange(radius_set_size), radius_set);
plt.yticks(np.arange(noise_set_size), noise_set);
plt.xlabel(r'$\rho$', fontsize=16);
plt.ylabel('noise', fontsize=16);

cb = plt.colorbar(im);

# What is the optimal set?

In [None]:
# Locate the grid cell with the smallest loss.
# argmin + unravel_index always yields exactly one (row, column) pair, even when
# several cells tie for the minimum; the earlier np.where lookup returned index
# arrays that cannot be converted with int() in that case.
index_min = np.unravel_index(np.argmin(loss), loss.shape)
minLoss = loss[index_min]

# Rows of `loss` follow radius_set, columns follow noise_set.
rho_opt = radius_set[index_min[0]]
noise_opt = noise_set[index_min[1]]
print('The optimal set is:\nspectrum radius = ',
      rho_opt,'\nnoise = ',noise_opt,'\nMSE = ',minLoss)

# Explore the RC ability in long predictions
In the previous section we made predictions two time points ahead, using the previous 1500 points, for 100 future points in total. In this section we explore whether the prediction window (we used 10 in part 1 and 2 in part 2) affects the predictability. Using the optimal hyper-parameters found in part 2, perform the analysis for windows of 1, 2, 5, and 10. Does the MSE on the validation set depend on this window?

## Hints:

* Use the optimal set of hyper parameters that we found on part 2
* For this set repeat the calculation of the part 1 for different prediction windows and for the optimal window. In particular, instead of predicting two time points ahead we have to predict for 1, 2, 5, 10.
* For each prediction calculate the MSE in the validation set
* Plot the MSE as a function of the prediction window

In [None]:
# Measure how the validation error depends on the prediction window length.
n_reservoir = 500
sparsity = 0.2
rand_seed = 23

# Use the optimum located by the grid search instead of re-typing values by
# hand, so this experiment follows whatever the search actually found.
spectral_radius = rho_opt
noise = noise_opt

# let k_set be an array with the prediction-window values
k_set = [1,2,5,10]

k_size = len(k_set)
loss_k = np.zeros(k_size)

# Identical for every window length, so define them once outside the loop.
trainlen = 1500
futureTotal = 100
validation_target = esndata[trainlen:trainlen+futureTotal]

for l, future in enumerate(k_set):

    pred_tot = np.zeros(futureTotal)

    # A fresh network per window length, always built from the same seed.
    esn = ESN(n_inputs = 1,
          n_outputs = 1,
          n_reservoir = n_reservoir,
          sparsity=sparsity,
          random_state=rand_seed,
          spectral_radius = spectral_radius,
          noise=noise)

    # Refit on the sliding training window, then predict `future` points ahead.
    for i in range(0,futureTotal,future):
        pred_training = esn.fit(np.ones(trainlen),esndata[i:trainlen+i])
        prediction = esn.predict(np.ones(future))
        pred_tot[i:i+future] = prediction[:,0]

    loss_k[l] = MSE(pred_tot, validation_target)

## Plot the MSE as a function of the prediction window

In [None]:
# Validation error for each prediction window length.
_fig, _ax = plt.subplots(figsize=(14, 8))
_ax.plot(k_set, loss_k, '-ok', alpha=0.6)
_ax.set_title('MSE as a Function of Window Length', fontsize=20)
_ax.set_xlabel('Window Length', fontsize=18)
_ax.set_ylabel('MSE', fontsize=18)
sns.despine()

## Conclusion

In future predictions the error propagates in time and therefore grows with the prediction horizon. This is why the longer a prediction is, the harder it is to make. We can see this behavior in the plot above, where the MSE is a monotonically increasing function of the prediction window; hence longer predictions mean larger MSE.