In [2]:
import pandas as pd
import os

# Replacce the following path with your own file. It can be downloaded from:
# http://www.sidc.be/silso/INFO/sndtotcsv.php
path = "./data/"
    
filename = os.path.join(path,"SN_d_tot_V2.0.csv")    
names = ['year', 'month', 'day', 'dec_year', 'sn_value' , 'sn_error', 'obs_num']
df = pd.read_csv(filename,sep=';',header=None,names=names,na_values=['-1'], index_col=False)

print("Starting file:")
print(df[0:10])

print("Ending file:")
print(df[-10:])

Starting file:
   year  month  day  dec_year  sn_value  sn_error  obs_num
0  1818      1    1  1818.001        -1       NaN        0
1  1818      1    2  1818.004        -1       NaN        0
2  1818      1    3  1818.007        -1       NaN        0
3  1818      1    4  1818.010        -1       NaN        0
4  1818      1    5  1818.012        -1       NaN        0
5  1818      1    6  1818.015        -1       NaN        0
6  1818      1    7  1818.018        -1       NaN        0
7  1818      1    8  1818.021        65      10.2        1
8  1818      1    9  1818.023        -1       NaN        0
9  1818      1   10  1818.026        -1       NaN        0
Ending file:
       year  month  day  dec_year  sn_value  sn_error  obs_num
73373  2018     11   21  2018.889         0       0.0       31
73374  2018     11   22  2018.892         0       0.0       15
73375  2018     11   23  2018.895         0       0.0       22
73376  2018     11   24  2018.897        15       2.7        8
73377  2

In [3]:
start_id = max(df[df['obs_num'] == 0].index.tolist())+1  # Find the last zero and move one beyond
print(start_id)
df = df[start_id:] # Trim the rows that have missing observations

11314


In [4]:
df_train = df[df['year']<2000]
df_test = df[df['year']>=2000]

spots_train = df_train['sn_value'].tolist()
spots_test = df_test['sn_value'].tolist()

print("Training set has {} observations.".format(len(spots_train)))
print("Test set has {} observations.".format(len(spots_test)))

Training set has 55160 observations.
Test set has 6909 observations.


In [5]:
import numpy as np

def to_sequences(seq_size, obs):
    x = []
    y = []

    for i in range(len(obs)-SEQUENCE_SIZE-1):
        #print(i)
        window = obs[i:(i+SEQUENCE_SIZE)]
        after_window = obs[i+SEQUENCE_SIZE]
        window = [[x] for x in window]
        #print("{} - {}".format(window,after_window))
        x.append(window)
        y.append(after_window)
        
    return np.array(x),np.array(y)
    
    
SEQUENCE_SIZE = 13
x_train,y_train = to_sequences(SEQUENCE_SIZE,spots_train)
x_test,y_test = to_sequences(SEQUENCE_SIZE,spots_test)

print("Shape of training set: {}".format(x_train.shape))
print("Shape of test set: {}".format(x_test.shape))

Shape of training set: (55146, 13, 1)
Shape of test set: (6895, 13, 1)


In [7]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.callbacks import EarlyStopping
import numpy as np

print('Build model...')
model = Sequential()
model.add(LSTM(64, dropout=0.0, recurrent_dropout=0.0,input_shape=(None, 1)))
model.add(Dense(32))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
print('Train...')

model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)

Build model...
Train...
Train on 55146 samples, validate on 6895 samples
Epoch 1/1000
 - 36s - loss: 1275.3469 - val_loss: 213.4907
Epoch 2/1000
 - 26s - loss: 517.8004 - val_loss: 210.9896
Epoch 3/1000
 - 35s - loss: 510.5297 - val_loss: 204.2945
Epoch 4/1000
 - 35s - loss: 505.8942 - val_loss: 211.2363
Epoch 5/1000
 - 35s - loss: 502.0622 - val_loss: 202.2300
Epoch 6/1000
 - 43s - loss: 502.1896 - val_loss: 226.6865
Epoch 7/1000
 - 35s - loss: 500.8095 - val_loss: 205.8072
Epoch 8/1000
 - 38s - loss: 497.5710 - val_loss: 198.3519
Epoch 9/1000
 - 29s - loss: 495.5105 - val_loss: 225.0722
Epoch 10/1000
 - 35s - loss: 499.5712 - val_loss: 221.1868
Epoch 11/1000
 - 35s - loss: 497.2455 - val_loss: 214.2965
Epoch 12/1000
 - 33s - loss: 500.9527 - val_loss: 205.9829
Epoch 13/1000
 - 35s - loss: 498.3945 - val_loss: 197.7749
Epoch 14/1000
 - 43s - loss: 496.6274 - val_loss: 205.2819
Epoch 15/1000
 - 35s - loss: 494.4255 - val_loss: 195.1709
Epoch 16/1000
 - 43s - loss: 495.7825 - val_loss: 

<keras.callbacks.History at 0x129c2208>

In [8]:
from sklearn import metrics

pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Score (RMSE): {}".format(score))


Score (RMSE): 14.205087990645266
