# Time Series Analysis to Predict Sunspots

<img src='https://www.almanac.com/sites/default/files/styles/primary_image_in_article/public/image_nodes/sunspots.jpg?itok=6Fx0Px0U' alt='Sunspots' width='500' height='500'>
<br><br>
<b>Sunspots</b> are areas that appear dark on the surface of the Sun. They appear dark because they are cooler than other parts of the Sun’s surface. The temperature of a sunspot is still very hot though — around 6,500 degrees Fahrenheit!<br>
Sunspots are used to keep track of the solar cycle. The solar cycle is the cycle that the Sun’s magnetic field goes through approximately every 11 years.

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
import csv
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping,LearningRateScheduler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Bidirectional,Lambda,Conv1D,Dropout
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.metrics import mean_absolute_error,mean_squared_error
from tensorflow.keras.losses import Huber
from tensorflow.keras.utils import plot_model

# Reading Data

In [None]:
# Raw string fields collected from the CSV; converted to NumPy arrays below.
time = []
sunspots = []
# newline="" is what the csv module documentation asks for, so that the reader
# (not the file object) is responsible for interpreting line endings.
with open("../input/sunspots/Sunspots.csv", newline="") as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        if not row:
            # A blank line (e.g. at the end of the file) has no columns to index.
            continue
        # Column 0 is parsed as an integer step and column 2 as a float value below;
        # presumably these are the row index and the monthly sunspot number --
        # confirm against the CSV header.
        time.append(row[0])
        sunspots.append(row[2])

series = np.array(sunspots).astype(float)
time = np.array(time).astype(int)

# Exploratory Data Analysis

A **time series** is an ordered sequence of values spaced at equal intervals of time.

In [None]:
# Plot Time vs Series
def plot_series(time, series):
    """Draw ``series`` against ``time`` as a line on the current Matplotlib axes.

    The caller is expected to create the figure beforehand; this function only
    adds the line, a fixed title and the axis labels.

    Args:
        time: Sequence of x-axis values.
        series: Sequence of y-axis values, same length as ``time``.
    """
    plt.title("Variation of Sunspots with Time")
    # Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally,
    # while the keyword form also works on older releases.
    sns.lineplot(x=time, y=series)
    plt.xlabel("Time")
    plt.ylabel("Value")


# Open a 12x6 figure and draw the complete series on it.
plt.figure(figsize=(12, 6))
plot_series(time, series)

In [None]:
# Autocorrelation (left panel) and partial autocorrelation (right panel) of the series.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
acf_axis, pacf_axis = ax
# The returned figures are bound to names so the notebook does not echo them.
auto = plot_acf(series, ax=acf_axis)
partial = plot_pacf(series, ax=pacf_axis)
plt.show()

**ACF**: The autocorrelation function gives the correlation of a series with its own lagged values. It describes how strongly the present value of the series is related to its past values.
Here the ACF is significant for about **30 values**. This means the current value depends on roughly the previous 30 values.<br>
**PACF**: The "partial" correlation between two variables is the amount of correlation between them that is not explained by their mutual correlations with a specified set of other variables. Here the PACF is significant for about **6 values.**

# Preparing Training and Validation Data

We have to split our time series into a training period and a validation period. The split time is 3000, which means that indices 0 up to (but not including) 3000 are used for training and indices 3000 to the end are used for validation.

In [None]:
# Index at which the series is cut: [0, split_time) trains, [split_time, end) validates.
split_time = 3000
# np.split with a single cut index returns exactly those two consecutive pieces.
time_train, time_valid = np.split(time, [split_time])
x_train, x_valid = np.split(series, [split_time])
# Bare expression so the notebook echoes the chosen split index.
split_time

In [None]:
# Parameters passed to windowed_dataset / model_forecast by the cells below:
#   window_size    - number of consecutive values in each input window
#   batch_size     - number of windows per batch
#   shuffle_buffer - buffer size handed to Dataset.shuffle
window_size, batch_size, shuffle_buffer = 60, 100, 1000

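Now we will define a function to create a windowed dataset. In a windowed dataset, the previous n values can be seen as the input features, and the value that follows them is the output label. A windowed dataset consists of windows of a fixed size. In the implementation below, each label is the input window shifted one step ahead, so the model learns to predict the next value at every position in the window.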

In [None]:
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Build a shuffled, batched ``tf.data`` pipeline of (input, target) windows.

    Every example is cut from ``window_size + 1`` consecutive values of
    ``series``. The input is the first ``window_size`` of them and the target
    is the same stretch shifted one step ahead (the last ``window_size``).

    Args:
        series: Values to slice into windows.
        window_size: Length of each input (and target) sequence.
        batch_size: Number of examples per batch.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    span = window_size + 1

    def as_tensor(window):
        # Each window is itself a small dataset; batching it by its full
        # length turns it into a single tensor.
        return window.batch(span)

    def split_inputs_and_targets(chunk):
        return chunk[:-1], chunk[1:]

    windows = tf.data.Dataset.from_tensor_slices(series).window(
        span, shift=1, drop_remainder=True
    )
    # Shuffle whole windows before splitting them into inputs and targets.
    examples = (
        windows.flat_map(as_tensor)
        .shuffle(shuffle_buffer)
        .map(split_inputs_and_targets)
    )
    return examples.batch(batch_size).prefetch(1)

We will also define a function to make a forecast based on our model.

In [None]:
def model_forecast(model, series, batch_size, window_size):
    """Run ``model`` over every sliding window of ``series``.

    Windows of ``window_size`` consecutive values are taken with a stride of
    one, kept in their original order (no shuffling) and fed to
    ``model.predict`` in batches.

    Args:
        model: Object exposing ``predict`` that accepts a ``tf.data.Dataset``.
        series: Values to slice into windows.
        batch_size: Number of windows per prediction batch.
        window_size: Length of each window.

    Returns:
        Whatever ``model.predict`` returns for the batched windows.
    """
    def as_tensor(window):
        # Collapse the per-window dataset into a single tensor.
        return window.batch(window_size)

    windows = tf.data.Dataset.from_tensor_slices(series).window(
        window_size, shift=1, drop_remainder=True
    )
    batches = windows.flat_map(as_tensor).batch(batch_size).prefetch(1)
    return model.predict(batches)

# Time Series Prediction Model

In [None]:
# Start from a clean Keras state so this run does not build on earlier graphs.
tf.keras.backend.clear_session()

train = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer)
val = windowed_dataset(x_valid, window_size, batch_size, shuffle_buffer)

model = Sequential()
# Append a trailing channel axis so Conv1D receives (batch, steps, 1);
# input_shape=[None] leaves the sequence length unspecified.
model.add(Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[None]))
# Causal padding keeps the output length equal to the input length.
model.add(Conv1D(filters=60, kernel_size=5, strides=1, padding='causal', activation='relu'))
# return_sequences=True on both LSTMs: one output per time step, matching the
# sequence-shaped targets produced by windowed_dataset.
model.add(LSTM(120, return_sequences=True))
model.add(LSTM(120, return_sequences=True))
model.add(Dense(60, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1))
# Multiply the network output by 400 -- presumably to bring it up to the
# magnitude of the sunspot values; confirm against the data range.
model.add(Lambda(lambda x: x * 400))

# Learning-rate sweep: start at 1e-8 and grow by a factor of 10 every 20 epochs.
lr_schedule = LearningRateScheduler(lambda epoch: 1e-8 * 10 ** (epoch / 20))
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
model.compile(loss=Huber(), optimizer=SGD(learning_rate=1e-8, momentum=0.9), metrics=['mae'])
history = model.fit(train, epochs=100, validation_data=val, callbacks=[lr_schedule])

After training the model with the Learning Rate Scheduler, let's plot the graph of "learning rate" vs "loss". This will help us select the best learning rate.

In [None]:
# Plot for selecting the learning rate: training loss against the learning rate
# used in each epoch of the sweep, with a logarithmic x-axis.
plt.semilogx(history.history["lr"], history.history["loss"])
# Restrict the view to learning rates 1e-8..1e-3 and losses 0..100.
plt.axis([1e-8, 1e-3, 0, 100])
# The x-axis shows the learning rate, not the epoch number.
plt.xlabel("Learning Rate")
plt.ylabel("Loss")

From this plot we select a learning rate of **8e-6**.

In [None]:
# Final model, trained with a fixed learning rate of 8e-6

# Start from a clean Keras state so this run does not build on the sweep model.
tf.keras.backend.clear_session()

train = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer)
val = windowed_dataset(x_valid, window_size, batch_size, shuffle_buffer)

model = Sequential()
# Append a trailing channel axis so Conv1D receives (batch, steps, 1);
# input_shape=[None] leaves the sequence length unspecified.
model.add(Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[None]))
# Causal padding keeps the output length equal to the input length.
model.add(Conv1D(filters=60, kernel_size=5, strides=1, padding='causal', activation='relu'))
# return_sequences=True on both LSTMs: one output per time step, matching the
# sequence-shaped targets produced by windowed_dataset.
model.add(LSTM(120, return_sequences=True))
model.add(LSTM(120, return_sequences=True))
model.add(Dense(60, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1))
# Multiply the network output by 400 -- presumably to bring it up to the
# magnitude of the sunspot values; confirm against the data range.
model.add(Lambda(lambda x: x * 400))

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
model.compile(loss=Huber(), optimizer=SGD(learning_rate=8e-6, momentum=0.9), metrics=['mae'])
history = model.fit(train, epochs=200, validation_data=val)

Let's plot the following graphs:
* "mae" vs "validation mae"
* "loss" vs "validation loss"

In [None]:
# Plotting graphs for mae and loss
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: Metric name; the validation curve is read from
            ``"val_" + string``.
    """
    val_key = "val_" + string
    # Training curve first, validation curve second -- same order as the legend.
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# One 12x6 figure per metric: first the MAE curves, then the loss curves.
for metric in ("mae", "loss"):
    plt.figure(figsize=(12, 6))
    plot_graphs(history, metric)
plt.show()

In [None]:
# Forecast
# Predict over every window of the full series (a trailing axis is added first).
all_window_outputs = model_forecast(model, series[..., np.newaxis], batch_size, window_size)
# The window starting at split_time - window_size ends just before split_time,
# so its last output step lines up with the first validation point.
first_valid_window = split_time - window_size
# Keep the last time step (channel 0) of each window from there on; the final
# window is dropped, presumably because no later value exists to compare it with.
forecast = all_window_outputs[first_valid_window:-1, -1, 0]

In [None]:
# Predicted Plot: actual validation values and the forecast on shared axes.
plt.figure(figsize=(12, 6))
# Draw order matches the legend entries below.
for values in (x_valid, forecast):
    plot_series(time_valid, values)
plt.legend(["Actual", "Forecast"])

# Result

In [None]:
# Error of the forecast against the validation values, converted to NumPy for printing.
mae_value = mean_absolute_error(x_valid, forecast).numpy()
print("Mean Absolute Error: ", mae_value)
mse_value = mean_squared_error(x_valid, forecast).numpy()
print("Mean Squared Error:", mse_value)