> ## **Imports and data loading**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pytz import timezone
import pytz

%matplotlib inline

# Apply the 'fivethirtyeight' matplotlib style to the figures created from here on.
plt.style.use('fivethirtyeight')
# Read the raw dataset from a path relative to the notebook's working directory.
train = pd.read_csv('../input/SolarPrediction.csv')

## **Initial observations**

In [None]:
# Preview the first rows of the raw frame.
train.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
train.shape

In [None]:
# Column labels of the raw frame.
train.columns

In [None]:
# dtype pandas inferred for each column when reading the CSV.
train.dtypes

## **Data preprocessing**

### **UNIX Time to Datetime Transformation**

In [None]:
# Take two independent copies of the loaded frame.
df = train.copy()
train_origial = train.copy()

# UNIXTime is interpreted as seconds since the epoch.
# NOTE(review): a conversion of the index to the 'Pacific/Honolulu' timezone used to
# follow here but was commented out, so the timestamps carry no timezone. Confirm that
# the hour-based features derived later are meant to be computed on these values.
parsed_time = pd.to_datetime(df['UNIXTime'], unit='s')
train_origial.index = parsed_time

# Expose the parsed timestamps as a regular 'DateTime' column on both frames.
train_origial['DateTime'] = train_origial.index
train['DateTime'] = train_origial.index
train.head()

### **Generation DataFrame of Radiation**

In [None]:
# Drop the raw columns listed below; the cells that follow work with the remaining
# 'Radiation' and 'DateTime' columns.
train_radiation = train.drop(
    columns=[
        'UNIXTime', 'Data', 'Time', 'Temperature', 'TimeSunRise', 'TimeSunSet',
        'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed',
    ]
)

### **Time features**

In [None]:
# Add year / month / day / Hour columns, derived from 'DateTime', to both frames.
for frame in (train_radiation, train_origial):
    stamps = frame['DateTime'].dt
    frame['year'] = stamps.year
    frame['month'] = stamps.month
    frame['day'] = stamps.day
    frame['Hour'] = stamps.hour

In [None]:
# pandas numbers the days of the week from Monday=0 to Sunday=6.
temp_rad = train_radiation['DateTime']
train_radiation['Day of week'] = temp_rad.dt.dayofweek

In [None]:
# Weekend indicator for a single timestamp (of little relevance to the analysis).
def applyer(row):
    """Return 1 when `row.dayofweek` is 5 or 6 (Saturday or Sunday), otherwise 0."""
    return 1 if row.dayofweek in (5, 6) else 0
# Flag Saturdays (5) and Sundays (6) with 1 and every other day with 0.
# Computed through the vectorised .dt accessor instead of one Python call per row.
temp2 = train_radiation['DateTime'].dt.dayofweek.isin([5, 6]).astype('int64')
train_radiation['weekend'] = temp2

### **Change Index for datetime data type**

In [None]:
# Use the 'DateTime' column as the row index (the column itself is kept as well).
train_radiation.index = train_radiation['DateTime']

### **Preprocessing result**

In [None]:
# Preview the frame after the preprocessing steps above.
train_radiation.head()

## **Radiation Analysis and Data Visualization**

In [None]:
# Plot the complete radiation series against the frame's datetime index.
# `columns=` replaces the positional axis argument (`drop('DateTime', 1)`), which
# pandas 2.0 no longer accepts.
df_rad = train_radiation.drop(columns='DateTime')
ts = df_rad['Radiation']
plt.figure(figsize=(20, 5))
plt.title('Radiation vs Time')
plt.xlabel('Time (Year-Month-Day)')
plt.ylabel('Radiation level')
plt.plot(ts)

### **Clean index**

In [None]:
# Add a 'YYYY-MM-DD' string column and use it as the row index.
date_labels = pd.to_datetime(train_radiation['DateTime']).dt.strftime('%Y-%m-%d')
train_radiation['Date'] = date_labels
train_radiation.index = train_radiation['Date']
train_radiation.head()

In [None]:
# Bar chart of the mean radiation for each value of 'month'.
monthly_mean = train_radiation.groupby('month')['Radiation'].mean()
monthly_mean.plot.bar(figsize=(20, 5), title='Monthly Average Radiation', fontsize=14)

In [None]:
# Line plot of the mean radiation for each value of 'day' (day of the month).
radiation_by_day = train_radiation.groupby(['day'])['Radiation']
temp = radiation_by_day.mean()
temp.plot(title="Average per day radiation Month", figsize=(20, 5), fontsize=14)

In [None]:
# Line plot of the mean radiation for each (day, Hour) pair.
radiation_by_day_hour = train_radiation.groupby(['day', 'Hour'])['Radiation']
temp = radiation_by_day_hour.mean()
temp.plot(title="Average Radiation per Daily, Hour", figsize=(20, 5), fontsize=14)

In [None]:
# Line plot of the mean radiation for each value of 'Hour'.
radiation_by_hour = train_radiation.groupby(['Hour'])['Radiation']
temp = radiation_by_hour.mean()
temp.plot(title="Average Radiation per Hour", figsize=(20, 5), fontsize=14)

In [None]:
# Bar chart of the mean radiation for each value of 'Day of week'.
weekday_mean = train_radiation.groupby('Day of week')['Radiation'].mean()
weekday_mean.plot.bar(figsize=(20, 6), title='Average radiation per day per week')

## **Visualization decomposed by period**

In [None]:
# Index the frame by timestamp so that it can be resampled.
# 'DateTime' already holds datetime values, so no parsing format is passed.
train_radiation['Timestamp'] = pd.to_datetime(train_radiation.DateTime)
train_radiation.index = train_radiation.Timestamp

# Average the numeric columns only: the frame also carries string/datetime columns
# (e.g. 'Date'), and calling .mean() on those raises on pandas >= 2.0.
numeric_radiation = train_radiation.select_dtypes(include='number')

#Hourly
hourly = numeric_radiation.resample('H').mean()

#Daily
daily = numeric_radiation.resample('D').mean()

#Weekly
weekly = numeric_radiation.resample('W').mean()

#Monthly
monthly = numeric_radiation.resample('M').mean()

In [None]:
# One panel per resampling frequency, stacked vertically on a single 15x8 figure.
# The figure size is set once here instead of on every plot call.
fig, axs = plt.subplots(4, 1, figsize=(15, 8))

panels = (("Hourly", hourly), ("Daily", daily), ("Weekly", weekly), ("Monthly", monthly))
for ax, (panel_title, frame) in zip(axs, panels):
    frame.Radiation.plot(title=panel_title, fontsize=14, ax=ax)
plt.tight_layout()

## **Transformation and visualization of regular data**

The most regular data lie between the beginning of October and the end of November / beginning of December. The plots above show that data are missing between December and January, and again in September, so those ranges are excluded from the analysis.

In [None]:
# Restrict every resampled frame to the date range selected above in the text.
From = '2016-10-01'
To   = '2016-12-01'

hourly = hourly.loc[From:To, :]
daily = daily.loc[From:To, :]
weekly = weekly.loc[From:To, :]
monthly = monthly.loc[From:To, :]

# Same four-panel layout as before, now on the restricted range.
fig, axs = plt.subplots(4, 1, figsize=(15, 8))
panels = (("Hourly", hourly), ("Daily", daily), ("Weekly", weekly), ("Monthly", monthly))
for ax, (panel_title, frame) in zip(axs, panels):
    frame.Radiation.plot(title=panel_title, fontsize=14, ax=ax)
plt.tight_layout()

## **Look at stationarity**

It is assumed that the underlying time series is stationary. This assumption gives us some **nice** statistical properties that allow us to use several models for forecasting.

Stationarity is a statistical assumption that a time series has:

  - __A constant mean__
  - __A constant variance__
  - __An autocovariance that does not depend on time__

In short, if we use past data to predict future data, we must assume that the data will follow the same general trends and patterns as in the past. This general statement is valid for most training data and modeling tasks.

Sometimes we need to transform the data to make it stationary. However, this transformation then questions whether these data are really stationary and can be modeled using these techniques.

Source: https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/

### **Stationary series test function**

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(df, ts, window=12):
    """
    Plot rolling statistics of one column and print an Augmented Dickey-Fuller test.

    Parameters
    ----------
    df : DataFrame holding the series.
    ts : name of the column of `df` to analyse.
    window : number of observations in the rolling mean / standard deviation
        (defaults to 12, the value that was previously hard-coded).

    Returns
    -------
    None. The function draws one figure and prints the test summary.
    """
    series = df[ts]

    # Rolling statistics
    rolmean = series.rolling(window=window, center=False).mean()
    rolstd = series.rolling(window=window, center=False).std()

    # Plot the series together with its rolling statistics
    plt.plot(series, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Promedio')
    plt.plot(rolstd, color='black', label='Desviacion Estandar')

    plt.legend(loc='best')
    plt.title('Promedio y Desviacion Estandar para %s' % (ts))
    plt.xticks(rotation=45)
    plt.show(block=False)
    plt.close()

    # Dickey-Fuller test:
    # Null Hypothesis (H_0): time series is not stationary
    # Alternate Hypothesis (H_1): time series is stationary
    print('Results of Dickey-Fuller Test:')
    # Missing values are dropped first because the test cannot be run on NaNs.
    dftest = adfuller(series.dropna(), autolag='AIC')
    dfoutput = pd.Series(dftest[0:4],
                         index=['Test Statistic',
                                'p-value',
                                '# Lags Used',
                                'Number of Observations Used'])
    # dftest[4] maps each significance level to its critical value.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Rolling statistics plot and Dickey-Fuller test for the 'Radiation' column.
test_stationarity(df = train_radiation, ts = 'Radiation')

## **Hypothesis**

### What does the Dickey-Fuller Test tell us?

This is one of the statistical tests used to check for stationarity. Its null hypothesis is that the time series is not stationary. The test results consist of a test statistic and critical values for different confidence levels. If the "Test Statistic" is smaller than the "Critical Value (1%)", the null hypothesis is rejected and the series is therefore considered stationary.

In summary,

 Critical-Value = cv = -3.43 |
 Test Statistic = ts = -23.77

* H0: It is not stationary; ts> cv
* H1: It is stationary; ts <cv

Therefore, H0 is rejected, which indicates that the series **is stationary**.
 
Source: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/

## **Predictions**

### **Imports**

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
import statsmodels.api as sm
from statsmodels.tsa.api import Holt, ExponentialSmoothing, SimpleExpSmoothing
from statsmodels.tsa.stattools import acf, pacf
# The legacy `statsmodels.tsa.arima_model.ARIMA` is no longer usable in current
# statsmodels releases; prefer its replacement and fall back to the legacy location
# only when the new module is missing (old installations).
try:
    from statsmodels.tsa.arima.model import ARIMA
except ImportError:
    from statsmodels.tsa.arima_model import ARIMA

### **Divide data for training and validation**

In [None]:
# Chronological split of the hourly frame: the earlier range is used for fitting,
# the later range for validation.
_train = hourly.loc['2016-10-02':'2016-11-13']
valid = hourly.loc['2016-11-14':'2016-11-28']

In [None]:
# First rows of the training split.
_train.head()

In [None]:
# First rows of the validation split.
valid.head()

In [None]:
# Draw both splits on the same axes so the boundary between them is visible.
for split_label, split_frame in (('Train', _train), ('Valid', valid)):
    split_frame.Radiation.plot(figsize=(25, 5),
                               title='Radiacion Diaria',
                               fontsize=14,
                               label=split_label)
plt.xlabel('DateTime')
plt.ylabel('Radiation')
plt.legend(loc='best')

### **Decomposition by season**

In [None]:
plt.style.use('default')
# The decomposition's .plot() builds its own figure, so that figure is resized directly
# instead of opening a separate (and otherwise empty) one with plt.figure().
decomposition = sm.tsa.seasonal_decompose(_train.Radiation)
decomposition_fig = decomposition.plot()
decomposition_fig.set_size_inches(16, 8)
# Augmented Dickey-Fuller result for the training series, kept in `result` for inspection.
result = sm.tsa.stattools.adfuller(_train.Radiation)
plt.show()

### **Naive Approach**

In [None]:
# Naive forecast: repeat the last training observation over the whole validation window.
dd = np.asarray(_train.Radiation)
y_hat = valid.copy()
y_hat['naive'] = dd[-1]

plt.figure(figsize=(25, 5))
curves = (
    (_train.index, _train['Radiation'], 'Train'),
    (valid.index, valid['Radiation'], 'Validation'),
    (y_hat.index, y_hat['naive'], 'Naive'),
)
for x_values, y_values, curve_label in curves:
    plt.plot(x_values, y_values, label=curve_label)
plt.legend(loc='best')
plt.tick_params(axis='x', rotation=45)

#### **RMS error for the Naive Approach**

In [None]:
# Root-mean-squared error of the naive forecast on the validation window.
naive_mse = mean_squared_error(valid['Radiation'], y_hat['naive'])
rmse = sqrt(naive_mse)
rmse

### **Holt Linear**

In [None]:
# Holt's linear-trend method fitted on the training values with fixed smoothing
# parameters, then forecast over as many steps as there are validation rows.
y_hat_holt = valid.copy()
# NOTE(review): newer statsmodels releases name the second argument `smoothing_trend`;
# confirm which spelling the installed version expects.
fit1 = Holt(np.asarray(_train['Radiation'])).fit(smoothing_level = 0.01, smoothing_slope = 0.1)
y_hat_holt['Holt_linear'] = fit1.forecast(len(valid))
plt.style.use('fivethirtyeight')
plt.figure(figsize=(25,5))
plt.plot(_train.index, _train['Radiation'],label = 'Train')
plt.plot(valid.index, valid['Radiation'], label = 'Validation')
# Plot against this cell's own frame rather than `y_hat` from the naive-forecast cell,
# so the cell does not depend on a variable created elsewhere.
plt.plot(y_hat_holt.index, y_hat_holt['Holt_linear'], label = 'Holt Linear')
plt.legend(loc='best')

#### **RMS error for Holt Linear**

In [None]:
# Root-mean-squared error of the Holt linear forecast on the validation window.
holt_mse = mean_squared_error(valid['Radiation'], y_hat_holt['Holt_linear'])
rmse = sqrt(holt_mse)
rmse

### **Simple Exponential Smoothing**

In [None]:
# Simple exponential smoothing with a fixed smoothing level (no parameter optimisation),
# forecast over as many steps as there are validation rows.
y_hat_avg2 = valid.copy()
ses_model = SimpleExpSmoothing(np.asarray(_train['Radiation']))
fit2 = ses_model.fit(smoothing_level=0.02, optimized=False)
y_hat_avg2['SES'] = fit2.forecast(len(valid))

plt.figure(figsize=(25, 5))
for curve, curve_label in ((_train['Radiation'], 'Train'),
                           (valid['Radiation'], 'Test'),
                           (y_hat_avg2['SES'], 'SES')):
    plt.plot(curve, label=curve_label)
plt.legend(loc='best')
plt.show()

#### **RMS error for Simple Exponential Smoothing**

In [None]:
# Root-mean-squared error of the SES forecast on the validation window.
ses_mse = mean_squared_error(valid['Radiation'], y_hat_avg2['SES'])
rms = sqrt(ses_mse)
print("Error: ", rms)

### **Holt Winter**


In [None]:
# Holt-Winters with additive trend and additive seasonality, forecast over as many
# steps as there are validation rows.
# NOTE(review): seasonal_periods=4 is applied to a series resampled hourly -- confirm
# that a 4-step season is intended.
y_hat_avg = valid.copy()
holt_winter_model = ExponentialSmoothing(np.asarray(_train['Radiation']),
                                         seasonal_periods=4,
                                         trend='add',
                                         seasonal='add')
fit1 = holt_winter_model.fit()
y_hat_avg['Holt_Winter'] = fit1.forecast(len(valid))

plt.figure(figsize=(25, 5))
for x_values, y_values, curve_label in ((_train.index, _train['Radiation'], 'Train'),
                                        (valid.index, valid['Radiation'], 'Validation'),
                                        (y_hat_avg.index, y_hat_avg['Holt_Winter'], 'Holt_Winter')):
    plt.plot(x_values, y_values, label=curve_label)
plt.legend(loc='best')

#### **RMS error for Holt Winter**

In [None]:
# Root-mean-squared error of the Holt-Winters forecast on the validation window.
holt_winter_mse = mean_squared_error(valid['Radiation'], y_hat_avg['Holt_Winter'])
rms = sqrt(holt_winter_mse)
print("error: ", rms)

### **SARIMAX & ARIMA**

The ARIMA forecast for a stationary time series is no more than a linear equation (like a linear regression). The predictors depend on the parameters (p, d, q) of the ARIMA model:

   - **Number of AR (autoregressive) terms (p):** AR terms are simply lags of the dependent variable. For example, if p is 5, the predictors for x(t) will be x(t-1), ..., x(t-5).
   - **Number of MA (moving average) terms (q):** MA terms are lagged forecast errors in the prediction equation. For example, if q is 5, the predictors for x(t) will be e(t-1), ..., e(t-5), where e(i) is the difference between the moving average at instant i and the actual value.
   - **Number of differences (d):** the number of non-seasonal differences applied to the series; for example, d = 1 means the first-order difference is taken.
   
   Source: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/

#### ACF and PACF Plots
**How do we determine p, d and q?** For p and q, we can use the ACF and PACF plots (below).

**Autocorrelation function (ACF).** Correlation between the time series and a lagged version of itself.

**Partial autocorrelation function (PACF).** Additional correlation explained by each successive lagged term.

**How do we interpret the ACF and PACF plots?**

- p - Delay value where the PACF graph crosses the upper confidence interval for the first time.
- q - Delay value where the ACF graph crosses the upper confidence interval for the first time.

In [None]:
def plot_acf_pacf(df, ts, lag_acf=None, lag_pacf=None, nlags=20):
    """
    Plot auto-correlation function (ACF) and partial auto-correlation (PACF) plots.

    Parameters
    ----------
    df : DataFrame holding the series.
    ts : name of the column of `df` to analyse.
    lag_acf, lag_pacf : optional precomputed ACF / PACF values. When omitted they are
        computed from `df[ts]`, so the function no longer relies on global variables
        of the same names being defined before it is called.
    nlags : number of lags used when the values are computed here.

    Returns
    -------
    None. The function draws one figure with two stacked panels.
    """
    series = np.asarray(df[ts])
    if lag_acf is None:
        lag_acf = acf(series, nlags=nlags)
    if lag_pacf is None:
        lag_pacf = pacf(series, nlags=nlags, method='ols')

    # Reference lines drawn at +/- 1.96 / sqrt(number of observations).
    bound = 1.96 / np.sqrt(len(series))

    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 5))
    panels = (
        (ax1, lag_acf, 'Autocorrelation Function for %s'),
        (ax2, lag_pacf, 'Partial Autocorrelation Function for %s'),
    )
    for ax, lags, title_template in panels:
        ax.plot(lags)
        for level in (0, -bound, bound):
            ax.axhline(y=level, linestyle='--', color='gray')
        ax.set_title(title_template % (ts))

    plt.tight_layout()
    plt.show()
    plt.close()

    return

In [None]:
# ACF and PACF of the training series for lags 0..20, followed by the two-panel plot.
train_values = np.array(_train['Radiation'])
lag_acf = acf(train_values, nlags=20)
lag_pacf = pacf(train_values, nlags=20, method='ols')

plot_acf_pacf(df=_train, ts='Radiation')

### **Conclusions**

As seen in the PACF chart, the largest amount of partial correlation is between lags 0 and 1, so **p has a value of 1**.

The same happens in the ACF chart, so **q has a value of 1**.

### **SARIMAX(1,0,1)**

In [None]:
# SARIMAX(1,0,1) with a (1,1,0) seasonal component of period 12 and a
# constant-plus-linear trend ('ct'), fitted on the training series.
# NOTE(review): the seasonal period is 12 while the series is resampled hourly --
# confirm the intended season length.
fit2 = sm.tsa.statespace.SARIMAX(_train.Radiation, order=(1,0,1),seasonal_order=(1,1,0,12), trend='ct')
res = fit2.fit()
# Forecast exactly the validation window: the bounds come from `valid` itself instead
# of hard-coded dates that run one step past its last timestamp.
y_hat_avg['SARIMA'] = res.predict(start=valid.index[0], end=valid.index[-1], dynamic=True)
plt.figure(figsize=(20,5))
plt.plot( _train['Radiation'], label='Train')
plt.plot(valid['Radiation'], label='Test')
plt.plot(y_hat_avg['SARIMA'], label='SARIMA')
plt.legend(loc='best')

In [None]:
# Summary table of the fitted SARIMAX results.
res.summary()

In [None]:
# Root-mean-squared error of the SARIMAX forecast on the validation window.
sarima_mse = mean_squared_error(valid['Radiation'], y_hat_avg['SARIMA'])
rms = sqrt(sarima_mse)
print('Error:', rms)

### **ARIMA(1,0,1)**

In [None]:
# Fit a non-seasonal ARIMA(1, 0, 1) on the training series and overlay its in-sample
# fitted values (red) on the observed values.
model = ARIMA(_train['Radiation'], order=(1, 0, 1))
results_MA = model.fit()
plt.plot(_train['Radiation'])
plt.plot(results_MA.fittedvalues, color='red')

In [None]:
# Summary table of the fitted ARIMA results.
results_MA.summary()

### Finally
We include a seasonal effect in an additive way, which means that we add a term that allows the process to depend on the fourth MA lag. Alternatively, we may want to model the seasonal effect in a multiplicative way. The model is then often written as ARIMA (p, d, q) × (P, D, Q) s, where the lowercase letters give the specification of the non-seasonal component, the uppercase letters give the specification of the seasonal component, and s is the periodicity of the seasons (for example, it is often 4 for quarterly data or 12 for monthly data).

As noted above, 12 was used as the seasonal period s in the SARIMAX model, the conventional value for monthly data. Note, however, that the series modelled here is resampled hourly, for which a daily cycle would correspond to s = 24.

# EXTRA LSTM
## Recurrent Neural Networks

  - https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
  - https://blog.statsbot.co/time-series-prediction-using-recurrent-neural-networks-lstms-807fa6ca7f
  - https://adventuresinmachinelearning.com/recurrent-neural-networks-lstm-tutorial-tensorflow/

In [None]:
def do_lstm_model(df,
                  ts,
                  look_back,
                  epochs,
                  type_ = None,
                  train_fraction = 0.67):
    """
    Train a small Keras LSTM on one column of `df`, print train/test RMSE and plot
    the predictions over the original series.

    Source: https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/

    Parameters
    ----------
    df : DataFrame holding the series.
    ts : name of the column of `df` to model.
    look_back : number of previous observations used as input for each prediction.
    epochs : number of training epochs (for every model type).
    type_ : one of 'regression with time steps', 'memory between batches',
        'stacked with memory between batches'; any other value (including None)
        selects the window-as-features regression model.
    train_fraction : fraction of the series, taken from its start, used for training.
        Must lie strictly between 0 and 1.

    Returns
    -------
    None.

    Raises
    ------
    ValueError if `train_fraction` is not strictly between 0 and 1.
    """
    # Import packages
    import math
    import numpy
    import matplotlib.pyplot as plt
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import mean_squared_error

    if not 0 < train_fraction < 1:
        raise ValueError('train_fraction must be strictly between 0 and 1')

    stateful_types = ('memory between batches', 'stacked with memory between batches')
    # Every model built with a (look_back, 1) input shape needs its input laid out as
    # [samples, look_back time steps, 1 feature]. 'memory between batches' used to be
    # missing here and only worked when look_back == 1.
    time_step_types = ('regression with time steps',) + stateful_types
    is_stateful = type_ in stateful_types

    # Convert an array of values into a dataset matrix
    def create_dataset(dataset, look_back=1):
        """
        Build (window, next value) pairs from a 2-D array with one column.

        The `- 1` in the range leaves the last possible window unused; the plotting
        offsets at the end of the enclosing function are written for that length.
        """
        dataX, dataY = [], []
        for i in range(len(dataset)-look_back-1):
            dataX.append(dataset[i:(i+look_back), 0])
            dataY.append(dataset[i + look_back, 0])
        return numpy.array(dataX), numpy.array(dataY)

    # Fix numpy's random seed for reproducibility
    numpy.random.seed(7)

    # Get dataset
    dataset = df[ts].values
    dataset = dataset.astype('float32')

    # Normalize the dataset to the [0, 1] range
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset.reshape(-1, 1))

    # Split into train and test sets (chronological, no shuffling)
    train_size = int(len(dataset) * train_fraction)
    train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]

    # Reshape into X=t and Y=t+1
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # Reshape input to be [samples, time steps, features]
    if type_ in time_step_types:
        trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
        testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], 1))
    else:
        trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
        testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # Create and fit the LSTM network
    batch_size = 1
    model = Sequential()

    if type_ == 'regression with time steps':
        model.add(LSTM(4, input_shape=(look_back, 1)))
    elif type_ == 'memory between batches':
        model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    elif type_ == 'stacked with memory between batches':
        model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True, return_sequences=True))
        model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True))
    else:
        model.add(LSTM(4, input_shape=(1, look_back)))

    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    if is_stateful:
        # Stateful models are trained one epoch at a time so that the internal state
        # can be reset between passes; the number of passes now follows `epochs`
        # instead of a hard-coded 100.
        for _ in range(epochs):
            model.fit(trainX, trainY, epochs=1, batch_size=batch_size, verbose=2, shuffle=False)
            model.reset_states()
    else:
        model.fit(trainX,
                  trainY,
                  epochs = epochs,
                  batch_size = batch_size,
                  verbose = 2)

    # Make predictions
    if is_stateful:
        trainPredict = model.predict(trainX, batch_size=batch_size)
        # Start the test pass from a clean state rather than from the state left
        # behind by the training-set predictions.
        model.reset_states()
        testPredict = model.predict(testX, batch_size=batch_size)
    else:
        trainPredict = model.predict(trainX)
        testPredict = model.predict(testX)

    # Invert predictions back to the original scale
    trainPredict = scaler.inverse_transform(trainPredict)
    trainY = scaler.inverse_transform([trainY])
    testPredict = scaler.inverse_transform(testPredict)
    testY = scaler.inverse_transform([testY])

    # Calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))

    # Shift train predictions for plotting
    trainPredictPlot = numpy.empty_like(dataset)
    trainPredictPlot[:, :] = numpy.nan
    trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict

    # Shift test predictions for plotting
    testPredictPlot = numpy.empty_like(dataset)
    testPredictPlot[:, :] = numpy.nan
    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict

    # Plot baseline and predictions
    plt.plot(scaler.inverse_transform(dataset))
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    plt.show()
    plt.close()

    return

In [None]:
# LSTM Network for Regression
# type_ is left at its default, so the window-as-features regression model is built;
# each prediction uses one previous observation and training runs for 5 epochs.
do_lstm_model(df = train_radiation, 
              ts = 'Radiation', 
              look_back = 1, 
              epochs = 5)