In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Data.

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Default figure size and grid setting for all matplotlib plots in this notebook.
mpl.rcParams.update({
    'figure.figsize': (15, 7),
    'axes.grid': False,
})

In [None]:
# Override the default figure size set in the previous cell ((15, 7) -> (20, 7));
# the grid setting is re-applied unchanged.
mpl.rcParams['figure.figsize'] = (20, 7)
mpl.rcParams['axes.grid'] = False

In [None]:
# First, load the CSV with default options (all columns, no date parsing)
# and display the first rows to inspect the file as-is.
df = pd.read_csv("/kaggle/input/sunspots/Sunspots.csv")
df.head()

In [None]:
# Summarise column names, non-null counts and dtypes of the raw load.
df.info()

We can see from `info()` that the Date column is stored as object, i.e. string data type. The Date column must be converted into datetime format, which makes it easier to work with date and time data. There is also an unnecessary column named 'Unnamed: 0' which has to be removed.

* df['Date']=pd.to_datetime(df['Date']) <-- can be used to convert a column into a datetime data type column
* df.drop can be used for dropping the unnecessary column
* Here, I am using the 'usecols' argument inside pd.read_csv to select only the required columns.
* The 'parse_dates' & 'date_parser' arguments are used for converting the Date column into datetime data type.
* Inside 'parse_dates' we have to pass the column to be converted into datetime; here, it is the 'Date' column.
* The 'dateparse' function below is required; it converts any argument passed to it into datetime data type. It is given to 'date_parser' inside pd.read_csv.
* check the documentation of pd.read_csv, there are more than 15 arguments, which can be used to perform many operations while importing the data itself.

In [None]:
from dateutil.parser import parse
def dateparse(dates):
    """Convert the value handed over by ``pd.read_csv`` into a datetime via ``dateutil``'s ``parse``."""
    return parse(dates)

In [None]:
# Reload the CSV keeping only the two required columns and converting 'Date'
# to datetime while reading (parse_dates selects the column, date_parser
# supplies the conversion function defined above).
# NOTE(review): `date_parser` appears to be deprecated in recent pandas
# releases — confirm against the installed pandas version.
df = pd.read_csv('/kaggle/input/sunspots/Sunspots.csv',usecols=['Date','Monthly Mean Total Sunspot Number'],parse_dates=['Date'],date_parser=dateparse)
df.head()

In [None]:
df.info()  # Check the info again: the Date column should now have been converted into a datetime dtype

# 1. Exploratory Data Analysis

In [None]:
df_non_index=df.copy() # Independent copy of the freshly loaded data; both frames are used below.
# 'df_non_index' keeps 'Date' as a regular column and is used for parts of the exploratory analysis.
# Later, the 'Date' column of 'df' is turned into its index.

## Profit of datetime formated data:

* You can do lot of date and time related operations easily without doing string opeations.
* Here month is seprated and kept in another column named month so easily
* Afeter that year is seperated and used.

In [None]:
# Extract the month number from the datetime column via the .dt accessor.
df_non_index['Month']=df_non_index.Date.dt.month
df_non_index.head()

* The following code extracts the position of each year within its decade, i.e. the last digit of the year. For example, for 1749 the last digit is 9, so it is year 9 of that decade; this value is kept in another column named 'nth_year'.
* For 1748 it will be year 8.
* But for 1750 the last digit is 0, which has to count as year 10. Thus 0 is replaced with 10, and the column holds integers.

In [None]:
# Position of each year within its decade (1..10): the last digit of the year,
# with years ending in 0 counted as the 10th year of the decade.
# The digit is computed arithmetically (year % 10) rather than by string
# indexing, and the 0 -> 10 replacement is assigned back in the same statement:
# calling .replace(..., inplace=True) on a column selection is a chained
# assignment that acts on a temporary object under pandas Copy-on-Write and
# can leave the DataFrame unchanged.
df_non_index['nth_year'] = (df_non_index.Date.dt.year % 10).replace(0, 10)
df_non_index.head(10)

## Plotting the data using seaborn boxplot

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(20,15), dpi= 80)
# Top panel: one box per calendar YEAR (12 monthly values each). Grouping by the
# full 'Date' would create one degenerate, single-point box per month, which
# does not match the "Year-wise" title.
sns.boxplot(x=df_non_index.Date.dt.year, y='Monthly Mean Total Sunspot Number', data=df_non_index, ax=axes[0])
# Middle panel: distribution per calendar month (1..12).
sns.boxplot(x='Month', y='Monthly Mean Total Sunspot Number', data=df_non_index, ax=axes[1])
# Bottom panel: distribution per position of the year within its decade (1..10).
sns.boxplot(x='nth_year', y='Monthly Mean Total Sunspot Number', data=df_non_index, ax=axes[2])
# Many year categories share the top axis: label it and shrink/rotate the ticks.
axes[0].set_xlabel('Year')
axes[0].tick_params(axis='x', labelrotation=90, labelsize=6)
# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=14)
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=14)
axes[2].set_title('nth_year_each_decade\n(The Seasonality)', fontsize=14)
fig.tight_layout()
plt.show()

## Explanation of the above plot:

* The distribution of data is almost same in each month with few outliers
* The distribution of data among each year of the decades are not same .
## Returning to dataframe 'df' and making the Date column the index

* Once we make Date column as index,it is very easy to slice the data based on index (i.e. date) and even plotting in pandas with datetime column as index is easy.

In [None]:
# Make 'Date' the index of `df`. Note: this cell rebinds `df`, so re-running it
# without re-loading the data fails because the 'Date' column no longer exists.
df = df.set_index('Date')
df.head()

In [None]:
# Show the last rows to see where the series ends.
df.tail()

In [None]:
df.plot(grid=True) # pandas plots use the index as the x axis (here the datetime index); the y axis is 'Monthly Mean Total Sunspot Number'
# Line plot of the whole series: consecutive monthly observations are joined.

## The data is too large to see in one graph; there are 3235 monthly entries from date 1749-01-31 to 2018-07-31.

* One way is to slice the data and visualise any particular time period.
* Plotly express provides a slider and buttons to select a particular time period.
* Checking both:

In [None]:
# Label-based slicing on the datetime index; both end labels are included.
# (Despite its name, `df_2018` holds the 2000-2010 window here.)
df_2018=df.loc['2000':'2010'] # Slicing all data from 2000 to 2010
df_2018.plot(figsize=(16,7),grid=True)

In [None]:
# Same variable reused for a different window of the series.
df_2018=df.loc['1900':'1920'] # Slicing all data from 1900 to 1920
df_2018.plot(figsize=(16,7),grid=True)
plt.show()

## plotly express

In [None]:
import plotly.express as px
fig = px.line(df_non_index, x='Date', y='Monthly Mean Total Sunspot Number', title='Mean_Sunspot_Slider')
# Enable the range slider: this cell demonstrates the slider (see the title),
# so it has to be visible. With it, any time period can be selected by
# dragging the handles below the graph.
fig.update_xaxes(rangeslider_visible=True)
fig.show()

## Buttons options in plotly

In [None]:
fig = px.line(df_non_index, x='Date', y='Monthly Mean Total Sunspot Number', title='Mean_Sunspot_Slider')

# One "look back N years" button per 10-year step, followed by a button that
# restores the full range.
lookback_buttons = [
    dict(count=years, label=f"{years}y", step="year", stepmode="backward")
    for years in (10, 20, 30, 40, 50)
]
lookback_buttons.append(dict(step="all"))

fig.update_xaxes(
    rangeslider_visible=False,
    rangeselector=dict(buttons=lookback_buttons),
)
fig.show()

In [None]:
# Quick look at the helper frame (Date kept as a column, plus the derived columns).
df_non_index.head()

## Comparison of two consecutive 11 year: How to choose from where to where?
* In the above graph we can see that the pattern repeats approximately every 11 years; choose the start and end years so that two consecutive repetitions of the pattern line up.

In [None]:
# Two consecutive 11-year windows (1985-1995 and 1996-2006), overlaid month by
# month to compare the shape of two successive cycles.
obs_year = df_non_index.Date.dt.year
df_11_1985 = df_non_index[(obs_year >= 1985) & (obs_year < 1996)]
df_11_1996 = df_non_index[(obs_year >= 1996) & (obs_year < 2007)]

# Common x axis: month number within the window (1..N).
x = np.arange(1, len(df_11_1996['Date']) + 1)

# Legend labels name the years actually plotted (the previous labels,
# 'df_60_1998' / 'df_60_1958', did not correspond to either window).
plt.plot(x, df_11_1985['Monthly Mean Total Sunspot Number'], label='1985-1995')
plt.plot(x, df_11_1996['Monthly Mean Total Sunspot Number'], label='1996-2006')
plt.legend()
plt.xlabel('Month')
plt.ylabel('Monthly Mean Total Sunspot Number')
plt.title('Comparison of Two consecutive 11 year')
plt.show()

## Lag plot

* A lag plot helps to understand the autocorrelation at a given lag. It is visualised here for a few lags; normally a lag greater than 4 is not useful.
* As we increase the lag, the correlation decreases.
* The data is correlated with its recent values, up to a lag of about 4 or 5 time steps.

In [None]:
# Lag plots (value at t against value at t+lag) for a few lags, one panel each.
# Each panel gets its Axes passed explicitly instead of relying on matplotlib's
# "current axes", and the four copy-pasted stanzas are replaced by a loop.
fig, lag_axes = plt.subplots(2, 2, figsize=(18, 6))
fig.subplots_adjust(hspace=0.4, wspace=0.2)
for lag_ax, lag in zip(lag_axes.flat, (1, 3, 6, 24)):
    pd.plotting.lag_plot(df['Monthly Mean Total Sunspot Number'], lag=lag, ax=lag_ax)
    lag_ax.set_title(f'Lag_{lag}')
plt.show()

## Checking the distribution by making histogram and kde plot

In [None]:
# Side-by-side view of the value distribution: histogram (left), kernel density estimate (right).
fig = plt.figure(figsize=(18, 6))
fig.subplots_adjust(hspace=0.4, wspace=0.2)

ax1 = fig.add_subplot(1, 2, 1)
df['Monthly Mean Total Sunspot Number'].hist(ax=ax1)
ax1.set_title('Histogram')

ax2 = fig.add_subplot(1, 2, 2)
df['Monthly Mean Total Sunspot Number'].plot(kind='density', ax=ax2)  # kernel density plot
ax2.set_title('KDE')

plt.show()

# 2. Checking Stationarity of Time Series Data
* From the plot of the data, the series looks stationary, but we have to check this statistically.

### Check Stationarity of a Time Series

A TS is said to be stationary if its statistical properties such as mean, variance remain constant over time. But why is it important? Most of the TS models work on the assumption that the TS is stationary. Intuitively, we can state that if a TS has a particular behaviour over time, there is a very high probability that it will follow the same in the future. Also, the theories related to stationary series are more mature and easier to implement as compared to non-stationary series.

Stationarity is defined using very strict criterion. However, for practical purposes we can assume the series to be stationary if it has constant statistical properties over time, ie. the following:

* constant mean (For different time slots)
* constant variance (For different time slots)
* (Rolling mean/variance should be checked and should be constant)
* an autocovariance that does not depend on time
* Two test for stationarity: ADF & KPSS test

https://www.statsmodels.org/stable/examples/notebooks/generated/stationarity_detrending_adf_kpss.html

## Perform Augumented Dickey-Fuller test:
Dickey-Fuller Test: This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.

* Null Hypothesis - Series is not stationary

* Alternate Hypothesis - Series is stationary

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# The single series that the stationarity tests below operate on.
data_series=df['Monthly Mean Total Sunspot Number']

In [None]:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(data_series, autolag='AIC')
# The first four entries of the result are labelled below; entry 4 is the
# dict of critical values, appended as extra labelled rows.
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' % key] = value
print(dfoutput)

# Decision 1: compare the test statistic with the 5% critical value.
if dfoutput['Test Statistic'] < dfoutput['Critical Value (5%)']:  ## Comparing with 5% significant Level
    print('Series is stationary')
else:
    print('Series is not Stationary')

## OR
# Decision 2: compare the p-value with 0.05. The value is looked up by its
# label: integer-position access (dfoutput[1]) on a label-indexed Series is
# deprecated in pandas.
if dfoutput['p-value'] > 0.05:
    print('Series is not Stationary')
else:
    print('Series is Stationary')

## KPSS test for stationary: This another test
* Null hypothesis - Series is stationary
* Alternate hypothesis - Series is not stationary

In [None]:
from statsmodels.tsa.stattools import kpss

In [None]:
# Run the KPSS test and unpack its four results: statistic, p-value, lags and critical values.
stats, p, lags, critical_values = kpss(df['Monthly Mean Total Sunspot Number'], 'c',nlags='legacy')
## pass --> 'ct' if there is a trend component in the data
## pass --> 'c' if there is no trend component in the data. Here 'c' is used, on the assumption that the series has no trend.

In [None]:
# Report the KPSS results, then state the verdict at the 5% level
# (KPSS null hypothesis: the series is stationary, so a small p-value rejects it).
for line in (
    f'Test Statistics: {stats}',
    f'p-value: {p}',
    f'Critial Values: {critical_values}',
):
    print(line)

print('Series is not Stationary' if p < 0.05 else 'Series is Stationary')

**Note: For Non-Stationary data: First make it stationary**
* Differencing, Taking log and Differencing, Decompostion in components and detrending are few techniques are used.
# 3. Modelling Time Series
**There are many ways to model a time series in order to make predictions.Few are discussed here:**
* Different Moving Averages
* Exponential Smoothing
* ARIMA
* SARIMA
## Rolling Statistics:
We can plot the moving average or moving variance and see if it varies with time. By moving average/variance I mean that at any instant ‘t’, we’ll take the average/variance of the last year, i.e. last 12 months. But again this is more of a visual technique :

**Rolling Average OR Simple moving average:** $\mathrm{SMA}_t = \dfrac{x_t + x_{t-1} + \dots + x_{t-n+1}}{n}$, where $x_t$ is the observation at time $t$ and $n$ is the window size.

In [None]:
# Work on the first 200 observations only, so the curves stay readable.
first_200 = df['Monthly Mean Total Sunspot Number'].iloc[:200]
first_200.plot()
# Simple moving average over a window of 3 time steps.
first_200.rolling(3).mean().plot(label='rolling mean')
# Variants to try: .rolling(3).std() for the rolling standard deviation, or
# .rolling(12).mean().shift(1) for a 12-month rolling mean shifted by one step.
plt.legend()
# Only the rolling mean is drawn, so the title no longer mentions a standard deviation.
plt.title('Rolling Mean')
plt.show()

## Weighted moving average
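### Weighted moving average: $\mathrm{WMA}_t = \dfrac{w_0\,x_t + w_1\,x_{t-1} + \dots + w_{n-1}\,x_{t-n+1}}{w_0 + w_1 + \dots + w_{n-1}}$
* This is similar to the rolling average, except that each observation is multiplied by a weighting factor so that more weight is given to recent data. The weights used below (0.5, 1, 1.5) sum to the window size 3, so dividing by $n$ gives the same result here.
* This function is not available in pandas, so we have to write our own.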

In [None]:
## Making a function for calculating weighted average which is passed through .apply()
def wma(weights):
    """Build a weighted-average function for use with ``Series.rolling(...).apply``.

    Parameters
    ----------
    weights : array-like
        One weight per element of the rolling window, oldest observation first.

    Returns
    -------
    callable
        ``calc(x)`` returning ``sum(weights * x) / sum(weights)`` for a window ``x``.

    Raises
    ------
    ValueError
        If the weights sum to zero (the weighted average is undefined).

    Notes
    -----
    The result is normalised by the sum of the weights, not by the window
    length, so weights of any scale give a true weighted average. For weights
    that sum to the window length (e.g. ``[0.5, 1, 1.5]``) both are identical.
    """
    weights = np.asarray(weights, dtype=float)
    total_weight = weights.sum()
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    def calc(x):
        return (weights * x).sum() / total_weight

    return calc

In [None]:
df['Monthly Mean Total Sunspot Number'][:200].plot() # Original series, first 200 observations only
# Weighted moving average over a 3-step window, computed by passing wma(...) to .rolling(3).apply().
df['Monthly Mean Total Sunspot Number'][:200].rolling(3).apply(wma(np.array([0.5,1,1.5]))).plot(label='weighted mooving_averate')
# Three weights are passed to wma because the rolling window spans 3 time steps.
plt.legend()
plt.show()

## Exponential moving average\Exponential Smoothing


* https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
* Luckily there is a function for this in pandas.

In [None]:
df['Monthly Mean Total Sunspot Number'][:200].plot() # Original series, first 200 observations only
df['Monthly Mean Total Sunspot Number'][:200].ewm(span=3, adjust=False, min_periods=3).mean().plot(label='Exponential Weighted Average')
## Here span=3 is provided, so the smoothing factor α=2/(span+1) is calculated and applied automatically
## https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
plt.title('Exponential Weighted M.A.')
plt.legend()
plt.show()

### Providing alpha for Smoothing

In [None]:
# Exponential smoothing with an explicitly chosen smoothing factor (alpha=0.7),
# drawn on top of the first 200 observations of the original series.
first_200_obs = df['Monthly Mean Total Sunspot Number'][:200]
first_200_obs.plot()
first_200_obs.ewm(alpha=0.7, adjust=False, min_periods=3).mean().plot(label='Exponential Smoothing M A')
# The label above is only rendered when the legend is drawn; a title is added
# so the figure stands alone like the other moving-average plots.
plt.title('Exponential Smoothing (alpha=0.7)')
plt.legend()
plt.show()

### Plotting All together and comparing

In [None]:
# Collect the original series (first 200 rows) and the four smoothed versions
# in one frame so they can be printed and plotted together.
sunspots_200 = df['Monthly Mean Total Sunspot Number'][:200]
smoothed_columns = {
    'Rolling mean': sunspots_200.rolling(3).mean(),
    'W_M_A': sunspots_200.rolling(window=3).apply(wma(np.array([0.5,1,1.5]))),
    'E_W_A': sunspots_200.ewm(span=3, adjust=False, min_periods=0).mean(),
    'E_S_M_A': sunspots_200.ewm(alpha=0.7, adjust=False, min_periods=3).mean(),
}
df_with_diff_avg = df[:200].copy()
for column_name, smoothed in smoothed_columns.items():
    df_with_diff_avg[column_name] = smoothed
print(df_with_diff_avg.head())
df_with_diff_avg.plot()
plt.show()

In [None]:
# Drop the rows containing NaN (modifies the frame in place).
df_with_diff_avg.dropna(inplace=True)

In [None]:
# Confirm the frame after the NaN rows were removed.
df_with_diff_avg.head()

### Making a function for comparing RMSE in all above modelling
* We can see exponential smoothing Moving average has lowest RMSE.


In [None]:
def RMSE_CAL(df):
    """Root-mean-square error of each smoothed column against the observed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 holds the observed values; columns 1-4 hold, in this order, the
        rolling mean, weighted moving average, exponentially weighted average
        and exponential-smoothing estimates.

    Returns
    -------
    dict
        RMSE per method, keyed ``Rolling_Mean_RMSE``, ``W_M_A_RMSE``,
        ``E_W_A_RMSE`` and ``E_S_M_A_RMSE``.
    """
    observed = df.iloc[:, 0]

    def _rmse(position):
        # RMSE = sqrt(MEAN of squared errors); taking the square root of the
        # plain SUM would grow with the number of rows.
        return np.sqrt(np.mean((observed - df.iloc[:, position]) ** 2))

    return {
        "Rolling_Mean_RMSE": _rmse(1),
        "W_M_A_RMSE": _rmse(2),
        "E_W_A_RMSE": _rmse(3),
        "E_S_M_A_RMSE": _rmse(4),
    }
# Compare the four smoothing methods on the NaN-free comparison frame.
RMSE_CAL(df_with_diff_avg)

# 4. Decomposing a Time_Series Data

****NOTE: This operation is not required for this data as it is stationary but while working with non-stationary data this step may required.****
* Systematic: Components of the time series that have consistency or reocurrence and can be described and modeled as level,trend, seasonality.
* Non-Systematic: Components of the time series that cannot be directly modeled is noise/residual.
These components are defined as follows:

* Level: The average value in the series.
* Trend: The increasing or decreasing value in the series.
* Seasonality: The repeating short-term cycle in the series.
* Noise: The random variation in the series.
       
       So a time series is thought to be an aggregate or combination of these four components. 
       All series have a level and noise. The trend and seasonality components are optional. 
       It is helpful to think of the components as combining either additively or multiplicatively as given by relation below:

* y(t) = Level + Trend + Seasonality + Noise
* y(t) = Level * Trend * Seasonality * Noise

### Since our data is stationary we will use additive decomposition

In [None]:
# Additive decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# The pattern repeats roughly every 11 years, i.e. 11*12 monthly observations.
# The cycle length is passed as `period`; the older `freq` keyword is no
# longer accepted by current statsmodels releases.
result = seasonal_decompose(df['Monthly Mean Total Sunspot Number'], model="additive", period=11*12)
result.plot()
plt.show()

### checking the definition of decompostion for additive nature time series data
* y(t) = Trend + Seasonality + Noise

In [None]:
# Re-assemble the series from its additive components.
total_sum=result.trend+result.seasonal+result.resid
total_sum[:100] # compare this result with the original Sunspot data shown in the next cell

In [None]:
# First 100 original observations, for comparison with the recomposed values.
df['Monthly Mean Total Sunspot Number'][:100]

### Detrended Data :
* Since our data is additive in nature we are going to subtract the trend from observed value and get the detrended data:

In [None]:
# Detrended series: observed values minus the estimated trend component.
pd.DataFrame(result.observed-result.trend).plot()
plt.show()

# 5. Autocorrelation plot
* We can assume the distribution of each variable fits a Gaussian (bell curve) distribution. If this is the case, we can use the Pearson’s correlation coefficient to summarize the correlation between the variables.

* The Pearson’s correlation coefficient is a number between -1 and 1 that describes a negative or positive correlation respectively. A value of zero indicates no correlation.

* We can calculate the correlation for time series observations with observations with previous time steps, called lags. Because the correlation of the time series observations is calculated with values of the same series at previous times, this is called a serial correlation, or an autocorrelation.

* A plot of the autocorrelation of a time series by lag is called the AutoCorrelation Function, or the acronym ACF. This plot is sometimes called a correlogram or an autocorrelation plot.

* This helps us to find if current value depends on previous values. In the plot you can observe that current value is dependent on previous 120-130 values. This can be around 10/11 years as it is monthly data.

In [None]:
# Autocorrelation of the monthly series (one lag = one month).
pd.plotting.autocorrelation_plot(df['Monthly Mean Total Sunspot Number']) ## for each month
plt.show()

In [None]:
# Yearly means of the monthly series.
# NOTE(review): the frequency alias "1y" may emit a deprecation warning on recent pandas — confirm which alias the installed version expects.
df['Monthly Mean Total Sunspot Number'].resample("1y").mean() ## Resample based on 1 year

In [None]:
# Autocorrelation of the yearly means (one lag = one year).
pd.plotting.autocorrelation_plot(df['Monthly Mean Total Sunspot Number'].resample("1y").mean())
plt.show()

# ACF and PACF plots:
* Running the example creates a 2D plot showing the lag value along the x-axis and the correlation on the y-axis between -1 and 1.

* Confidence intervals are drawn as a cone. By default, this is set to a 95% confidence interval, suggesting that correlation values outside of this cone are very likely a correlation and not a statistical fluke.

* acf: By looking at the plot we can improvise our understanding from above plot and say that present value depends on previous 25-30 values.

* pacf plot further says that present value depends only on previous 5/6 values. All these plots help us narrow down thinking and make our model efficient.

In [None]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Draw Plot
# Create a fresh figure for this cell. The previous version passed `axes`
# left over from the box-plot figure, so the ACF/PACF were drawn onto an
# already-rendered figure and nothing appeared here.
fig, acf_axes = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(df['Monthly Mean Total Sunspot Number'].tolist(), lags=20, ax=acf_axes[0])
plot_pacf(df['Monthly Mean Total Sunspot Number'].tolist(), lags=20, ax=acf_axes[1])
plt.show()

# Auto ARIMA

In [None]:
!pip install pmdarima

In [None]:
import pmdarima as pm
from pmdarima.model_selection import train_test_split

# Stepwise search for a seasonal ARIMA specification on the full series.
# Strictly, the seasonal period should be m=11*12 (monthly data, ~11-year cycle),
# but that takes too much time and does not matter much. Source:
# https://robjhyndman.com/hyndsight/longseasonality/
model = pm.auto_arima(
    df['Monthly Mean Total Sunspot Number'],
    start_p=0,
    start_q=0,
    max_order=4,
    m=11,
    seasonal=True,
    test='adf',
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)


In [None]:
# Summary of the model returned by the search above.
model.summary()

## Split the data into train and test sets

In [None]:
# Turn the 'Date' index back into a regular column (modifies `df` in place), so that
# the cells below can filter on df.Date. Re-running this cell changes `df` again.
df.reset_index(inplace=True)

In [None]:
# Check the frame after resetting the index.
df.head()

In [None]:
# Chronological split: everything before 1958 is training data, 1958 onwards is test data.
split_year = df.Date.dt.year
train = df[split_year < 1958]
test = df[split_year >= 1958]

In [None]:
# Boolean mask for the rows dated 1958 through 1967 (displayed for inspection).
(df.Date.dt.year>=1958) & (df.Date.dt.year<1968)

In [None]:
# Forecast horizon: the rows dated 1958 through 1967; `n` is the number of steps to predict.
window_year = df.Date.dt.year
in_forecast_window = (window_year >= 1958) & (window_year < 1968)
test1 = df[in_forecast_window]
n = len(test1)

In [None]:
# Fit `model` on the training portion of the series only.
model.fit(train['Monthly Mean Total Sunspot Number'])


In [None]:
# Predict `n` steps ahead; with return_conf_int=True the result also carries the
# confidence intervals (the point forecasts are read from forecast[0] below).
forecast=model.predict(n_periods=n, return_conf_int=True)

In [None]:
# Put the point forecasts on the same index as the test window so they line up
# with the original series when plotted.
# np.asarray strips any labels from the forecasts: if predict() returns a pandas
# Series (presumably the case on newer pmdarima releases — TODO confirm), passing
# it together with `index`/`columns` would align by label instead of by position
# and could produce an all-NaN 'Prediction' column.
forecast_df = pd.DataFrame(np.asarray(forecast[0]), index=test1.index, columns=['Prediction'])

In [None]:
# Plot the original series and the forecast side by side (columns aligned on the shared index).
pd.concat([df['Monthly Mean Total Sunspot Number'],forecast_df],axis=1).plot()

**The result may not seem accurate but, note that time series forecasting is not reasonable for many time step ahead. It may be valid only for 1,2 or few more time step ahead in future.**