In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Default size applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [15, 9]
# Suppress all warnings so cell output stays uncluttered.
warnings.filterwarnings(action='ignore')

In [None]:
train = pd.read_csv('/kaggle/input/sun-spot-counts/train.csv', parse_dates = ['Month'])
train.head()

In [None]:
train.tail()

In [None]:
train.duplicated().sum()

In [None]:
# We have no duplicate values in our series.

In [None]:
train.isnull().sum()

In [None]:
# It's good that we have no null values in our data.

In [None]:
df = pd.read_csv('/kaggle/input/sun-spot-counts/train.csv', parse_dates = ['Month'], index_col = ['Month'])
df.head()

In [None]:
test = pd.read_csv('/kaggle/input/sun-spot-counts/test.csv')
test.shape

In [None]:
df.plot()

In [None]:
# Looking at the plot, the series appears to be roughly stationary. Let's difference the data to remove any trend and look at the seasonality.

In [None]:
# Differencing for looking at seasonality.

df_1 = df.diff().dropna()

In [None]:
df_1.plot(title = 'First order differencing')

In [None]:
# This graph suggests that there is seasonality; we will identify its period from the ACF plot later.

#### Decomposition

In [None]:
# Very old observations add little to the forecast, so I will keep only the most recent 120 years of data
# in order to predict the 10 years covered by our test data.

In [None]:
final_df = df[1704:]

In [None]:
final_df.shape

In [None]:
final_df

In [None]:
# Lets take a look at our 120 years of data
final_df.plot()

In [None]:
# Looking at the data, there appears to be seasonality. Let's check the monthly pattern.

In [None]:
# Distribution of the sunspot count grouped by calendar month (1-12).

ax = sns.boxplot(x=final_df.index.month, y=final_df['Avg_sunspot_count'])
ax.set_title("Monthly data")

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Additive decomposition of the retained series; the trailing ';' suppresses the figure repr.
decomposition = seasonal_decompose(x=final_df, model='additive')
decomposition.plot();

In [None]:
final_df[final_df['Avg_sunspot_count'] == 0].count()

In [None]:
# replacing 0 values for multiplicative decomposition
df_2 = final_df['Avg_sunspot_count'].replace(to_replace = 0, value = 0.0001)

In [None]:
decomposition = seasonal_decompose(df_2,model = 'multiplicative')
decomposition.plot();

In [None]:
# Comparing both graphs, the series appears to follow a multiplicative model.

### SARIMA

#### Splitting the data

In [None]:
final_df.shape

In [None]:
# Taking 12 years for testing
train_S = final_df[:1200]
test_S = final_df[1200:]

In [None]:
train_S.shape

In [None]:
test_S.shape

In [None]:
train_S.tail()

In [None]:
test_S.tail()

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# adfuller is documented to take a one-dimensional series. Select the single
# value column explicitly instead of passing the (n, 1) array that
# DataFrame.values returns, so the call does not rely on implicit squeezing.
obv = final_df['Avg_sunspot_count'].values
test_results = adfuller(obv)

In [None]:
# Report the ADF statistic, its p-value, and the critical value at each significance level.
adf_stat, p_value = test_results[0], test_results[1]
print(f'ADF Statistics : {adf_stat}')
print(f'p-values : {p_value}')
print('Critical Values :')
for level, threshold in test_results[4].items():
    print('\t', level, threshold)

In [None]:
# The ADF statistic is below the 1% critical value, so we can reject the unit-root null hypothesis at the 1% level and treat the series as stationary.

In [None]:
from statsmodels.graphics.tsaplots     import  plot_pacf
from statsmodels.graphics.tsaplots     import  plot_acf

In [None]:
# Autocorrelation of the retained series out to 150 lags; ';' hides the figure repr.
plot_acf(x=final_df, lags=150);

In [None]:
# From the above graph we can say that there is seasonality of period = 64 (approx)

In [None]:
# Partial autocorrelation of the retained series (default number of lags).
plot_pacf(x=final_df);

In [None]:
# Since our series has seasonality, a SARIMA model is an appropriate choice.

In [None]:
import itertools

# Candidate non-seasonal orders: p and q each in {1, 2, 3}, with d fixed at 0.
p = q = range(1, 4)
d = range(0, 1)
pdq = list(itertools.product(p, d, q))

# Seasonal candidates: the same (p, d, q) triples, each with a seasonal period of 6.
model_pdq = [(sp, sd, sq, 6) for sp, sd, sq in itertools.product(p, d, q)]
model_pdq

In [None]:
# Empty table that will hold one row per (order, seasonal order) combination and its AIC.
frame = pd.DataFrame(data=None, columns=['param', 'seasonal', 'AIC'])
frame

In [None]:
import statsmodels.api as sm

# Fit one SARIMAX model on train_S for every (order, seasonal order) pair and
# record its AIC. Results are collected in a plain list and turned into a
# DataFrame once at the end: DataFrame.append was removed in pandas 2.0 and
# appending row-by-row copies the whole frame on every iteration.
aic_records = []
for param in pdq:
    for param_seasonal in model_pdq:
        mod = sm.tsa.statespace.SARIMAX(train_S,
                                        order=param,
                                        seasonal_order=param_seasonal,
                                        enforce_stationarity=False,   # do not constrain the AR parameters
                                        enforce_invertibility=False)  # do not constrain the MA parameters

        results_SARIMA = mod.fit()
        print('SARIMA{}x{}6 - AIC:{}'.format(param, param_seasonal, results_SARIMA.aic))
        aic_records.append({'param': param, 'seasonal': param_seasonal, 'AIC': results_SARIMA.aic})

frame = pd.DataFrame(aic_records, columns=['param', 'seasonal', 'AIC'])

In [None]:
frame.sort_values(by=['AIC'])

In [None]:
import statsmodels.api as sm

model = sm.tsa.statespace.SARIMAX(train_S,
                                order = (1, 0, 3),
                                seasonal_order = (3, 0, 3, 6),
                                enforce_stationarity = False,
                                enforce_invertibility = False)

model_Sarima = model.fit()
print(model_Sarima.summary())

In [None]:
forecast = model_Sarima.forecast(len(test_S))

In [None]:
# Overlay the training split, the hold-out split, and the SARIMA forecast on one figure.
for series, series_label in ((train_S, 'Training Data'), (test_S, 'Test Data')):
    plt.plot(series, label=series_label)
plt.plot(test_S.index, forecast, label='Predicted Data - SARIMA')
plt.legend(loc='best')
plt.grid();

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# RMSE of the SARIMA forecast on the hold-out split: square root of the mean squared error.
sarima_mse = mean_squared_error(test_S['Avg_sunspot_count'], forecast)
rmse = sqrt(sarima_mse)
print(rmse)

### Regression

In [None]:
df_R = final_df.copy()

In [None]:
df_R['lag'] = df_R['Avg_sunspot_count'].shift(-1)

In [None]:
df_R['lag2'] = df_R['Avg_sunspot_count'].shift(-2)

In [None]:
# Extracting months
df_R['Month'] = df_R.index.month

In [None]:
df_R.head()

In [None]:
# Splitting seperately
# Taking 12 years for testing

train_R = df_R[:1200]
test_R = df_R[1200:]

In [None]:
len(train_R)

In [None]:
test_R.dropna(inplace=True)

In [None]:
df_R.dtypes

In [None]:
df_R.dropna(inplace = True)

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(train_R[['Month','lag', 'lag2']],train_R['Avg_sunspot_count'])

In [None]:
test_predictions = lr.predict(test_R[['Month','lag', 'lag2']])
test_R['Forecast'] = test_predictions

In [None]:
# Overlay training values, hold-out values, and the regression forecast.
curves = (
    (train_R['Avg_sunspot_count'], 'Train'),
    (test_R['Avg_sunspot_count'], 'Test'),
    (test_R['Forecast'], 'Regression forecast'),
)
for curve, curve_label in curves:
    plt.plot(curve, label=curve_label)

plt.legend(loc='best')
plt.grid()

In [None]:
from math import sqrt
from sklearn.metrics import  mean_squared_error

# RMSE of the regression forecast on the hold-out rows.
# mean_squared_error returns the MSE, so exactly one square root is needed.
# The earlier form combined sqrt(...) with squared=False, which took the root
# twice and reported the fourth root of the MSE, making this score not
# comparable with the SARIMA RMSE computed above.
rmse_R = sqrt(mean_squared_error(test_R.Avg_sunspot_count, test_R.Forecast))
print(rmse_R)

### Creating data frame for storing RMSE scores.

In [None]:
d = [['SARIMA', rmse], ['Regression', rmse_R]]

score_df = pd.DataFrame(d, columns = ['Models', 'rmse_Scores'])

In [None]:
score_df

## Proceeding with the regression

In [None]:
# The forecasting loop further down feeds the observation from 120 rows earlier
# as the 'lag' input, so the training feature is built with that same offset.
# The former shift(-1) gave the model the *next* row's value instead (a lead,
# not a lag), which the forecasting loop can never supply, and it left the NaN
# on the most recent row, so the later dropna() threw away the newest observation.
df['lag'] = df['Avg_sunspot_count'].shift(120)

In [None]:
# Extracting months
df['Month'] = df.index.month

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.dropna(inplace = True)

In [None]:
test_final = pd.read_csv('/kaggle/input/sun-spot-counts/test.csv', parse_dates = ['Month'], index_col = ['Month'])
test_final.head()

In [None]:
test_final['Avg_sunspot_count'] = 0

test_final['Month'] = test_final.index.month

In [None]:
test_final.tail()

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(df[['Month', 'lag']],df['Avg_sunspot_count'])

In [None]:
df.drop(['lag'], axis = 1, inplace = True)

In [None]:
df[120:]

In [None]:
t2 = pd.concat([df[-120:], test_final])

In [None]:
# Fill the placeholder rows (position 120 onward) one at a time. Each
# prediction uses that row's calendar month and the count from 120 rows earlier.
# Positional .iloc access replaces the chained t2[col][i] = ... assignment,
# which may write to a temporary copy and relies on positional integer lookup
# on a date-indexed Series. [0] unwraps the length-1 array that predict()
# returns so a scalar is stored.
month_pos = t2.columns.get_loc('Month')
count_pos = t2.columns.get_loc('Avg_sunspot_count')
for i in range(120, len(t2)):
    features = [[t2.iloc[i, month_pos], t2.iloc[i - 120, count_pos]]]
    t2.iloc[i, count_pos] = lr.predict(features)[0]

In [None]:
t3 = t2[120:]

In [None]:
t3.drop('Month', axis = 1, inplace = True)

In [None]:
t3.plot()

In [None]:
#t3.to_csv('Submission.csv', date_format = '%m-%d-%Y')

### Proceeding with SARIMA

In [None]:
# Test file with 'Month' parsed as datetimes but kept as an ordinary column.
final_test = pd.read_csv(
    '/kaggle/input/sun-spot-counts/test.csv',
    parse_dates=['Month'],
)
final_test.head(n=5)

In [None]:
import itertools

# Candidate non-seasonal orders: p and q each in {1, 2, 3}, with d fixed at 0.
p = q = range(1, 4)
d = range(0, 1)
pdq = list(itertools.product(p, d, q))

# Seasonal candidates: the same (p, d, q) triples, each with a seasonal period of 6.
model_pdq = [(sp, sd, sq, 6) for sp, sd, sq in itertools.product(p, d, q)]

In [None]:
frame = pd.DataFrame(columns=['param','seasonal', 'AIC'])
frame

In [None]:
df.drop('Month', axis = 1, inplace=True)

In [None]:
import statsmodels.api as sm

# Refit the selected SARIMA specification, this time on df rather than the training split.
# NOTE(review): df was shortened by the earlier dropna() on the temporary 'lag'
# column — confirm the last month left in df directly precedes the first forecast month.
model = sm.tsa.statespace.SARIMAX(
    df,
    order=(1, 0, 3),
    seasonal_order=(3, 0, 3, 6),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

model_Sarima = model.fit()
print(model_Sarima.summary())

In [None]:
#test_final.drop(['Avg_sunspot_count', 'Month'], axis = 1, inplace=True)

In [None]:
sarima_forecast = model_Sarima.forecast(len(test_final))

In [None]:
sarima_forecast.values

In [None]:
Sarima_df = pd.DataFrame(sarima_forecast.values, columns = df.columns)

In [None]:
Sarima_df

In [None]:
test_final.drop(['Month', 'Avg_sunspot_count'], axis = 1, inplace = True)

In [None]:
SARIMA = pd.concat([Sarima_df, test_final], ignore_index=True).head(120)

In [None]:
# Month-start timestamps from January 2011 through December 2020.
date = pd.date_range(
    start='01/01/2011',
    end='12/01/2020',
    freq='MS',
)
print(date)

In [None]:
SARIMA['Month'] = date

In [None]:
SARIMA = SARIMA[['Avg_sunspot_count', 'Month']]

In [None]:
#SARIMA.to_csv('SARIMA_SUB.csv')