In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# Load the sunspot data from sunspot.csv
data = pd.read_csv('monthly-sunspots.csv')
data['Month'] = pd.to_datetime(data['Month'])
data.set_index('Month', inplace=True)
sunspots = data['Sunspots']

# Split the data into training and testing sets
train_data = sunspots.iloc[:int(0.8 * len(sunspots))]
test_data = sunspots.iloc[int(0.8 * len(sunspots)):]

# Define SARIMA orders
p, d, q = 1, 1, 1
P, D, Q, s = 1, 1, 1, 12  # Seasonal order with a period of 12 months

# Fit the SARIMA model
model = SARIMAX(train_data, order=(p, d, q), seasonal_order=(P, D, Q, s))
results = model.fit()

# Define the number of time steps to forecast into the future
n_steps = len(test_data)  # Forecast for the length of the test data

# Forecast future sunspots
forecast = results.get_forecast(steps=n_steps)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(test_data, forecast.predicted_mean))
print('RMSE:', rmse)

# Plot the actual and predicted sunspot activity
plt.figure(figsize=(12, 6))
plt.plot(sunspots, label='Actual Sunspot Activity')
plt.plot(forecast.predicted_mean, label='Predicted Sunspot Activity', color='red')
plt.xlabel('Date')
plt.ylabel('Sunspot Number')
plt.title('Actual vs. Predicted Sunspot Activity')
plt.legend()
plt.grid(True)
plt.show()


ValueError: Found input variables with inconsistent numbers of samples: [564, 565]