In [2]:
import pandas as pd

# Load the rainfall records and reduce them to a clean, date-indexed frame.
# `df` (single rainfall column, indexed by 'Date') and `rainfall_ts` (that
# column as a Series) are the two names the later cells rely on.
RAINFALL_COL = 'Rainfall (in mm)'

df = (
    pd.read_excel("..//..//datasets//modified_location_6.xlsx")  # Replace with your actual file path
    # Strict parse: any value that is not YYYY-MM-DD raises here rather than
    # being coerced silently. Adjust the format if the file differs.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'))
    .loc[:, ['Date', RAINFALL_COL]]
    # Drop rows missing EITHER field; a missing date would otherwise become a
    # NaT entry in the index and break the date-label slicing done later.
    .dropna(subset=['Date', RAINFALL_COL])
    .set_index('Date')
    # Label slices such as ts['2018-01-01':'2022-12-31'] need a sorted index;
    # a stable sort leaves already-ordered rows exactly where they were.
    .sort_index(kind='stable')
)

# Rainfall column as a Series sharing df's date index (no resampling is done;
# the data is assumed to already be daily).
rainfall_ts = df[RAINFALL_COL]

# Display the first few rows to check the data
print(df.head())

            Rainfall (in mm)
Date                        
1901-01-01               3.0
1901-01-02               8.0
1901-01-03               0.0
1901-01-04               0.0
1901-01-05               0.0


In [2]:
# Report the frame's dimensions; the tuple prints as (rows, columns).
row_count, column_count = df.shape
print((row_count, column_count))

(61600, 1)


In [3]:
from pmdarima import auto_arima

# Training window: 2018-01-01 through 2022-12-31 (label slice, both ends inclusive).
train_start = '2018-01-01'
train_end = '2022-12-31'
y_train = rainfall_ts.loc[train_start:train_end]

# Search settings for a deliberately small seasonal ARIMA grid: every order is
# capped at 1 and the seasonal period is fixed at 15 observations.
sarima_search_m15 = dict(
    seasonal=True,
    m=15,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the stepwise search; trace=True prints one line per candidate model.
model = auto_arima(y_train, **sarima_search_m15)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[15] intercept   : AIC=29340.100, Time=2.93 sec
 ARIMA(0,0,0)(0,0,0)[15] intercept   : AIC=29787.495, Time=0.06 sec
 ARIMA(1,0,0)(1,0,0)[15] intercept   : AIC=26571.171, Time=1.64 sec
 ARIMA(0,0,1)(0,0,1)[15] intercept   : AIC=26757.885, Time=3.32 sec
 ARIMA(0,0,0)(0,0,0)[15]             : AIC=31084.192, Time=0.05 sec
 ARIMA(1,0,0)(0,0,0)[15] intercept   : AIC=26572.883, Time=0.20 sec
 ARIMA(1,0,0)(1,0,1)[15] intercept   : AIC=26557.961, Time=4.00 sec
 ARIMA(1,0,0)(0,0,1)[15] intercept   : AIC=26571.452, Time=1.41 sec
 ARIMA(1,0,1)(1,0,1)[15] intercept   : AIC=26374.161, Time=9.94 sec
 ARIMA(1,0,1)(0,0,1)[15] intercept   : AIC=26409.929, Time=5.42 sec
 ARIMA(1,0,1)(1,0,0)[15] intercept   : AIC=26408.979, Time=2.33 sec
 ARIMA(1,0,1)(0,0,0)[15] intercept   : AIC=26415.097, Time=0.79 sec
 ARIMA(0,0,1)(1,0,1)[15] intercept   : AIC=26617.196, Time=7.02 sec
 ARIMA(1,0,1)(1,0,1)[15]             : AIC=inf, Time=3.10 sec

Best model

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Forecast for 2023 with the m=15 model and score it against the observations.
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'
output_path = './/sarima_location_6_(15_5yrs).xlsx'

# Extract actual values inside the forecast window (both bounds inclusive)
in_forecast_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_forecast_window, 'Rainfall (in mm)']

# Predict one step per observed row so both sequences have the same length.
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail with a clear message instead of an opaque error from predict/metrics.
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )
y_pred_2023 = model.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics (MSE is reported as-is, not its square root)
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# NOTE(review): depending on the pmdarima version, predict() may return a
# Series that carries its own index — confirm. Reducing it to bare values
# keeps every column of the results frame positionally aligned.
predicted_values = pd.Series(y_pred_2023).to_numpy()

# One row per forecast day: date, observed rainfall, predicted rainfall.
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,
    'Actual Rainfall': y_true_2023.to_numpy(),
    'Predicted Rainfall': predicted_values,
})

# Save to Excel
forecast_df_2023.to_excel(output_path, index=False)

# Report the path that was actually written (the old message named a different file).
print(f"✅ Forecast saved successfully: {output_path}")


Actual data size: 365
Predicted data size: 365
MSE for 2023 forecast: 160.669086894852
MAE for 2023 forecast: 9.144315685753376
R² for 2023 forecast: 0.015934217776150894
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx


  return get_prediction_index(
  return get_prediction_index(


In [5]:
# Repeat the same search and evaluation with a seasonal period of m = 30 (the run above used m = 15).

In [3]:
from pmdarima import auto_arima

# Same training window as the m=15 run: 2018-01-01 through 2022-12-31 (inclusive label slice).
train_start = '2018-01-01'
train_end = '2022-12-31'
y_train = rainfall_ts.loc[train_start:train_end]

# Identical small search grid, but with the seasonal period set to 30 observations.
sarima_search_m30 = dict(
    seasonal=True,
    m=30,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the stepwise search; trace=True prints one line per candidate model.
model_30 = auto_arima(y_train, **sarima_search_m30)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[30] intercept   : AIC=29470.691, Time=11.51 sec
 ARIMA(0,0,0)(0,0,0)[30] intercept   : AIC=29787.495, Time=0.07 sec
 ARIMA(1,0,0)(1,0,0)[30] intercept   : AIC=26568.858, Time=6.19 sec
 ARIMA(0,0,1)(0,0,1)[30] intercept   : AIC=26751.488, Time=7.36 sec
 ARIMA(0,0,0)(0,0,0)[30]             : AIC=31084.192, Time=0.04 sec
 ARIMA(1,0,0)(0,0,0)[30] intercept   : AIC=26572.883, Time=0.21 sec
 ARIMA(1,0,0)(1,0,1)[30] intercept   : AIC=26563.228, Time=11.30 sec
 ARIMA(1,0,0)(0,0,1)[30] intercept   : AIC=26569.456, Time=5.44 sec
 ARIMA(1,0,1)(1,0,1)[30] intercept   : AIC=26390.236, Time=13.97 sec
 ARIMA(1,0,1)(0,0,1)[30] intercept   : AIC=26403.195, Time=9.27 sec
 ARIMA(1,0,1)(1,0,0)[30] intercept   : AIC=26401.417, Time=8.49 sec
 ARIMA(1,0,1)(0,0,0)[30] intercept   : AIC=26415.097, Time=0.79 sec
 ARIMA(0,0,1)(1,0,1)[30] intercept   : AIC=26706.344, Time=13.46 sec
 ARIMA(1,0,1)(1,0,1)[30]             : AIC=inf, Time=26.37 sec

Best 

In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Forecast for 2023 with the m=30 model and score it against the observations.
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'
# The original name was missing its closing parenthesis; it now follows the
# same pattern as the m=15 output file.
output_path = './/sarima_location_6_(30_5yrs).xlsx'

# Extract actual values inside the forecast window (both bounds inclusive)
in_forecast_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_forecast_window, 'Rainfall (in mm)']

# Predict one step per observed row so both sequences have the same length.
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail with a clear message instead of an opaque error from predict/metrics.
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )
y_pred_2023 = model_30.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# NOTE(review): depending on the pmdarima version, predict() may return a
# Series that carries its own index — confirm. Reducing it to bare values
# keeps every column of the results frame positionally aligned.
predicted_values = pd.Series(y_pred_2023).to_numpy()

# One row per forecast day: date, observed rainfall, predicted rainfall.
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,
    'Actual Rainfall': y_true_2023.to_numpy(),
    'Predicted Rainfall': predicted_values,
})

# Save to Excel
forecast_df_2023.to_excel(output_path, index=False)

# Report the path that was actually written (the old message named a different file).
print(f"✅ Forecast saved successfully: {output_path}")


Actual data size: 365
Predicted data size: 365
MSE for 2023 forecast: 161.47968848352963
MAE for 2023 forecast: 9.264380544066785
R² for 2023 forecast: 0.01096944638265962
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx


  return get_prediction_index(
  return get_prediction_index(
