In [1]:
import pandas as pd

# Load the raw rainfall records (path is relative to the notebook's working directory).
raw_df = pd.read_excel("..//..//datasets//modified_location_3.xlsx")  # Replace with your actual file path

RAINFALL_COL = 'Rainfall (in mm)'

# Build the cleaned, date-indexed frame in one chain so the cell is idempotent
# and never mutates the raw frame:
#   1. parse 'Date' as datetime (expects YYYY-MM-DD; adjust format if necessary)
#   2. keep only the date and rainfall columns
#   3. drop rows missing EITHER the date or the rainfall value
#   4. index by date
#   5. sort chronologically -- the later cells slice this index with date
#      strings and fit a time-series model, both of which need ordered dates.
#      A stable sort keeps the original relative order of any equal dates.
df = (
    raw_df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'))
    .loc[:, ['Date', RAINFALL_COL]]
    .dropna(subset=['Date', RAINFALL_COL])
    .set_index('Date')
    .sort_index(kind='mergesort')
)

# Rainfall as a date-indexed Series (assuming daily data)
rainfall_ts = df[RAINFALL_COL]

# Display the first few rows to check the data
print(df.head())

            Rainfall (in mm)
Date                        
1901-01-01               3.0
1901-01-02               8.0
1901-01-03               0.0
1901-01-04               0.0
1901-01-05               0.0


In [2]:
# Report the dimensions of the cleaned frame as (rows, columns)
row_count, column_count = df.shape
print((row_count, column_count))

(61600, 1)


In [3]:
from pmdarima import auto_arima

# Training window: calendar years 2018 through 2022
train_start, train_end = '2018-01-01', '2022-12-31'

# Label-slice the rainfall series down to the training window
training_window = slice(train_start, train_end)
y_train = rainfall_ts[training_window]

# Deliberately small search space so the stepwise search stays cheap:
# non-seasonal and seasonal orders are each capped at 1, and candidates are
# ranked by AIC. `m` is the seasonal period, in observations.
search_options = dict(
    seasonal=True,
    m=15,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,               # print every candidate model that is tried
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the search; `model` holds the selected fit
model = auto_arima(y_train, **search_options)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[15] intercept   : AIC=32165.869, Time=4.90 sec
 ARIMA(0,0,0)(0,0,0)[15] intercept   : AIC=32353.731, Time=0.08 sec
 ARIMA(1,0,0)(1,0,0)[15] intercept   : AIC=29484.371, Time=2.69 sec
 ARIMA(0,0,1)(0,0,1)[15] intercept   : AIC=29536.459, Time=6.40 sec
 ARIMA(0,0,0)(0,0,0)[15]             : AIC=33253.328, Time=0.05 sec
 ARIMA(1,0,0)(0,0,0)[15] intercept   : AIC=29483.583, Time=0.28 sec
 ARIMA(1,0,0)(0,0,1)[15] intercept   : AIC=29484.502, Time=0.98 sec
 ARIMA(1,0,0)(1,0,1)[15] intercept   : AIC=29471.511, Time=5.43 sec
 ARIMA(1,0,1)(1,0,1)[15] intercept   : AIC=29241.345, Time=34.44 sec
 ARIMA(1,0,1)(0,0,1)[15] intercept   : AIC=29271.444, Time=21.66 sec
 ARIMA(1,0,1)(1,0,0)[15] intercept   : AIC=29270.821, Time=4.62 sec
 ARIMA(1,0,1)(0,0,0)[15] intercept   : AIC=29273.305, Time=1.05 sec
 ARIMA(0,0,1)(1,0,1)[15] intercept   : AIC=29467.464, Time=43.79 sec
 ARIMA(1,0,1)(1,0,1)[15]             : AIC=inf, Time=10.13 sec

Best m

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hold-out window: calendar year 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Observed rainfall inside the window. A single .loc call with a boolean mask
# avoids chained indexing and does not depend on the index being sorted.
in_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_window, 'Rainfall (in mm)']

# One forecast step per observed row
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail early with a clear message instead of an opaque error further down
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )

# Forecast n_periods steps ahead of the data `model` was fitted on.
# NOTE(review): forecasts are matched to observations by position, which
# assumes the 2023 rows are consecutive and start right after the training
# window -- TODO confirm.
y_pred_2023 = model.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics (MSE is reported, not RMSE)
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# Results table. The prediction may come back as a Series with its own index,
# so take plain positional values to make the row-by-row pairing explicit.
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,
    'Actual Rainfall': y_true_2023.to_numpy(),
    'Predicted Rainfall': pd.Series(y_pred_2023).to_numpy(),
})

# Save to Excel and report the path that was actually written
output_path = './/sarima_location_3_(15_5yrs).xlsx'
forecast_df_2023.to_excel(output_path, index=False)

print(f"✅ Forecast saved successfully: {output_path}")


Actual data size:

  return get_prediction_index(
  return get_prediction_index(


 365
Predicted data size: 365
MSE for 2023 forecast: 124.73364804839665
MAE for 2023 forecast: 8.954448584047714
R² for 2023 forecast: -0.021516126832513693
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx


In [5]:
# Repeat the experiment with seasonal period m = 30 (the run above used m = 15)

In [6]:
from pmdarima import auto_arima

# Training window: calendar years 2018 through 2022
train_start, train_end = '2018-01-01', '2022-12-31'

# Label-slice the rainfall series down to the training window
training_window = slice(train_start, train_end)
y_train = rainfall_ts[training_window]

# Small stepwise search space (all orders capped at 1, ranked by AIC),
# this time with a seasonal period of 30 observations.
search_options_30 = dict(
    seasonal=True,
    m=30,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,               # print every candidate model that is tried
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the search; `model_30` holds the selected fit
model_30 = auto_arima(y_train, **search_options_30)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[30] intercept   : AIC=32199.754, Time=32.87 sec
 ARIMA(0,0,0)(0,0,0)[30] intercept   : AIC=32353.731, Time=0.09 sec
 ARIMA(1,0,0)(1,0,0)[30] intercept   : AIC=29472.387, Time=17.41 sec
 ARIMA(0,0,1)(0,0,1)[30] intercept   : AIC=29513.232, Time=20.31 sec
 ARIMA(0,0,0)(0,0,0)[30]             : AIC=33253.328, Time=0.06 sec
 ARIMA(1,0,0)(0,0,0)[30] intercept   : AIC=29483.583, Time=0.34 sec
 ARIMA(1,0,0)(1,0,1)[30] intercept   : AIC=29466.602, Time=42.13 sec
 ARIMA(1,0,0)(0,0,1)[30] intercept   : AIC=29473.510, Time=24.64 sec
 ARIMA(1,0,1)(1,0,1)[30] intercept   : AIC=29244.584, Time=68.96 sec
 ARIMA(1,0,1)(0,0,1)[30] intercept   : AIC=29253.269, Time=35.23 sec
 ARIMA(1,0,1)(1,0,0)[30] intercept   : AIC=29251.216, Time=46.61 sec
 ARIMA(1,0,1)(0,0,0)[30] intercept   : AIC=29273.305, Time=1.58 sec
 ARIMA(0,0,1)(1,0,1)[30] intercept   : AIC=29505.006, Time=55.53 sec
 ARIMA(1,0,1)(1,0,1)[30]             : AIC=inf, Time=51.92 sec



In [7]:
# Hold-out window: calendar year 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Observed rainfall inside the window. A single .loc call with a boolean mask
# avoids chained indexing and does not depend on the index being sorted.
in_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_window, 'Rainfall (in mm)']

# One forecast step per observed row
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail early with a clear message instead of an opaque error further down
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )

# Forecast n_periods steps ahead of the data `model_30` was fitted on.
# NOTE(review): forecasts are matched to observations by position, which
# assumes the 2023 rows are consecutive and start right after the training
# window -- TODO confirm.
y_pred_2023 = model_30.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics (MSE is reported, not RMSE).
# The metric functions are the ones imported in the m = 15 evaluation cell.
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# Results table. The prediction may come back as a Series with its own index,
# so take plain positional values to make the row-by-row pairing explicit.
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,
    'Actual Rainfall': y_true_2023.to_numpy(),
    'Predicted Rainfall': pd.Series(y_pred_2023).to_numpy(),
})

# Save to Excel. The file name follows the "(<m>_5yrs)" pattern of the m = 15
# run; the closing parenthesis was previously missing from this name.
output_path = './/sarima_location_3_(30_5yrs).xlsx'
forecast_df_2023.to_excel(output_path, index=False)

# Report the path that was actually written
print(f"✅ Forecast saved successfully: {output_path}")


  return get_prediction_index(
  return get_prediction_index(


Actual data size: 365
Predicted data size: 365
MSE for 2023 forecast: 126.0556257299928
MAE for 2023 forecast: 9.05331604222928
R² for 2023 forecast: -0.03234256815121195
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx
