In [1]:
import pandas as pd

# Load the dataset
df = pd.read_excel("..//..//datasets//modified_location_2.xlsx")  # Replace with your actual file path

# Convert 'Date' column to datetime (year-month-day)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')  # Adjust format if necessary

# Drop rows with a missing value in either 'Date' or 'Rainfall (in mm)'.
# A missing date would otherwise end up as NaT in the index built below.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=['Date', 'Rainfall (in mm)'])

# Keep only the date and rainfall columns, then index the frame by date
df = df[['Date', 'Rainfall (in mm)']].set_index('Date')

# Rainfall as a date-indexed Series; later cells slice it by date strings
rainfall_ts = df['Rainfall (in mm)']

# Display the first few rows to check the data
print(df.head())

            Rainfall (in mm)
Date                        
1901-01-01               4.0
1901-01-02               5.0
1901-01-03               0.0
1901-01-04               0.0
1901-01-05               0.0


In [2]:
# Report the size of the cleaned frame as (rows, columns)
row_count, col_count = df.shape
print((row_count, col_count))

(61600, 1)


In [3]:
from pmdarima import auto_arima

# Training window: 2018-01-01 through 2022-12-31
train_start = '2018-01-01'
train_end = '2022-12-31'

# Label-based slice of the rainfall series on its date index
y_train = rainfall_ts.loc[train_start:train_end]

# Search settings for a deliberately small seasonal ARIMA search:
# every non-seasonal and seasonal order is capped at 1, the seasonal
# period is m=15, and candidates are compared by AIC in a stepwise search.
search_options = dict(
    seasonal=True,
    m=15,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the search and keep the selected model
model = auto_arima(y_train, **search_options)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[15] intercept   : AIC=31800.430, Time=3.99 sec
 ARIMA(0,0,0)(0,0,0)[15] intercept   : AIC=32244.211, Time=0.07 sec
 ARIMA(1,0,0)(1,0,0)[15] intercept   : AIC=29714.162, Time=2.80 sec
 ARIMA(0,0,1)(0,0,1)[15] intercept   : AIC=29236.884, Time=5.21 sec
 ARIMA(0,0,0)(0,0,0)[15]             : AIC=33117.833, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[15] intercept   : AIC=29309.009, Time=0.46 sec
 ARIMA(0,0,1)(1,0,1)[15] intercept   : AIC=29121.347, Time=10.80 sec
 ARIMA(0,0,1)(1,0,0)[15] intercept   : AIC=29220.440, Time=3.05 sec
 ARIMA(1,0,1)(1,0,1)[15] intercept   : AIC=29061.547, Time=9.60 sec
 ARIMA(1,0,1)(0,0,1)[15] intercept   : AIC=29133.721, Time=8.57 sec
 ARIMA(1,0,1)(1,0,0)[15] intercept   : AIC=29126.418, Time=3.28 sec
 ARIMA(1,0,1)(0,0,0)[15] intercept   : AIC=29176.407, Time=0.93 sec
 ARIMA(1,0,0)(1,0,1)[15] intercept   : AIC=29701.157, Time=4.94 sec
 ARIMA(1,0,1)(1,0,1)[15]             : AIC=29157.841, Time=3.07 sec

Bes

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Forecast for 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Extract actual values
y_true_2023 = df[(df.index >= forecast_start) & (df.index <= forecast_end)]['Rainfall (in mm)']

# Predict one value per actual observation in the window.
# NOTE(review): predictions are paired with the actual values by position;
# this assumes the 2023 rows directly follow the training range with no
# missing days -- TODO confirm.
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail early with a clear message instead of an opaque downstream error
    raise ValueError(
        f"No observations found between {forecast_start} and {forecast_end}"
    )
y_pred_2023 = model.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)  # MSE instead of RMSE
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")


# Create results DataFrame using index (dates) directly
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,                   # correct date index
    'Actual Rainfall': y_true_2023.values,       # actual values
    'Predicted Rainfall': y_pred_2023            # predicted values
})

# Save to Excel; the path is held in one variable so the confirmation
# message below always names the file that was actually written.
output_path = './/sarima_location_2_(15_5yrs).xlsx'
forecast_df_2023.to_excel(output_path, index=False)

print(f"✅ Forecast saved successfully: {output_path}")


Actual data size: 365
Predicted data size: 365
MSE for 2023 forecast: 296.0286985164192
MAE for 2023 forecast: 10.759328413619263
R² for 2023 forecast: 0.01624906815930549
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx


  return get_prediction_index(
  return get_prediction_index(


In [5]:
# Repeat the experiment with a seasonal period of m = 30 (the previous run used m = 15)

In [6]:
from pmdarima import auto_arima

# Training window: 2018-01-01 through 2022-12-31
train_start = '2018-01-01'
train_end = '2022-12-31'

# Label-based slice of the rainfall series on its date index
y_train = rainfall_ts.loc[train_start:train_end]

# Search settings for a deliberately small seasonal ARIMA search:
# every non-seasonal and seasonal order is capped at 1, the seasonal
# period is m=30, and candidates are compared by AIC in a stepwise search.
search_options_30 = dict(
    seasonal=True,
    m=30,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)

# Run the search and keep the selected model
model_30 = auto_arima(y_train, **search_options_30)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[30] intercept   : AIC=31970.575, Time=13.96 sec
 ARIMA(0,0,0)(0,0,0)[30] intercept   : AIC=32244.211, Time=0.09 sec
 ARIMA(1,0,0)(1,0,0)[30] intercept   : AIC=29717.028, Time=9.25 sec
 ARIMA(0,0,1)(0,0,1)[30] intercept   : AIC=29258.266, Time=7.97 sec
 ARIMA(0,0,0)(0,0,0)[30]             : AIC=33117.833, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[30] intercept   : AIC=29309.009, Time=0.51 sec
 ARIMA(0,0,1)(1,0,1)[30] intercept   : AIC=29211.419, Time=19.48 sec
 ARIMA(0,0,1)(1,0,0)[30] intercept   : AIC=29248.688, Time=8.46 sec
 ARIMA(1,0,1)(1,0,1)[30] intercept   : AIC=29129.379, Time=26.21 sec
 ARIMA(1,0,1)(0,0,1)[30] intercept   : AIC=29153.130, Time=16.19 sec
 ARIMA(1,0,1)(1,0,0)[30] intercept   : AIC=29149.693, Time=19.05 sec
 ARIMA(1,0,1)(0,0,0)[30] intercept   : AIC=29176.407, Time=0.86 sec
 ARIMA(1,0,0)(1,0,1)[30] intercept   : AIC=29713.881, Time=16.87 sec
 ARIMA(1,0,1)(1,0,1)[30]             : AIC=29293.986, Time=6.32 sec

In [7]:
# Forecast for 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Extract actual values
y_true_2023 = df[(df.index >= forecast_start) & (df.index <= forecast_end)]['Rainfall (in mm)']

# Predict one value per actual observation in the window, using the m=30 model.
# NOTE(review): predictions are paired with the actual values by position;
# this assumes the 2023 rows directly follow the training range with no
# missing days -- TODO confirm.
n_periods = len(y_true_2023)
if n_periods == 0:
    # Fail early with a clear message instead of an opaque downstream error
    raise ValueError(
        f"No observations found between {forecast_start} and {forecast_end}"
    )
y_pred_2023 = model_30.predict(n_periods=n_periods)

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics (the metric functions are imported in an earlier cell)
mse_2023 = mean_squared_error(y_true_2023, y_pred_2023)
mae_2023 = mean_absolute_error(y_true_2023, y_pred_2023)
r2_2023 = r2_score(y_true_2023, y_pred_2023)

print(f"MSE for 2023 forecast: {mse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# Create results DataFrame using index (dates) directly
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,                   # correct date index
    'Actual Rainfall': y_true_2023.values,       # actual values
    'Predicted Rainfall': y_pred_2023            # predicted values
})

# Save to Excel; the path is held in one variable so the confirmation
# message below always names the file that was actually written.
# NOTE(review): the file name has an unbalanced "(" (the m=15 run writes
# "(15_5yrs).xlsx"). Left unchanged here in case other code reads this
# exact name -- confirm and rename if nothing depends on it.
output_path = './/sarima_location_2_(30_5yrs.xlsx'
forecast_df_2023.to_excel(output_path, index=False)

print(f"✅ Forecast saved successfully: {output_path}")


  return get_prediction_index(
  return get_prediction_index(


Actual data size: 365
Predicted data size: 365
MSE for 2023 forecast: 297.4155343099277
MAE for 2023 forecast: 10.979517615683697
R² for 2023 forecast: 0.011640389977050192
✅ Forecast saved successfully: 2023_forecast_results_Sarima.xlsx
