In [1]:
import pandas as pd

# Load the raw daily rainfall records.
raw_df = pd.read_excel("..//datasets//modified_location_0.xlsx")  # Replace with your actual file path

RAIN_COL = 'Rainfall (in mm)'

# Build the modelling frame in one chain, without in-place mutation, so the
# cell yields the same result every time it is re-run:
#   1. parse 'Date' as datetime (adjust the format if necessary),
#   2. drop rows missing either the date or the rainfall value -- a NaT date
#      would otherwise end up in the index,
#   3. keep only the two columns of interest,
#   4. index by date and sort chronologically (stable sort, so rows that
#      share a date keep their original order) so that the date-range
#      slices used in later cells behave predictably.
df = (
    raw_df
    .assign(Date=pd.to_datetime(raw_df['Date'], format='%Y-%m-%d'))
    .dropna(subset=['Date', RAIN_COL])
    .loc[:, ['Date', RAIN_COL]]
    .set_index('Date')
    .sort_index(kind='mergesort')
)

# Rainfall as a date-indexed Series (assumed to be daily data -- TODO confirm
# there are no gaps or repeated dates before fitting a seasonal model).
rainfall_ts = df[RAIN_COL]

# Display the first few rows to check the data
print(df.head())

            Rainfall (in mm)
Date                        
1901-01-01               3.0
1901-01-02               8.0
1901-01-03               0.0
1901-01-04               0.0
1901-01-05               0.0


In [2]:
# Report the size of the prepared frame as a (rows, columns) tuple.
frame_dims = df.shape
print(frame_dims)

(61600, 1)


In [3]:
from pmdarima import auto_arima

# Training window: three calendar years of rainfall, selected by date label.
train_start, train_end = '2020-01-01', '2022-12-31'
y_train = rainfall_ts.loc[train_start:train_end]

# Stepwise seasonal ARIMA search with a deliberately small order grid
# (every non-seasonal and seasonal order capped at 1) and a seasonal
# period of 90 observations, scored by AIC.
# NOTE(review): the output recorded for this cell ends in a MemoryError
# partway through the search, so `model` may not exist when later cells
# run -- confirm the search completes on the target machine.
search_options_90 = dict(
    seasonal=True,
    m=90,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)
model = auto_arima(y_train, **search_options_90)

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(1,0,1)[90] intercept   : AIC=17913.026, Time=26.03 sec
 ARIMA(0,0,0)(0,0,0)[90] intercept   : AIC=17933.416, Time=0.04 sec
 ARIMA(1,0,0)(1,0,0)[90] intercept   : AIC=16460.615, Time=31.43 sec
 ARIMA(0,0,1)(0,0,1)[90] intercept   : AIC=16205.483, Time=31.47 sec
 ARIMA(0,0,0)(0,0,0)[90]             : AIC=18553.047, Time=0.06 sec
 ARIMA(0,0,1)(0,0,0)[90] intercept   : AIC=16208.044, Time=0.34 sec
 ARIMA(0,0,1)(1,0,1)[90] intercept   : AIC=16207.460, Time=88.90 sec
 ARIMA(0,0,1)(1,0,0)[90] intercept   : AIC=16205.565, Time=34.91 sec
 ARIMA(0,0,0)(0,0,1)[90] intercept   : AIC=17911.027, Time=27.13 sec


MemoryError: Unable to allocate 142. MiB for an array with shape (2192, 92, 92) and data type float64

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Score the fitted seasonal ARIMA (`model`) against the observed 2023
# rainfall and export the actual/predicted pairs to Excel.
# NOTE(review): `model` is produced by the auto_arima cell, whose recorded
# output ends in a MemoryError -- make sure that cell completed first.

# Forecast for 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Extract actual values
in_forecast_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_forecast_window, 'Rainfall (in mm)']

# Predict one step per observed row.
# NOTE(review): this assumes the 2023 rows directly follow the end of the
# training data with exactly one row per forecast step -- TODO confirm.
n_periods = len(y_true_2023)
if n_periods == 0:
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )
y_pred_2023 = model.predict(n_periods=n_periods)

# Plain array of predictions: predict() may return an indexed Series, and
# the results frame below should be assembled purely by position.
y_pred_values = pd.Series(y_pred_2023).to_numpy()

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics.
# RMSE is the square root of the MSE; the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root
# explicitly to stay compatible with old and new releases.
rmse_2023 = mean_squared_error(y_true_2023, y_pred_values) ** 0.5
mae_2023 = mean_absolute_error(y_true_2023, y_pred_values)
r2_2023 = r2_score(y_true_2023, y_pred_values)

print(f"RMSE for 2023 forecast: {rmse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# Create results DataFrame using index (dates) directly
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,                   # dates of the observed rows
    'Actual Rainfall': y_true_2023.values,       # actual values
    'Predicted Rainfall': y_pred_values          # predicted values
})

# Save to Excel
# NOTE(review): the file name says "2years" but the training window defined
# for this model spans 2020-2022 -- confirm the intended label.
output_path = '..//datasets//2023_forecast_results_Sarima_(90_2years).xlsx'
forecast_df_2023.to_excel(output_path, index=False)

# Report the path that was actually written.
print(f"✅ Forecast saved successfully: {output_path}")


In [None]:
# now with m = 45 (the fitted model below is still stored as `model_30`)

In [None]:
from pmdarima import auto_arima

# Training window: five calendar years of rainfall, selected by date label.
train_start, train_end = '2018-01-01', '2022-12-31'
y_train = rainfall_ts.loc[train_start:train_end]

# Same small stepwise seasonal ARIMA search (all orders capped at 1,
# scored by AIC), this time with a seasonal period of 45 observations.
# NOTE(review): the result is stored as `model_30` even though m=45; the
# name is kept because the evaluation cell refers to it.
search_options_45 = dict(
    seasonal=True,
    m=45,
    start_p=0,
    start_q=0,
    max_p=1,
    max_q=1,
    max_d=1,
    max_P=1,
    max_Q=1,
    max_D=1,
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    information_criterion='aic',
)
model_30 = auto_arima(y_train, **search_options_45)

In [None]:
# Score the m=45 seasonal ARIMA (`model_30`) against the observed 2023
# rainfall and export the actual/predicted pairs to Excel.
# NOTE(review): relies on the scikit-learn metric functions imported in an
# earlier cell being in scope.

# Forecast for 2023
forecast_start = '2023-01-01'
forecast_end = '2023-12-31'

# Extract actual values
in_forecast_window = (df.index >= forecast_start) & (df.index <= forecast_end)
y_true_2023 = df.loc[in_forecast_window, 'Rainfall (in mm)']

# Predict one step per observed row.
# NOTE(review): this assumes the 2023 rows directly follow the end of the
# training data with exactly one row per forecast step -- TODO confirm.
n_periods = len(y_true_2023)
if n_periods == 0:
    raise ValueError(
        f"No observations between {forecast_start} and {forecast_end}; nothing to evaluate."
    )
y_pred_2023 = model_30.predict(n_periods=n_periods)

# Plain array of predictions: predict() may return an indexed Series, and
# the results frame below should be assembled purely by position.
y_pred_values = pd.Series(y_pred_2023).to_numpy()

# Check lengths
print("Actual data size:", len(y_true_2023))
print("Predicted data size:", len(y_pred_2023))

# Evaluation metrics.
# RMSE is the square root of the MSE; the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root
# explicitly to stay compatible with old and new releases.
rmse_2023 = mean_squared_error(y_true_2023, y_pred_values) ** 0.5
mae_2023 = mean_absolute_error(y_true_2023, y_pred_values)
r2_2023 = r2_score(y_true_2023, y_pred_values)

print(f"RMSE for 2023 forecast: {rmse_2023}")
print(f"MAE for 2023 forecast: {mae_2023}")
print(f"R² for 2023 forecast: {r2_2023}")

# Create results DataFrame using index (dates) directly
forecast_df_2023 = pd.DataFrame({
    'Date': y_true_2023.index,                   # dates of the observed rows
    'Actual Rainfall': y_true_2023.values,       # actual values
    'Predicted Rainfall': y_pred_values          # predicted values
})

# Save to Excel
# NOTE(review): the file name says "2years" but the training window defined
# for this model spans 2018-2022 -- confirm the intended label.
output_path = '..//datasets//2023_forecast_results_Sarima_(45_2years).xlsx'
forecast_df_2023.to_excel(output_path, index=False)

# Report the path that was actually written.
print(f"✅ Forecast saved successfully: {output_path}")
