In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

In [18]:
# Load the dataset
data = pd.read_csv('daily_CCH_RF_data.csv')  


In [19]:
# Convert the date columns to datetime format
data['Date'] = pd.to_datetime(data[['Year', 'Month', 'Day']])

data

Unnamed: 0,Year,Month,Day,Rainfall,Date
0,1992,4,1,0.5,1992-04-01
1,1992,4,2,0.5,1992-04-02
2,1992,4,3,12.5,1992-04-03
3,1992,4,4,17,1992-04-04
4,1992,4,5,40,1992-04-05
...,...,...,...,...,...
11652,2024,2,25,0,2024-02-25
11653,2024,2,26,0,2024-02-26
11654,2024,2,27,0,2024-02-27
11655,2024,2,28,0,2024-02-28


In [20]:
# Convert 'Value' column to numeric with coerce to turn strings into NaN
numeric_values = pd.to_numeric(data['Rainfall'], errors='coerce')

# Count the number of NaN values, which represent strings
num_strings = numeric_values.isna().sum()

print("Number of string values in 'Rainfall' column:", num_strings)

string_records = data[pd.to_numeric(data['Rainfall'], errors='coerce').isna()]

print("String records in 'Rainfall' column:")
print(string_records)

Number of string values in 'Rainfall' column: 204
String records in 'Rainfall' column:
       Year  Month  Day Rainfall       Date
113    1992      7   23      *** 1992-07-23
346    1993      3   13      *** 1993-03-13
347    1993      3   14      *** 1993-03-14
348    1993      3   15      *** 1993-03-15
349    1993      3   16      *** 1993-03-16
...     ...    ...  ...      ...        ...
11141  2022     10    2      *** 2022-10-02
11142  2022     10    3      *** 2022-10-03
11143  2022     10    4      *** 2022-10-04
11144  2022     10    5      *** 2022-10-05
11145  2022     10    6      *** 2022-10-06

[204 rows x 5 columns]


In [22]:
# Convert 'Value' column to numeric with coerce to turn strings into NaN
numeric_values = pd.to_numeric(data['Rainfall'], errors='coerce')

# fill with previouse value fill missing values backward along columns
data['Rainfall'] = numeric_values.bfill(axis=0)

print("DataFrame with string records in 'Rainfall' column replaced with 0:")
print(data)

DataFrame with string records in 'Rainfall' column replaced with 0:
       Year  Month  Day  Rainfall       Date
0      1992      4    1       0.5 1992-04-01
1      1992      4    2       0.5 1992-04-02
2      1992      4    3      12.5 1992-04-03
3      1992      4    4      17.0 1992-04-04
4      1992      4    5      40.0 1992-04-05
...     ...    ...  ...       ...        ...
11652  2024      2   25       0.0 2024-02-25
11653  2024      2   26       0.0 2024-02-26
11654  2024      2   27       0.0 2024-02-27
11655  2024      2   28       0.0 2024-02-28
11656  2024      2   29       0.0 2024-02-29

[11657 rows x 5 columns]


In [23]:
# Aggregate the data by date and sum the rainfall values
daily_data = data.groupby('Date')['Rainfall'].sum().reset_index()
daily_data

Unnamed: 0,Date,Rainfall
0,1992-04-01,0.5
1,1992-04-02,0.5
2,1992-04-03,12.5
3,1992-04-04,17.0
4,1992-04-05,40.0
...,...,...
11652,2024-02-25,0.0
11653,2024-02-26,0.0
11654,2024-02-27,0.0
11655,2024-02-28,0.0


In [24]:
# Fit ARIMA model
model = ARIMA(daily_data['Rainfall'], order=(5,1,0))
model_fit = model.fit()

In [25]:
# Forecast for the next 7 days
forecast = model_fit.forecast(steps=7)

# Print the forecasted rainfall for the next 7 days
print("Forecasted rainfall for the next 7 days:")
print(forecast)

Forecasted rainfall for the next 7 days:
11657    1.447868e-50
11658    1.551170e-50
11659    1.463409e-50
11660    9.746621e-51
11661    9.269365e-51
11662    9.690423e-51
11663    1.137720e-50
Name: predicted_mean, dtype: float64


In [26]:
# Forecast for the next 14 days
forecast2 = model_fit.forecast(steps=14)

# Print the forecasted rainfall for the next 14 days
print("Forecasted rainfall for the next 14 days:")
print(forecast2)

Forecasted rainfall for the next 14 days:
11657    1.447868e-50
11658    1.551170e-50
11659    1.463409e-50
11660    9.746621e-51
11661    9.269365e-51
11662    9.690423e-51
11663    1.137720e-50
11664    1.163460e-50
11665    1.140659e-50
11666    1.082044e-50
11667    1.072296e-50
11668    1.081493e-50
11669    1.102027e-50
11670    1.106230e-50
Name: predicted_mean, dtype: float64


In [29]:
# Forecast for the next 180 days
forecast3 = model_fit.forecast(steps=100)

# Print the forecasted rainfall for the next 180 days
print("Forecasted rainfall for the next 180 days:")
print(forecast3)

Forecasted rainfall for the next 180 days:
11657    1.447868e-50
11658    1.551170e-50
11659    1.463409e-50
11660    9.746621e-51
11661    9.269365e-51
             ...     
11752    1.096875e-50
11753    1.096875e-50
11754    1.096875e-50
11755    1.096875e-50
11756    1.096875e-50
Name: predicted_mean, Length: 100, dtype: float64
