In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split


In [4]:
# Load the dataset. The path is relative, so the CSV has to be present in the
# notebook's working directory; otherwise read_csv raises FileNotFoundError
# and every later cell fails because `df` is never defined.
df = pd.read_csv("DWLR_Dataset_2023.csv")

# Normalise column names: trim surrounding whitespace, drop '#', and turn
# spaces into underscores. regex=False keeps the matching literal regardless
# of which default the installed pandas version uses for `regex`.
df.columns = (
    df.columns.str.strip()
    .str.replace('#', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

# Preview the first rows and the per-column summary.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()


FileNotFoundError: [Errno 2] No such file or directory: 'DWLR_Dataset_2023.csv'

In [5]:
# Parse the 'Date' column into datetimes and put the rows in chronological
# order. The original row labels are kept as the index after sorting.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

# Forward-fill missing values from the preceding (earlier-dated) row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
# NOTE: values missing before the first valid observation of a column have
# nothing to be filled from and remain NaN.
df = df.ffill()

# Calendar features derived from the parsed date.
df['Month'] = df['Date'].dt.month
df['DayOfYear'] = df['Date'].dt.dayofyear


NameError: name 'df' is not defined

In [None]:
# Fit an Isolation Forest on the 'Water_Level' column alone and store the
# per-row label it returns in df['anomaly']; rows labelled -1 are the ones
# highlighted as anomalies below.
iso = IsolationForest(contamination=0.01, random_state=42)
df['anomaly'] = iso.fit_predict(df[['Water_Level']])

# Rows the model flagged, pulled out once so both scatter coordinates come
# from the same selection.
flagged = df[df['anomaly'] == -1]

# Full series as a line, flagged readings overlaid as red markers.
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(df['Date'], df['Water_Level'], label='Water Level')
ax.scatter(flagged['Date'], flagged['Water_Level'],
           color='red', label='Anomaly')
ax.legend()
ax.set_title("Anomaly Detection in Water Levels")
plt.show()


In [None]:
# Number of trailing observations held out to score the forecast, and the
# ARIMA (p, d, q) order. Both are named once so the split, the forecast length
# and the evaluation cannot drift apart when one of them is changed.
# NOTE(review): the original comment called the hold-out "last 30 days"; that
# is only true if the data has exactly one row per day — confirm.
TEST_HORIZON = 30
ARIMA_ORDER = (5, 1, 0)  # tune order later

series = df.set_index('Date')['Water_Level']

# Fail early with a clear message instead of handing ARIMA an empty training set.
if len(series) <= TEST_HORIZON:
    raise ValueError(
        f"Need more than {TEST_HORIZON} observations to split, got {len(series)}"
    )

# Chronological split: everything before the last TEST_HORIZON points trains,
# the last TEST_HORIZON points are held out.
train = series.iloc[:-TEST_HORIZON]
test = series.iloc[-TEST_HORIZON:]

# Fit ARIMA on the training portion only.
model = ARIMA(train, order=ARIMA_ORDER)
model_fit = model.fit()

# Forecast exactly as many steps as there are held-out observations so the
# forecast and the test set always have matching lengths.
forecast = model_fit.forecast(steps=len(test))

# Plot training data, held-out data and the forecast on the held-out dates.
plt.figure(figsize=(12,5))
plt.plot(train.index, train, label='Train')
plt.plot(test.index, test, label='Test')
plt.plot(test.index, forecast, label='Forecast')
plt.legend()
plt.title("ARIMA Forecast of Water Levels")
plt.show()

# Accuracy of the forecast against the held-out observations.
print("MAE:", mean_absolute_error(test, forecast))
print("RMSE:", np.sqrt(mean_squared_error(test, forecast)))


In [None]:
# Predictor columns for the regression; the target is 'Water_Level'.
# NOTE(review): 'Dissolved_' looks like a truncated column name — confirm it
# matches the cleaned header in df.columns.
features = ['Rainfall_mm', 'Temperature', 'pH', 'Dissolved_', 'Month', 'DayOfYear']

# Report every missing feature column at once instead of failing on the
# first one inside the DataFrame lookup.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    raise KeyError(f"Feature columns not found in df: {missing_features}")

X = df[features]
y = df['Water_Level']

# Train-test split. shuffle=False keeps row order, so the last 20% of rows
# form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train Random Forest
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate
print("Random Forest MAE:", mean_absolute_error(y_test, y_pred))
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Plot predictions against the test rows' dates. y_test.index holds the
# original row labels, which are not guaranteed to be in time order once df
# has been sorted by 'Date'; using the dates keeps the x-axis chronological
# and consistent with the other plots in this notebook.
test_dates = df.loc[y_test.index, 'Date']
plt.figure(figsize=(12,5))
plt.plot(test_dates, y_test.values, label='Actual')
plt.plot(test_dates, y_pred, label='Predicted')
plt.legend()
plt.title("Recharge Estimation with Random Forest")
plt.show()


In [None]:
# Bar chart of the fitted forest's feature_importances_, one bar per entry of
# `features` (the same column order the model was trained on).
importances = rf.feature_importances_

fig, ax = plt.subplots()
ax.bar(features, importances)
ax.set_title("Feature Importance in Groundwater Prediction")
# Tilt the x tick labels so the feature names do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
# Final status line, printed once the preceding cells have run.
completion_message = "✅ Prototype complete: Anomaly Detection + Forecasting + Recharge Estimation"
print(completion_message)
