In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the dataset

In [None]:
# Read the source CSV, using the 'Standardized_Date' column as the index and
# asking pandas to parse that index as dates.
data = pd.read_csv(
    'final_data_in_ML.csv',
    index_col='Standardized_Date',
    parse_dates=True,
)
data

In [None]:
# Name of the column the model is trained to predict.
target = 'CLEAR WATER PUMPING FLOW ML'

In [None]:
# Build lagged / rolling predictors for the target.
# Every feature on a given row must be computed only from earlier rows;
# otherwise the model is handed the very value it is asked to predict.

# Lag features for the target and other relevant variables
data['PUMPING_FLOW_LAG_1'] = data[target].shift(1)
data['PUMPING_FLOW_LAG_2'] = data[target].shift(2)
data['RAW_WATER_LAG_1'] = data['RAW WATER FLOW IN ML'].shift(1)
data['SUMP_LEVEL_LAG_1'] = data['CLEAR WATER SUMP LEVEL IN Meter'].shift(1)

# Rolling mean of the target over the 7 rows *preceding* the current one
# (7 days if the data is daily -- TODO confirm the sampling frequency).
# The shift(1) is essential: rolling() on the unshifted series includes the
# current row in its window, which leaks the target into this feature.
data['PUMPING_FLOW_ROLLING_MEAN_7'] = (
    data[target].shift(1).rolling(window=7).mean()
)

# Drop every row that contains a NaN in any column. This removes the leading
# rows that have no lag / no full rolling window, and also any row with a
# missing value in the original columns.
data = data.dropna()


In [None]:
# Feature matrix and target vector used by the model.
feature_columns = [
    'PUMPING_FLOW_LAG_1',
    'PUMPING_FLOW_LAG_2',
    'RAW_WATER_LAG_1',
    'SUMP_LEVEL_LAG_1',
    'PUMPING_FLOW_ROLLING_MEAN_7',
]
X = data[feature_columns]
y = data[target]

# Positional 80/20 split with no shuffling: the first 80% of rows are used for
# training and the remaining rows for testing.
# NOTE(review): this is a chronological split only if the index is sorted by
# date -- confirm against the CSV.
train_size = int(len(data) * 0.8)

X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Hyper-parameters for the regressor: squared-error objective, 1000 estimators.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
}

# Build the XGBoost regressor and fit it on the training portion.
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred = model.predict(X_test)


In [None]:
# Score the test-set predictions: mean squared error, then its square root
# (RMSE), which is reported.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')


In [None]:
import matplotlib.pyplot as plt

# Overlay actual and predicted values for the test rows on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))

test_dates = data.index[train_size:]  # x-axis: index labels of the test rows
ax.plot(test_dates, y_test, label='Actual')
ax.plot(test_dates, y_pred, label='Predicted')

ax.set_xlabel('Date')
ax.set_ylabel('Clear Water Pumping Flow (ML)')
ax.set_title('Actual vs Predicted - Clear Water Pumping Flow')
ax.legend()

plt.show()


In [None]:
# Print a label, then leave the test-set predictions as the cell's last
# expression so the notebook displays them.
print("Predicted values:")
y_pred


In [None]:
# Side-by-side table of actual vs. predicted values for the test rows,
# indexed by the corresponding index labels of `data` (column name 'Date').
predictions_df = pd.DataFrame(
    {
        'Date': data.index[train_size:],
        'Actual': y_test.values,
        'Predicted': y_pred,
    }
).set_index('Date')

# Display the table as the cell output
predictions_df
