In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from datetime import datetime, timedelta, timezone

# --- Load and prepare the training series -----------------------------------
# Raw export: two columns, a timestamp followed by a temperature reading.
file_path = r'C:\Users\syounas\OneDrive - Enova Facilities Management\Tasks\GitHub\HubgradeDataCleaning\Sana\Data\Train_1.csv'
data = pd.read_csv(file_path)

# Short positional names: first column is the timestamp, second the temperature.
data.columns = ['ts', 'temp']

# Timestamps carry a trailing ' Dubai' label that the parser cannot handle,
# so strip it first. Unparseable values become NaT and are dropped below.
raw_ts = data['ts'].str.replace(' Dubai', '')
data['ts'] = pd.to_datetime(raw_ts, errors='coerce', dayfirst=True)
data = data.dropna(subset=['ts'])

# Temperatures are stored as text with a '°C' unit suffix; remove it so the
# values can be cast to float.
temp_text = data['temp'].str.replace('°C', '')
data['temp'] = temp_text.astype(float)

# Modelling frame: 'ds' is the timestamp, 'y' is the target temperature.
df_temp = data[['ts', 'temp']].rename(columns={'ts': 'ds', 'temp': 'y'})

# Drop any timezone information, keeping the local wall-clock time, so the
# calendar features below are derived from naive timestamps.
df_temp['ds'] = df_temp['ds'].dt.tz_localize(None)

# Calendar components used as the numerical model inputs.
for part in ('year', 'month', 'day', 'hour', 'minute'):
    df_temp[part] = getattr(df_temp['ds'].dt, part)

# --- Forecast the next `future_periods` 5-minute readings with XGBoost -------

# Calendar features fed to the model; the same list is used for fitting,
# forecasting and the in-sample error check so the three can never drift apart.
feature_cols = ['year', 'month', 'day', 'hour', 'minute']

# The training timestamps are Dubai wall-clock times with the timezone dropped,
# and the exported timestamps are tagged ' Dubai' with a +04:00 offset.
# Anchor the forecast start in that same zone (UTC+04:00): building it in UTC
# would label 08:20 local time as '+0000', i.e. a different instant.
dubai_tz = timezone(timedelta(hours=4))
dq_start = datetime(2023, 3, 11, 8, 20, tzinfo=dubai_tz)

# Future frame: one row every 5 minutes starting at dq_start.
future_periods = 200  # Number of future periods to predict
future_temp = pd.DataFrame(
    {'ds': pd.date_range(start=dq_start, periods=future_periods, freq='5min')}
)

# Derive the same wall-clock calendar features the model is trained on.
for col in feature_cols:
    future_temp[col] = getattr(future_temp['ds'].dt, col)

# Shallow trees plus row/column subsampling keep the fit smooth (less noisy).
model_temp = xgb.XGBRegressor(
    n_estimators=100,   # Number of boosting rounds
    max_depth=3,        # Maximum depth of each tree
    learning_rate=0.1,  # Learning rate
    min_child_weight=1, # Minimum sum of instance weight needed in a child
    subsample=0.8,      # Subsample ratio of the training instances
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    objective='reg:squarederror'  # Objective function for regression task
)

# Fit on the historical series.
model_temp.fit(df_temp[feature_cols], df_temp['y'])

# Predict the future values.
future_temp['yhat'] = model_temp.predict(future_temp[feature_cols])

# RMSE over the last 200 training rows. NOTE: this is an in-sample figure (the
# model was fitted on these rows), so it is an optimistic estimate of the
# forecast error. Only the rows being scored are passed to predict().
y_true = df_temp['y'].values[-200:]
y_pred = model_temp.predict(df_temp[feature_cols].iloc[-200:])
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE for the last 200 training points: {rmse}")

# Format 'ds' as "2023-03-11T08:20:00+04:00 Dubai". strftime's %z yields
# '+0400' (no colon), so the colon is inserted before the last two digits.
stamp = future_temp['ds'].dt.strftime('%Y-%m-%dT%H:%M:%S%z')
future_temp['ds'] = stamp.str[:-2] + ':' + stamp.str[-2:] + ' Dubai'

# Save the predicted future values (ds and yhat columns) to a CSV file.
# future_temp holds exactly `future_periods` rows, so every row is written.
output_path = r'C:\Users\syounas\OneDrive - Enova Facilities Management\Tasks\GitHub\HubgradeDataCleaning\Sana\Data\xgboost_tune2.csv'
future_temp[['ds', 'yhat']].to_csv(output_path, index=False)
